You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@crunch.apache.org by jw...@apache.org on 2013/04/23 22:41:03 UTC
[01/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Updated Branches:
refs/heads/master cbc7c2fb3 -> 890e0086a
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/types/writable/WritablesTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/types/writable/WritablesTest.java b/crunch/src/test/java/org/apache/crunch/types/writable/WritablesTest.java
deleted file mode 100644
index 5396fba..0000000
--- a/crunch/src/test/java/org/apache/crunch/types/writable/WritablesTest.java
+++ /dev/null
@@ -1,256 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.writable;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNotSame;
-import static org.junit.Assert.assertSame;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.Collection;
-
-import org.apache.crunch.Pair;
-import org.apache.crunch.Tuple3;
-import org.apache.crunch.Tuple4;
-import org.apache.crunch.TupleN;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-import org.apache.hadoop.io.BooleanWritable;
-import org.apache.hadoop.io.BytesWritable;
-import org.apache.hadoop.io.DoubleWritable;
-import org.apache.hadoop.io.FloatWritable;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-
-public class WritablesTest {
-
- @Test
- public void testNulls() throws Exception {
- Void n = null;
- NullWritable nw = NullWritable.get();
- testInputOutputFn(Writables.nulls(), n, nw);
- }
-
- @Test
- public void testStrings() throws Exception {
- String s = "abc";
- Text text = new Text(s);
- testInputOutputFn(Writables.strings(), s, text);
- }
-
- @Test
- public void testInts() throws Exception {
- int j = 55;
- IntWritable w = new IntWritable(j);
- testInputOutputFn(Writables.ints(), j, w);
- }
-
- @Test
- public void testLongs() throws Exception {
- long j = 55;
- LongWritable w = new LongWritable(j);
- testInputOutputFn(Writables.longs(), j, w);
- }
-
- @Test
- public void testFloats() throws Exception {
- float j = 55.5f;
- FloatWritable w = new FloatWritable(j);
- testInputOutputFn(Writables.floats(), j, w);
- }
-
- @Test
- public void testDoubles() throws Exception {
- double j = 55.5d;
- DoubleWritable w = new DoubleWritable(j);
- testInputOutputFn(Writables.doubles(), j, w);
- }
-
- @Test
- public void testBoolean() throws Exception {
- boolean j = false;
- BooleanWritable w = new BooleanWritable(j);
- testInputOutputFn(Writables.booleans(), j, w);
- }
-
- @Test
- public void testBytes() throws Exception {
- byte[] bytes = new byte[] { 17, 26, -98 };
- BytesWritable bw = new BytesWritable(bytes);
- ByteBuffer bb = ByteBuffer.wrap(bytes);
- testInputOutputFn(Writables.bytes(), bb, bw);
- }
-
- @Test
- public void testCollections() throws Exception {
- String s = "abc";
- Collection<String> j = Lists.newArrayList();
- j.add(s);
- GenericArrayWritable<Text> w = new GenericArrayWritable<Text>(Text.class);
- w.set(new Text[] { new Text(s) });
- testInputOutputFn(Writables.collections(Writables.strings()), j, w);
- }
-
- @Test
- public void testPairs() throws Exception {
- Pair<String, String> j = Pair.of("a", "b");
- TupleWritable w = new TupleWritable(new Text[] { new Text("a"), new Text("b"), });
- w.setWritten(0);
- w.setWritten(1);
- testInputOutputFn(Writables.pairs(Writables.strings(), Writables.strings()), j, w);
- }
-
- @Test
- public void testNestedTables() throws Exception {
- PTableType<Long, Long> pll = Writables.tableOf(Writables.longs(), Writables.longs());
- PTableType<Pair<Long, Long>, String> nest = Writables.tableOf(pll, Writables.strings());
- assertNotNull(nest);
- }
-
- @Test
- public void testPairEquals() throws Exception {
- PType<Pair<Long, ByteBuffer>> t1 = Writables.pairs(Writables.longs(), Writables.bytes());
- PType<Pair<Long, ByteBuffer>> t2 = Writables.pairs(Writables.longs(), Writables.bytes());
- assertEquals(t1, t2);
- assertEquals(t1.hashCode(), t2.hashCode());
- }
-
- @Test
- @SuppressWarnings("rawtypes")
- public void testTriples() throws Exception {
- Tuple3 j = Tuple3.of("a", "b", "c");
- TupleWritable w = new TupleWritable(new Text[] { new Text("a"), new Text("b"), new Text("c"), });
- w.setWritten(0);
- w.setWritten(1);
- w.setWritten(2);
- WritableType<?, ?> wt = Writables.triples(Writables.strings(), Writables.strings(), Writables.strings());
- testInputOutputFn(wt, j, w);
- }
-
- @Test
- @SuppressWarnings("rawtypes")
- public void testQuads() throws Exception {
- Tuple4 j = Tuple4.of("a", "b", "c", "d");
- TupleWritable w = new TupleWritable(new Text[] { new Text("a"), new Text("b"), new Text("c"), new Text("d"), });
- w.setWritten(0);
- w.setWritten(1);
- w.setWritten(2);
- w.setWritten(3);
- WritableType<?, ?> wt = Writables.quads(Writables.strings(), Writables.strings(), Writables.strings(),
- Writables.strings());
- testInputOutputFn(wt, j, w);
- }
-
- @Test
- public void testTupleN() throws Exception {
- TupleN j = new TupleN("a", "b", "c", "d", "e");
- TupleWritable w = new TupleWritable(new Text[] { new Text("a"), new Text("b"), new Text("c"), new Text("d"),
- new Text("e"), });
- w.setWritten(0);
- w.setWritten(1);
- w.setWritten(2);
- w.setWritten(3);
- w.setWritten(4);
- WritableType<?, ?> wt = Writables.tuples(Writables.strings(), Writables.strings(), Writables.strings(),
- Writables.strings(), Writables.strings());
- testInputOutputFn(wt, j, w);
- }
-
- /**
- * Minimal Writable used by the record and registration tests: a String/int pair
- * serialized as a UTF string followed by an int. Equality is value-based on both
- * fields, and hashCode is kept consistent with equals.
- */
- protected static class TestWritable implements Writable {
- String left;
- int right;
-
- @Override
- public void write(DataOutput out) throws IOException {
- out.writeUTF(left);
- out.writeInt(right);
- }
-
- @Override
- public void readFields(DataInput in) throws IOException {
- left = in.readUTF();
- right = in.readInt();
- }
-
- // Must agree with equals(): equal instances have equal left and right.
- @Override
- public int hashCode() {
- int result = (left == null) ? 0 : left.hashCode();
- return 31 * result + right;
- }
-
- @Override
- public boolean equals(Object obj) {
- if (this == obj)
- return true;
- if (obj == null)
- return false;
- if (getClass() != obj.getClass())
- return false;
- TestWritable other = (TestWritable) obj;
- if (left == null) {
- if (other.left != null)
- return false;
- } else if (!left.equals(other.left))
- return false;
- if (right != other.right)
- return false;
- return true;
- }
-
- }
-
- @Test
- public void testRecords() throws Exception {
- TestWritable j = new TestWritable();
- j.left = "a";
- j.right = 1;
- TestWritable w = new TestWritable();
- w.left = "a";
- w.right = 1;
- WritableType<?, ?> wt = Writables.records(TestWritable.class);
- testInputOutputFn(wt, j, w);
- }
-
- @Test
- public void testTableOf() throws Exception {
- Pair<String, String> j = Pair.of("a", "b");
- Pair<Text, Text> w = Pair.of(new Text("a"), new Text("b"));
- WritableTableType<String, String> wtt = Writables.tableOf(Writables.strings(), Writables.strings());
- testInputOutputFn(wtt, j, w);
- }
-
- @Test
- public void testRegister() throws Exception {
- WritableType<TestWritable, TestWritable> wt = Writables.writables(TestWritable.class);
- Writables.register(TestWritable.class, wt);
- assertSame(Writables.records(TestWritable.class), wt);
- }
-
- @SuppressWarnings({ "unchecked", "rawtypes" })
- protected static void testInputOutputFn(PType ptype, Object java, Object writable) {
- ptype.getInputMapFn().initialize();
- ptype.getOutputMapFn().initialize();
- assertEquals(java, ptype.getInputMapFn().map(writable));
- assertEquals(writable, ptype.getOutputMapFn().map(java));
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/util/DistCacheTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/util/DistCacheTest.java b/crunch/src/test/java/org/apache/crunch/util/DistCacheTest.java
deleted file mode 100644
index 6784f14..0000000
--- a/crunch/src/test/java/org/apache/crunch/util/DistCacheTest.java
+++ /dev/null
@@ -1,156 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.util;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import java.io.IOException;
-import java.util.HashSet;
-import java.util.Set;
-
-import org.apache.commons.lang.StringUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TemporaryFolder;
-
-public class DistCacheTest {
-
- // A temporary folder used to hold files created for the test.
- @Rule
- public TemporaryFolder testFolder = new TemporaryFolder();
-
- // A configuration and lists of paths to use in tests.
- private Configuration testConf;
- private String[] testFilePaths;
- private String[] testFileQualifiedPaths;
-
- /**
- * Setup resources for tests. These include:
- * <ol>
- * <li>A Hadoop configuration.
- * <li>A directory of temporary files that includes 3 .jar files and 1 other
- * file.
- * <li>Arrays containing the canonical paths and qualified paths to the test
- * files.
- * </ol>
- */
- @Before
- public void setup() throws IOException {
- // Create a configuration for tests.
- testConf = new Configuration();
-
- // Create the test files and add their paths to the list of test file paths.
- testFilePaths = new String[3];
- testFilePaths[0] = testFolder.newFile("jar1.jar").getCanonicalPath();
- testFilePaths[1] = testFolder.newFile("jar2.jar").getCanonicalPath();
- testFilePaths[2] = testFolder.newFile("jar3.jar").getCanonicalPath();
- testFolder.newFile("notJar.other");
-
- // Populate a list of qualified paths from the test file paths.
- testFileQualifiedPaths = new String[3];
- for (int i = 0; i < testFilePaths.length; i++) {
- testFileQualifiedPaths[i] = "file:" + testFilePaths[i];
- }
- }
-
- /**
- * Tests adding jars one-by-one to a job's configuration.
- *
- * @throws IOException
- * If there is a problem adding the jars.
- */
- @Test
- public void testAddJar() throws IOException {
- // Add each valid jar path to the distributed cache configuration, and
- // verify each was
- // added correctly in turn.
- for (int i = 0; i < testFilePaths.length; i++) {
- DistCache.addJarToDistributedCache(testConf, testFilePaths[i]);
- assertEquals("tmpjars configuration var does not contain expected value.",
- StringUtils.join(testFileQualifiedPaths, ",", 0, i + 1), testConf.get("tmpjars"));
- }
- }
-
- /**
- * Tests that attempting to add the path to a jar that does not exist to the
- * configuration throws an exception.
- *
- * @throws IOException
- * If the added jar path does not exist. This exception is expected.
- */
- @Test(expected = IOException.class)
- public void testAddJarThatDoesntExist() throws IOException {
- DistCache.addJarToDistributedCache(testConf, "/garbage/doesntexist.jar");
- }
-
- /**
- * Tests that adding a directory of jars to the configuration works as
- * expected. .jar files under the added directory should be added to the
- * configuration, and all other files should be skipped.
- *
- * @throws IOException
- * If there is a problem adding the jar directory to the
- * configuration.
- */
- @Test
- public void testAddJarDirectory() throws IOException {
- DistCache.addJarDirToDistributedCache(testConf, testFolder.getRoot().getCanonicalPath());
- // Throw the added jar paths in a set to detect duplicates.
- String[] splitJarPaths = StringUtils.split(testConf.get("tmpjars"), ",");
- Set<String> addedJarPaths = new HashSet<String>();
- for (String path : splitJarPaths) {
- addedJarPaths.add(path);
- }
- assertEquals("Incorrect number of jar paths added.", testFilePaths.length, addedJarPaths.size());
-
- // Ensure all expected paths were added.
- for (int i = 0; i < testFileQualifiedPaths.length; i++) {
- assertTrue("Expected jar path missing from jar paths added to tmpjars: " + testFileQualifiedPaths[i],
- addedJarPaths.contains(testFileQualifiedPaths[i]));
- }
- }
-
- /**
- * Tests that adding a jar directory that does not exist to the configuration
- * throws an exception.
- *
- * @throws IOException
- * If the added jar directory does not exist. This exception is
- * expected.
- */
- @Test(expected = IOException.class)
- public void testAddJarDirectoryThatDoesntExist() throws IOException {
- DistCache.addJarDirToDistributedCache(testConf, "/garbage/doesntexist");
- }
-
- /**
- * Tests that adding a jar directory that is not a directory to the
- * configuration throws an exception.
- *
- * @throws IOException
- * If the added jar directory is not a directory. This exception is
- * expected.
- */
- @Test(expected = IOException.class)
- public void testAddJarDirectoryNotDirectory() throws IOException {
- DistCache.addJarDirToDistributedCache(testConf, testFilePaths[0]);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index d5f90f2..71f5e0f 100644
--- a/pom.xml
+++ b/pom.xml
@@ -44,7 +44,7 @@ under the License.
</prerequisites>
<modules>
- <module>crunch</module>
+ <module>crunch-core</module>
<module>crunch-hbase</module>
<module>crunch-test</module>
<module>crunch-contrib</module>
@@ -103,7 +103,7 @@ under the License.
<dependencies>
<dependency>
<groupId>org.apache.crunch</groupId>
- <artifactId>crunch</artifactId>
+ <artifactId>crunch-core</artifactId>
<version>${project.version}</version>
</dependency>
[04/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/writable/Writables.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/writable/Writables.java b/crunch/src/main/java/org/apache/crunch/types/writable/Writables.java
deleted file mode 100644
index 78cf3ae..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/writable/Writables.java
+++ /dev/null
@@ -1,588 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.writable;
-
-import java.nio.ByteBuffer;
-import java.util.Collection;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.crunch.MapFn;
-import org.apache.crunch.Pair;
-import org.apache.crunch.Tuple;
-import org.apache.crunch.Tuple3;
-import org.apache.crunch.Tuple4;
-import org.apache.crunch.TupleN;
-import org.apache.crunch.fn.CompositeMapFn;
-import org.apache.crunch.fn.IdentityFn;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.PTypes;
-import org.apache.crunch.types.TupleFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.BooleanWritable;
-import org.apache.hadoop.io.BytesWritable;
-import org.apache.hadoop.io.DoubleWritable;
-import org.apache.hadoop.io.FloatWritable;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.MapWritable;
-import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.mapreduce.TaskInputOutputContext;
-
-import com.google.common.collect.ImmutableMap;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-
-/**
- * Defines static methods that are analogous to the methods defined in
- * {@link WritableTypeFamily} for convenient static importing.
- *
- */
-public class Writables {
- private static final MapFn<NullWritable, Void> NULL_WRITABLE_TO_VOID = new MapFn<NullWritable, Void>() {
- @Override
- public Void map(NullWritable input) {
- return null;
- }
- };
-
- private static final MapFn<Void, NullWritable> VOID_TO_NULL_WRITABLE = new MapFn<Void, NullWritable>() {
- @Override
- public NullWritable map(Void input) {
- return NullWritable.get();
- }
- };
-
- private static final MapFn<Text, String> TEXT_TO_STRING = new MapFn<Text, String>() {
- @Override
- public String map(Text input) {
- return input.toString();
- }
- };
-
- private static final MapFn<String, Text> STRING_TO_TEXT = new MapFn<String, Text>() {
- @Override
- public Text map(String input) {
- return new Text(input);
- }
- };
-
- private static final MapFn<IntWritable, Integer> IW_TO_INT = new MapFn<IntWritable, Integer>() {
- @Override
- public Integer map(IntWritable input) {
- return input.get();
- }
- };
-
- private static final MapFn<Integer, IntWritable> INT_TO_IW = new MapFn<Integer, IntWritable>() {
- @Override
- public IntWritable map(Integer input) {
- return new IntWritable(input);
- }
- };
-
- private static final MapFn<LongWritable, Long> LW_TO_LONG = new MapFn<LongWritable, Long>() {
- @Override
- public Long map(LongWritable input) {
- return input.get();
- }
- };
-
- private static final MapFn<Long, LongWritable> LONG_TO_LW = new MapFn<Long, LongWritable>() {
- @Override
- public LongWritable map(Long input) {
- return new LongWritable(input);
- }
- };
-
- private static final MapFn<FloatWritable, Float> FW_TO_FLOAT = new MapFn<FloatWritable, Float>() {
- @Override
- public Float map(FloatWritable input) {
- return input.get();
- }
- };
-
- private static final MapFn<Float, FloatWritable> FLOAT_TO_FW = new MapFn<Float, FloatWritable>() {
- @Override
- public FloatWritable map(Float input) {
- return new FloatWritable(input);
- }
- };
-
- private static final MapFn<DoubleWritable, Double> DW_TO_DOUBLE = new MapFn<DoubleWritable, Double>() {
- @Override
- public Double map(DoubleWritable input) {
- return input.get();
- }
- };
-
- private static final MapFn<Double, DoubleWritable> DOUBLE_TO_DW = new MapFn<Double, DoubleWritable>() {
- @Override
- public DoubleWritable map(Double input) {
- return new DoubleWritable(input);
- }
- };
-
- private static final MapFn<BooleanWritable, Boolean> BW_TO_BOOLEAN = new MapFn<BooleanWritable, Boolean>() {
- @Override
- public Boolean map(BooleanWritable input) {
- return input.get();
- }
- };
-
- private static final BooleanWritable TRUE = new BooleanWritable(true);
- private static final BooleanWritable FALSE = new BooleanWritable(false);
- /**
- * Maps a boxed Boolean onto one of the two shared BooleanWritable constants.
- * The comparison is by value, not by reference, so a true-valued Boolean that is
- * not the canonical Boolean.TRUE instance still maps to TRUE. A null input maps
- * to FALSE.
- */
- private static final MapFn<Boolean, BooleanWritable> BOOLEAN_TO_BW = new MapFn<Boolean, BooleanWritable>() {
- @Override
- public BooleanWritable map(Boolean input) {
- return Boolean.TRUE.equals(input) ? TRUE : FALSE;
- }
- };
-
- private static final MapFn<BytesWritable, ByteBuffer> BW_TO_BB = new MapFn<BytesWritable, ByteBuffer>() {
- @Override
- public ByteBuffer map(BytesWritable input) {
- return ByteBuffer.wrap(input.getBytes(), 0, input.getLength());
- }
- };
-
- /**
- * Copies the readable content of a ByteBuffer (position up to limit) into a new
- * BytesWritable. The buffer's own position is left untouched.
- */
- private static final MapFn<ByteBuffer, BytesWritable> BB_TO_BW = new MapFn<ByteBuffer, BytesWritable>() {
- @Override
- public BytesWritable map(ByteBuffer input) {
- BytesWritable bw = new BytesWritable();
- if (input.hasArray()) {
- // Start at the current position within the backing array and copy only the
- // remaining bytes; limit() alone is wrong for sliced or offset-wrapped buffers.
- bw.set(input.array(), input.arrayOffset() + input.position(), input.remaining());
- } else {
- // No accessible backing array (hasArray() is false): read through a duplicate
- // so the caller's position is not advanced.
- byte[] copy = new byte[input.remaining()];
- input.duplicate().get(copy);
- bw.set(copy, 0, copy.length);
- }
- return bw;
- }
- };
-
- private static <S, W extends Writable> WritableType<S, W> create(Class<S> typeClass, Class<W> writableClass,
- MapFn<W, S> inputDoFn, MapFn<S, W> outputDoFn) {
- return new WritableType<S, W>(typeClass, writableClass, inputDoFn, outputDoFn);
- }
-
- private static final WritableType<Void, NullWritable> nulls = create(Void.class, NullWritable.class,
- NULL_WRITABLE_TO_VOID, VOID_TO_NULL_WRITABLE);
- private static final WritableType<String, Text> strings = create(String.class, Text.class, TEXT_TO_STRING,
- STRING_TO_TEXT);
- private static final WritableType<Long, LongWritable> longs = create(Long.class, LongWritable.class, LW_TO_LONG,
- LONG_TO_LW);
- private static final WritableType<Integer, IntWritable> ints = create(Integer.class, IntWritable.class, IW_TO_INT,
- INT_TO_IW);
- private static final WritableType<Float, FloatWritable> floats = create(Float.class, FloatWritable.class,
- FW_TO_FLOAT, FLOAT_TO_FW);
- private static final WritableType<Double, DoubleWritable> doubles = create(Double.class, DoubleWritable.class,
- DW_TO_DOUBLE, DOUBLE_TO_DW);
- private static final WritableType<Boolean, BooleanWritable> booleans = create(Boolean.class, BooleanWritable.class,
- BW_TO_BOOLEAN, BOOLEAN_TO_BW);
- private static final WritableType<ByteBuffer, BytesWritable> bytes = create(ByteBuffer.class, BytesWritable.class,
- BW_TO_BB, BB_TO_BW);
-
- private static final Map<Class<?>, PType<?>> PRIMITIVES = ImmutableMap.<Class<?>, PType<?>> builder()
- .put(String.class, strings).put(Long.class, longs).put(Integer.class, ints).put(Float.class, floats)
- .put(Double.class, doubles).put(Boolean.class, booleans).put(ByteBuffer.class, bytes).build();
-
- private static final Map<Class<?>, WritableType<?, ?>> EXTENSIONS = Maps.newHashMap();
-
- public static <T> PType<T> getPrimitiveType(Class<T> clazz) {
- return (PType<T>) PRIMITIVES.get(clazz);
- }
-
- public static <T> void register(Class<T> clazz, WritableType<T, ? extends Writable> ptype) {
- EXTENSIONS.put(clazz, ptype);
- }
-
- public static final WritableType<Void, NullWritable> nulls() {
- return nulls;
- }
-
- public static final WritableType<String, Text> strings() {
- return strings;
- }
-
- public static final WritableType<Long, LongWritable> longs() {
- return longs;
- }
-
- public static final WritableType<Integer, IntWritable> ints() {
- return ints;
- }
-
- public static final WritableType<Float, FloatWritable> floats() {
- return floats;
- }
-
- public static final WritableType<Double, DoubleWritable> doubles() {
- return doubles;
- }
-
- public static final WritableType<Boolean, BooleanWritable> booleans() {
- return booleans;
- }
-
- public static final WritableType<ByteBuffer, BytesWritable> bytes() {
- return bytes;
- }
-
- /**
- * Returns the WritableType for the given class. A type previously passed to
- * {@link #register} for this class takes precedence; otherwise a class that
- * implements Writable gets the identity-mapped type from {@link #writables}.
- *
- * @param clazz the record class
- * @return the registered or identity-mapped WritableType for clazz
- * @throws IllegalArgumentException if clazz is not registered and is not a Writable
- */
- public static final <T, W extends Writable> WritableType<T, W> records(Class<T> clazz) {
- if (EXTENSIONS.containsKey(clazz)) {
- return (WritableType<T, W>) EXTENSIONS.get(clazz);
- }
- if (Writable.class.isAssignableFrom(clazz)) {
- return (WritableType<T, W>) writables(clazz.asSubclass(Writable.class));
- } else {
- throw new IllegalArgumentException(
- "Cannot create Writable records from non-Writable class: " + clazz.getCanonicalName());
- }
- }
-
- public static <W extends Writable> WritableType<W, W> writables(Class<W> clazz) {
- MapFn wIdentity = IdentityFn.getInstance();
- return new WritableType<W, W>(clazz, clazz, wIdentity, wIdentity);
- }
-
- public static <K, V> WritableTableType<K, V> tableOf(PType<K> key, PType<V> value) {
- if (key instanceof WritableTableType) {
- WritableTableType wtt = (WritableTableType) key;
- key = pairs(wtt.getKeyType(), wtt.getValueType());
- } else if (!(key instanceof WritableType)) {
- throw new IllegalArgumentException("Key type must be of class WritableType");
- }
- if (value instanceof WritableTableType) {
- WritableTableType wtt = (WritableTableType) value;
- value = pairs(wtt.getKeyType(), wtt.getValueType());
- } else if (!(value instanceof WritableType)) {
- throw new IllegalArgumentException("Value type must be of class WritableType");
- }
- return new WritableTableType((WritableType) key, (WritableType) value);
- }
-
- /**
- * For mapping from {@link TupleWritable} instances to {@link Tuple}s.
- *
- */
- private static class TWTupleMapFn extends MapFn<TupleWritable, Tuple> {
- private final TupleFactory<?> tupleFactory;
- private final List<MapFn> fns;
-
- private transient Object[] values;
-
- public TWTupleMapFn(TupleFactory<?> tupleFactory, PType<?>... ptypes) {
- this.tupleFactory = tupleFactory;
- this.fns = Lists.newArrayList();
- for (PType ptype : ptypes) {
- fns.add(ptype.getInputMapFn());
- }
- }
-
- @Override
- public void configure(Configuration conf) {
- for (MapFn fn : fns) {
- fn.configure(conf);
- }
- }
-
- @Override
- public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
- for (MapFn fn : fns) {
- fn.setContext(context);
- }
- }
-
- @Override
- public void initialize() {
- for (MapFn fn : fns) {
- fn.initialize();
- }
- // The rest of the methods allocate new
- // objects each time. However this one
- // uses Tuple.tuplify which does a copy
- this.values = new Object[fns.size()];
- tupleFactory.initialize();
- }
-
- @Override
- public Tuple map(TupleWritable in) {
- for (int i = 0; i < values.length; i++) {
- if (in.has(i)) {
- values[i] = fns.get(i).map(in.get(i));
- } else {
- values[i] = null;
- }
- }
- return tupleFactory.makeTuple(values);
- }
- }
-
- /**
- * For mapping from {@code Tuple}s to {@code TupleWritable}s.
- *
- */
- private static class TupleTWMapFn extends MapFn<Tuple, TupleWritable> {
-
- private transient TupleWritable writable;
- private transient Writable[] values;
-
- private final List<MapFn> fns;
-
- public TupleTWMapFn(PType<?>... ptypes) {
- this.fns = Lists.newArrayList();
- for (PType<?> ptype : ptypes) {
- fns.add(ptype.getOutputMapFn());
- }
- }
-
- @Override
- public void configure(Configuration conf) {
- for (MapFn fn : fns) {
- fn.configure(conf);
- }
- }
-
- @Override
- public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
- for (MapFn fn : fns) {
- fn.setContext(context);
- }
- }
-
- @Override
- public void initialize() {
- this.values = new Writable[fns.size()];
- this.writable = new TupleWritable(values);
- for (MapFn fn : fns) {
- fn.initialize();
- }
- }
-
- @Override
- public TupleWritable map(Tuple input) {
- writable.clearWritten();
- for (int i = 0; i < input.size(); i++) {
- Object value = input.get(i);
- if (value != null) {
- writable.setWritten(i);
- values[i] = (Writable) fns.get(i).map(value);
- }
- }
- return writable;
- }
- }
-
- public static <V1, V2> WritableType<Pair<V1, V2>, TupleWritable> pairs(PType<V1> p1, PType<V2> p2) {
- TWTupleMapFn input = new TWTupleMapFn(TupleFactory.PAIR, p1, p2);
- TupleTWMapFn output = new TupleTWMapFn(p1, p2);
- return new WritableType(Pair.class, TupleWritable.class, input, output, p1, p2);
- }
-
- public static <V1, V2, V3> WritableType<Tuple3<V1, V2, V3>, TupleWritable> triples(PType<V1> p1, PType<V2> p2,
- PType<V3> p3) {
- TWTupleMapFn input = new TWTupleMapFn(TupleFactory.TUPLE3, p1, p2, p3);
- TupleTWMapFn output = new TupleTWMapFn(p1, p2, p3);
- return new WritableType(Tuple3.class, TupleWritable.class, input, output, p1, p2, p3);
- }
-
- public static <V1, V2, V3, V4> WritableType<Tuple4<V1, V2, V3, V4>, TupleWritable> quads(PType<V1> p1, PType<V2> p2,
- PType<V3> p3, PType<V4> p4) {
- TWTupleMapFn input = new TWTupleMapFn(TupleFactory.TUPLE4, p1, p2, p3, p4);
- TupleTWMapFn output = new TupleTWMapFn(p1, p2, p3, p4);
- return new WritableType(Tuple4.class, TupleWritable.class, input, output, p1, p2, p3, p4);
- }
-
- public static WritableType<TupleN, TupleWritable> tuples(PType... ptypes) {
- TWTupleMapFn input = new TWTupleMapFn(TupleFactory.TUPLEN, ptypes);
- TupleTWMapFn output = new TupleTWMapFn(ptypes);
- return new WritableType(TupleN.class, TupleWritable.class, input, output, ptypes);
- }
-
- public static <T extends Tuple> PType<T> tuples(Class<T> clazz, PType... ptypes) {
- Class[] typeArgs = new Class[ptypes.length];
- for (int i = 0; i < typeArgs.length; i++) {
- typeArgs[i] = ptypes[i].getTypeClass();
- }
- TupleFactory<T> factory = TupleFactory.create(clazz, typeArgs);
- TWTupleMapFn input = new TWTupleMapFn(factory, ptypes);
- TupleTWMapFn output = new TupleTWMapFn(ptypes);
- return new WritableType(clazz, TupleWritable.class, input, output, ptypes);
- }
-
- public static <S, T> PType<T> derived(Class<T> clazz, MapFn<S, T> inputFn, MapFn<T, S> outputFn, PType<S> base) {
- WritableType<S, ?> wt = (WritableType<S, ?>) base;
- MapFn input = new CompositeMapFn(wt.getInputMapFn(), inputFn);
- MapFn output = new CompositeMapFn(outputFn, wt.getOutputMapFn());
- return new WritableType(clazz, wt.getSerializationClass(), input, output, base.getSubTypes().toArray(new PType[0]));
- }
-
- private static class ArrayCollectionMapFn<T> extends MapFn<GenericArrayWritable, Collection<T>> {
- private final MapFn<Object, T> mapFn;
-
- public ArrayCollectionMapFn(MapFn<Object, T> mapFn) {
- this.mapFn = mapFn;
- }
-
- @Override
- public void configure(Configuration conf) {
- mapFn.configure(conf);
- }
-
- @Override
- public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
- mapFn.setContext(context);
- }
-
- @Override
- public void initialize() {
- mapFn.initialize();
- }
-
- @Override
- public Collection<T> map(GenericArrayWritable input) {
- Collection<T> collection = Lists.newArrayList();
- for (Writable writable : input.get()) {
- collection.add(mapFn.map(writable));
- }
- return collection;
- }
- }
-
- private static class CollectionArrayMapFn<T> extends MapFn<Collection<T>, GenericArrayWritable> {
-
- private final Class<? extends Writable> clazz;
- private final MapFn<T, Object> mapFn;
-
- public CollectionArrayMapFn(Class<? extends Writable> clazz, MapFn<T, Object> mapFn) {
- this.clazz = clazz;
- this.mapFn = mapFn;
- }
-
- @Override
- public void configure(Configuration conf) {
- mapFn.configure(conf);
- }
-
- @Override
- public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
- mapFn.setContext(context);
- }
-
- @Override
- public void initialize() {
- mapFn.initialize();
- }
-
- @Override
- public GenericArrayWritable map(Collection<T> input) {
- GenericArrayWritable arrayWritable = new GenericArrayWritable(clazz);
- Writable[] w = new Writable[input.size()];
- int index = 0;
- for (T in : input) {
- w[index++] = ((Writable) mapFn.map(in));
- }
- arrayWritable.set(w);
- return arrayWritable;
- }
- }
-
- public static <T> WritableType<Collection<T>, GenericArrayWritable<T>> collections(PType<T> ptype) {
- WritableType<T, ?> wt = (WritableType<T, ?>) ptype;
- return new WritableType(Collection.class, GenericArrayWritable.class, new ArrayCollectionMapFn(wt.getInputMapFn()),
- new CollectionArrayMapFn(wt.getSerializationClass(), wt.getOutputMapFn()), ptype);
- }
-
- private static class MapInputMapFn<T> extends MapFn<TextMapWritable<Writable>, Map<String, T>> {
- private final MapFn<Writable, T> mapFn;
-
- public MapInputMapFn(MapFn<Writable, T> mapFn) {
- this.mapFn = mapFn;
- }
-
- @Override
- public void configure(Configuration conf) {
- mapFn.configure(conf);
- }
-
- @Override
- public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
- mapFn.setContext(context);
- }
-
- @Override
- public void initialize() {
- mapFn.initialize();
- }
-
- @Override
- public Map<String, T> map(TextMapWritable<Writable> input) {
- Map<String, T> out = Maps.newHashMap();
- for (Map.Entry<Text, Writable> e : input.entrySet()) {
- out.put(e.getKey().toString(), mapFn.map(e.getValue()));
- }
- return out;
- }
- }
-
- private static class MapOutputMapFn<T> extends MapFn<Map<String, T>, TextMapWritable<Writable>> {
-
- private final Class<Writable> clazz;
- private final MapFn<T, Writable> mapFn;
-
- public MapOutputMapFn(Class<Writable> clazz, MapFn<T, Writable> mapFn) {
- this.clazz = clazz;
- this.mapFn = mapFn;
- }
-
- @Override
- public void configure(Configuration conf) {
- mapFn.configure(conf);
- }
-
- @Override
- public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
- mapFn.setContext(context);
- }
-
- @Override
- public void initialize() {
- mapFn.initialize();
- }
-
- @Override
- public TextMapWritable<Writable> map(Map<String, T> input) {
- TextMapWritable<Writable> tmw = new TextMapWritable<Writable>(clazz);
- for (Map.Entry<String, T> e : input.entrySet()) {
- tmw.put(new Text(e.getKey()), mapFn.map(e.getValue()));
- }
- return tmw;
- }
- }
-
- public static <T> WritableType<Map<String, T>, MapWritable> maps(PType<T> ptype) {
- WritableType<T, ?> wt = (WritableType<T, ?>) ptype;
- return new WritableType(Map.class, TextMapWritable.class, new MapInputMapFn(wt.getInputMapFn()),
- new MapOutputMapFn(wt.getSerializationClass(), wt.getOutputMapFn()), ptype);
- }
-
- public static <T> PType<T> jsons(Class<T> clazz) {
- return PTypes.jsonString(clazz, WritableTypeFamily.getInstance());
- }
-
- // Not instantiable
- private Writables() {
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/writable/package-info.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/writable/package-info.java b/crunch/src/main/java/org/apache/crunch/types/writable/package-info.java
deleted file mode 100644
index 7d54743..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/writable/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Business object serialization using Hadoop's Writables framework.
- */
-package org.apache.crunch.types.writable;
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/util/CrunchTool.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/util/CrunchTool.java b/crunch/src/main/java/org/apache/crunch/util/CrunchTool.java
deleted file mode 100644
index ea66291..0000000
--- a/crunch/src/main/java/org/apache/crunch/util/CrunchTool.java
+++ /dev/null
@@ -1,118 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.util;
-
-import java.io.Serializable;
-
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.PipelineExecution;
-import org.apache.crunch.PipelineResult;
-import org.apache.crunch.Source;
-import org.apache.crunch.TableSource;
-import org.apache.crunch.Target;
-import org.apache.crunch.impl.mem.MemPipeline;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.io.At;
-import org.apache.crunch.io.From;
-import org.apache.crunch.io.To;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.util.Tool;
-
-/**
- * An extension of the {@code Tool} interface that creates a {@code Pipeline}
- * instance and provides methods for working with the Pipeline from inside of
- * the Tool's run method.
- *
- */
-public abstract class CrunchTool extends Configured implements Tool, Serializable {
-
- protected static final From from = new From();
- protected static final To to = new To();
- protected static final At at = new At();
-
- // Pipeline object itself isn't necessarily serializable.
- private transient Pipeline pipeline;
-
- public CrunchTool() {
- this(false);
- }
-
- public CrunchTool(boolean inMemory) {
- this.pipeline = inMemory ? MemPipeline.getInstance() : new MRPipeline(getClass());
- }
-
- @Override
- public void setConf(Configuration conf) {
- super.setConf(conf);
- if (conf != null && pipeline != null) {
- pipeline.setConfiguration(conf);
- }
- }
-
- @Override
- public Configuration getConf() {
- return pipeline.getConfiguration();
- }
-
- public void enableDebug() {
- pipeline.enableDebug();
- }
-
- public <T> PCollection<T> read(Source<T> source) {
- return pipeline.read(source);
- }
-
- public <K, V> PTable<K, V> read(TableSource<K, V> tableSource) {
- return pipeline.read(tableSource);
- }
-
- public PCollection<String> readTextFile(String pathName) {
- return pipeline.readTextFile(pathName);
- }
-
- public void write(PCollection<?> pcollection, Target target) {
- pipeline.write(pcollection, target);
- }
-
- public void writeTextFile(PCollection<?> pcollection, String pathName) {
- pipeline.writeTextFile(pcollection, pathName);
- }
-
- public <T> Iterable<T> materialize(PCollection<T> pcollection) {
- return pipeline.materialize(pcollection);
- }
-
- public PipelineResult run() {
- return pipeline.run();
- }
-
- public PipelineExecution runAsync() {
- return pipeline.runAsync();
- }
-
- public PipelineResult done() {
- return pipeline.done();
- }
-
- protected Pipeline getPipeline() {
- return pipeline;
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/util/DistCache.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/util/DistCache.java b/crunch/src/main/java/org/apache/crunch/util/DistCache.java
deleted file mode 100644
index 3e49930..0000000
--- a/crunch/src/main/java/org/apache/crunch/util/DistCache.java
+++ /dev/null
@@ -1,231 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.util;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.ObjectInputStream;
-import java.io.ObjectOutputStream;
-import java.net.URI;
-import java.net.URL;
-import java.net.URLDecoder;
-import java.util.Enumeration;
-
-import org.apache.crunch.CrunchRuntimeException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.filecache.DistributedCache;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-
-/**
- * Provides functions for working with Hadoop's distributed cache. These
- * include:
- * <ul>
- * <li>
- * Functions for working with a job-specific distributed cache of objects, like
- * the serialized runtime nodes in a MapReduce.</li>
- * <li>
- * Functions for adding library jars to the distributed cache, which will be
- * added to the classpath of MapReduce tasks.</li>
- * </ul>
- */
-public class DistCache {
-
- // Configuration key holding the paths of jars to export to the distributed
- // cache.
- private static final String TMPJARS_KEY = "tmpjars";
-
- public static void write(Configuration conf, Path path, Object value) throws IOException {
- ObjectOutputStream oos = new ObjectOutputStream(path.getFileSystem(conf).create(path));
- oos.writeObject(value);
- oos.close();
-
- DistributedCache.addCacheFile(path.toUri(), conf);
- }
-
- public static Object read(Configuration conf, Path path) throws IOException {
- URI target = null;
- for (URI uri : DistributedCache.getCacheFiles(conf)) {
- if (uri.toString().equals(path.toString())) {
- target = uri;
- break;
- }
- }
- Object value = null;
- if (target != null) {
- Path targetPath = new Path(target.toString());
- ObjectInputStream ois = new ObjectInputStream(targetPath.getFileSystem(conf).open(targetPath));
- try {
- value = ois.readObject();
- } catch (ClassNotFoundException e) {
- throw new CrunchRuntimeException(e);
- }
- ois.close();
- }
- return value;
- }
-
- public static void addCacheFile(Path path, Configuration conf) {
- DistributedCache.addCacheFile(path.toUri(), conf);
- }
-
- public static Path getPathToCacheFile(Path path, Configuration conf) {
- try {
- for (Path localPath : DistributedCache.getLocalCacheFiles(conf)) {
- if (localPath.toString().endsWith(path.getName())) {
- return localPath.makeQualified(FileSystem.getLocal(conf));
- }
- }
- } catch (IOException e) {
- throw new CrunchRuntimeException(e);
- }
- return null;
- }
-
- /**
- * Adds the specified jar to the distributed cache of jobs using the provided
- * configuration. The jar will be placed on the classpath of tasks run by the
- * job.
- *
- * @param conf
- * The configuration used to add the jar to the distributed cache.
- * @param jarFile
- * The jar file to add to the distributed cache.
- * @throws IOException
- * If the jar file does not exist or there is a problem accessing
- * the file.
- */
- public static void addJarToDistributedCache(Configuration conf, File jarFile) throws IOException {
- if (!jarFile.exists()) {
- throw new IOException("Jar file: " + jarFile.getCanonicalPath() + " does not exist.");
- }
- if (!jarFile.getName().endsWith(".jar")) {
- throw new IllegalArgumentException("File: " + jarFile.getCanonicalPath() + " is not a .jar " + "file.");
- }
- // Get a qualified path for the jar.
- FileSystem fileSystem = FileSystem.getLocal(conf);
- Path jarPath = new Path(jarFile.getCanonicalPath());
- String qualifiedPath = jarPath.makeQualified(fileSystem).toString();
- // Add the jar to the configuration variable.
- String jarConfiguration = conf.get(TMPJARS_KEY, "");
- if (!jarConfiguration.isEmpty()) {
- jarConfiguration += ",";
- }
- jarConfiguration += qualifiedPath;
- conf.set(TMPJARS_KEY, jarConfiguration);
- }
-
- /**
- * Adds the jar at the specified path to the distributed cache of jobs using
- * the provided configuration. The jar will be placed on the classpath of
- * tasks run by the job.
- *
- * @param conf
- * The configuration used to add the jar to the distributed cache.
- * @param jarFile
- * The path to the jar file to add to the distributed cache.
- * @throws IOException
- * If the jar file does not exist or there is a problem accessing
- * the file.
- */
- public static void addJarToDistributedCache(Configuration conf, String jarFile) throws IOException {
- addJarToDistributedCache(conf, new File(jarFile));
- }
-
- /**
- * Finds the path to a jar that contains the class provided, if any. There is
- * no guarantee that the jar returned will be the first on the classpath to
- * contain the file. This method is basically lifted out of Hadoop's
- * {@link org.apache.hadoop.mapred.JobConf} class.
- *
- * @param jarClass
- * The class the jar file should contain.
- * @return The path to a jar file that contains the class, or
- * <code>null</code> if no such jar exists.
- * @throws IOException
- * If there is a problem searching for the jar file.
- */
- public static String findContainingJar(Class<?> jarClass) throws IOException {
- ClassLoader loader = jarClass.getClassLoader();
- String classFile = jarClass.getName().replaceAll("\\.", "/") + ".class";
- for (Enumeration<URL> itr = loader.getResources(classFile); itr.hasMoreElements();) {
- URL url = itr.nextElement();
- if ("jar".equals(url.getProtocol())) {
- String toReturn = url.getPath();
- if (toReturn.startsWith("file:")) {
- toReturn = toReturn.substring("file:".length());
- }
- // URLDecoder is a misnamed class, since it actually decodes
- // x-www-form-urlencoded MIME type rather than actual
- // URL encoding (which the file path has). Therefore it would
- // decode +s to ' 's which is incorrect (spaces are actually
- // either unencoded or encoded as "%20"). Replace +s first, so
- // that they are kept sacred during the decoding process.
- toReturn = toReturn.replaceAll("\\+", "%2B");
- toReturn = URLDecoder.decode(toReturn, "UTF-8");
- return toReturn.replaceAll("!.*$", "");
- }
- }
- return null;
- }
-
- /**
- * Adds all jars under the specified directory to the distributed cache of
- * jobs using the provided configuration. The jars will be placed on the
- * classpath of tasks run by the job. This method does not descend into
- * subdirectories when adding jars.
- *
- * @param conf
- * The configuration used to add jars to the distributed cache.
- * @param jarDirectory
- * A directory containing jar files to add to the distributed cache.
- * @throws IOException
- * If the directory does not exist or there is a problem accessing
- * the directory.
- */
- public static void addJarDirToDistributedCache(Configuration conf, File jarDirectory) throws IOException {
- if (!jarDirectory.exists() || !jarDirectory.isDirectory()) {
- throw new IOException("Jar directory: " + jarDirectory.getCanonicalPath() + " does not "
- + "exist or is not a directory.");
- }
- for (File file : jarDirectory.listFiles()) {
- if (!file.isDirectory() && file.getName().endsWith(".jar")) {
- addJarToDistributedCache(conf, file);
- }
- }
- }
-
- /**
- * Adds all jars under the directory at the specified path to the distributed
- * cache of jobs using the provided configuration. The jars will be placed on
- * the classpath of the tasks run by the job. This method does not descend
- * into subdirectories when adding jars.
- *
- * @param conf
- * The configuration used to add jars to the distributed cache.
- * @param jarDirectory
- * The path to a directory containing jar files to add to the
- * distributed cache.
- * @throws IOException
- * If the directory does not exist or there is a problem accessing
- * the directory.
- */
- public static void addJarDirToDistributedCache(Configuration conf, String jarDirectory) throws IOException {
- addJarDirToDistributedCache(conf, new File(jarDirectory));
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/util/PartitionUtils.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/util/PartitionUtils.java b/crunch/src/main/java/org/apache/crunch/util/PartitionUtils.java
deleted file mode 100644
index da8db6b..0000000
--- a/crunch/src/main/java/org/apache/crunch/util/PartitionUtils.java
+++ /dev/null
@@ -1,34 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.util;
-
-import org.apache.crunch.PCollection;
-import org.apache.hadoop.conf.Configuration;
-
-/**
- *
- */
-public class PartitionUtils {
- public static final String BYTES_PER_REDUCE_TASK = "crunch.bytes.per.reduce.task";
- public static final long DEFAULT_BYTES_PER_REDUCE_TASK = 1000L * 1000L * 1000L;
-
- public static <T> int getRecommendedPartitions(PCollection<T> pcollection, Configuration conf) {
- long bytesPerTask = conf.getLong(BYTES_PER_REDUCE_TASK, DEFAULT_BYTES_PER_REDUCE_TASK);
- return 1 + (int) (pcollection.getSize() / bytesPerTask);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/util/Tuples.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/util/Tuples.java b/crunch/src/main/java/org/apache/crunch/util/Tuples.java
deleted file mode 100644
index 9c8d7bd..0000000
--- a/crunch/src/main/java/org/apache/crunch/util/Tuples.java
+++ /dev/null
@@ -1,150 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.util;
-
-import java.util.Iterator;
-import java.util.List;
-
-import org.apache.crunch.Pair;
-import org.apache.crunch.Tuple3;
-import org.apache.crunch.Tuple4;
-import org.apache.crunch.TupleN;
-
-import com.google.common.collect.Lists;
-import com.google.common.collect.UnmodifiableIterator;
-
-/**
- * Utilities for working with subclasses of the {@code Tuple} interface.
- *
- */
-public class Tuples {
-
- private static abstract class TuplifyIterator<T> extends UnmodifiableIterator<T> {
- protected List<Iterator<?>> iterators;
-
- public TuplifyIterator(Iterator<?>... iterators) {
- this.iterators = Lists.newArrayList(iterators);
- }
-
- @Override
- public boolean hasNext() {
- for (Iterator<?> iter : iterators) {
- if (!iter.hasNext()) {
- return false;
- }
- }
- return true;
- }
-
- protected Object next(int index) {
- return iterators.get(index).next();
- }
- }
-
- public static class PairIterable<S, T> implements Iterable<Pair<S, T>> {
- private final Iterable<S> first;
- private final Iterable<T> second;
-
- public PairIterable(Iterable<S> first, Iterable<T> second) {
- this.first = first;
- this.second = second;
- }
-
- @Override
- public Iterator<Pair<S, T>> iterator() {
- return new TuplifyIterator<Pair<S, T>>(first.iterator(), second.iterator()) {
- @Override
- public Pair<S, T> next() {
- return Pair.of((S) next(0), (T) next(1));
- }
- };
- }
- }
-
- public static class TripIterable<A, B, C> implements Iterable<Tuple3<A, B, C>> {
- private final Iterable<A> first;
- private final Iterable<B> second;
- private final Iterable<C> third;
-
- public TripIterable(Iterable<A> first, Iterable<B> second, Iterable<C> third) {
- this.first = first;
- this.second = second;
- this.third = third;
- }
-
- @Override
- public Iterator<Tuple3<A, B, C>> iterator() {
- return new TuplifyIterator<Tuple3<A, B, C>>(first.iterator(), second.iterator(), third.iterator()) {
- @Override
- public Tuple3<A, B, C> next() {
- return new Tuple3<A, B, C>((A) next(0), (B) next(1), (C) next(2));
- }
- };
- }
- }
-
- public static class QuadIterable<A, B, C, D> implements Iterable<Tuple4<A, B, C, D>> {
- private final Iterable<A> first;
- private final Iterable<B> second;
- private final Iterable<C> third;
- private final Iterable<D> fourth;
-
- public QuadIterable(Iterable<A> first, Iterable<B> second, Iterable<C> third, Iterable<D> fourth) {
- this.first = first;
- this.second = second;
- this.third = third;
- this.fourth = fourth;
- }
-
- @Override
- public Iterator<Tuple4<A, B, C, D>> iterator() {
- return new TuplifyIterator<Tuple4<A, B, C, D>>(first.iterator(), second.iterator(), third.iterator(),
- fourth.iterator()) {
- @Override
- public Tuple4<A, B, C, D> next() {
- return new Tuple4<A, B, C, D>((A) next(0), (B) next(1), (C) next(2), (D) next(3));
- }
- };
- }
- }
-
- public static class TupleNIterable implements Iterable<TupleN> {
- private final Iterator<?>[] iters;
-
- public TupleNIterable(Iterable<?>... iterables) {
- this.iters = new Iterator[iterables.length];
- for (int i = 0; i < iters.length; i++) {
- iters[i] = iterables[i].iterator();
- }
- }
-
- @Override
- public Iterator<TupleN> iterator() {
- return new TuplifyIterator<TupleN>(iters) {
- @Override
- public TupleN next() {
- Object[] values = new Object[iters.length];
- for (int i = 0; i < values.length; i++) {
- values[i] = next(i);
- }
- return new TupleN(values);
- }
- };
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/util/package-info.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/util/package-info.java b/crunch/src/main/java/org/apache/crunch/util/package-info.java
deleted file mode 100644
index 94d79a1..0000000
--- a/crunch/src/main/java/org/apache/crunch/util/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * An assorted set of utilities.
- */
-package org.apache.crunch.util;
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/crunch/src/main/resources/log4j.properties b/crunch/src/main/resources/log4j.properties
deleted file mode 100644
index 506b527..0000000
--- a/crunch/src/main/resources/log4j.properties
+++ /dev/null
@@ -1,24 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# ***** Set root logger level to INFO and its only appender to A.
-log4j.logger.org.apache.crunch=info, A
-
-# ***** A is set to be a ConsoleAppender.
-log4j.appender.A=org.apache.log4j.ConsoleAppender
-# ***** A uses PatternLayout.
-log4j.appender.A.layout=org.apache.log4j.PatternLayout
-log4j.appender.A.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/site/site.xml
----------------------------------------------------------------------
diff --git a/crunch/src/site/site.xml b/crunch/src/site/site.xml
deleted file mode 100644
index 73fbd17..0000000
--- a/crunch/src/site/site.xml
+++ /dev/null
@@ -1,34 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="${project.name}"
- xmlns="http://maven.apache.org/DECORATION/1.3.0"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/DECORATION/1.3.0
- http://maven.apache.org/xsd/decoration-1.3.0.xsd">
-
- <body>
- <!-- Note: Breadcrumbs for Doxia's Markdown parser are currently broken,
- see https://jira.codehaus.org/browse/DOXIA-472 -->
- <breadcrumbs>
- <item name="Apache" href="http://www.apache.org/index.html" />
- <item name="Crunch" href="../index.html"/>
- </breadcrumbs>
-
- </body>
-
-</project>
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/avro/employee.avsc
----------------------------------------------------------------------
diff --git a/crunch/src/test/avro/employee.avsc b/crunch/src/test/avro/employee.avsc
deleted file mode 100644
index 35726e1..0000000
--- a/crunch/src/test/avro/employee.avsc
+++ /dev/null
@@ -1,26 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-{
-"namespace": "org.apache.crunch.test",
-"name": "Employee",
-"type": "record",
-"fields": [
- {"name": "name", "type": ["string", "null"] },
- {"name": "salary", "type": "int"},
- {"name": "department", "type": ["string", "null"] } ]
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/avro/person.avsc
----------------------------------------------------------------------
diff --git a/crunch/src/test/avro/person.avsc b/crunch/src/test/avro/person.avsc
deleted file mode 100644
index babd808..0000000
--- a/crunch/src/test/avro/person.avsc
+++ /dev/null
@@ -1,26 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-{
-"namespace": "org.apache.crunch.test",
-"name": "Person",
-"type": "record",
-"fields": [
- {"name": "name", "type": ["string", "null"] },
- {"name": "age", "type": "int"},
- {"name": "siblingnames", "type": {"type": "array", "items": "string"}} ]
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/AndFnTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/AndFnTest.java b/crunch/src/test/java/org/apache/crunch/AndFnTest.java
deleted file mode 100644
index 4b00874..0000000
--- a/crunch/src/test/java/org/apache/crunch/AndFnTest.java
+++ /dev/null
@@ -1,77 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.verify;
-import static org.mockito.Mockito.when;
-
-import org.apache.crunch.FilterFn.AndFn;
-import org.apache.hadoop.mapreduce.TaskInputOutputContext;
-import org.junit.Before;
-import org.junit.Test;
-
-public class AndFnTest {
-
- private FilterFn<Integer> fnA;
- private FilterFn<Integer> fnB;
- private AndFn<Integer> andFn;
-
- @Before
- public void setUp() {
- fnA = mock(FilterFn.class);
- fnB = mock(FilterFn.class);
- andFn = new AndFn(fnA, fnB);
- }
-
- @Test
- public void testSetContext() {
- TaskInputOutputContext<?, ?, ?, ?> context = mock(TaskInputOutputContext.class);
- andFn.setContext(context);
-
- verify(fnA).setContext(context);
- verify(fnB).setContext(context);
- }
-
- @Test
- public void testAccept_False() {
- when(fnA.accept(1)).thenReturn(true);
- when(fnB.accept(1)).thenReturn(false);
-
- assertFalse(andFn.accept(1));
- }
-
- @Test
- public void testAccept_True() {
- when(fnA.accept(1)).thenReturn(true);
- when(fnB.accept(1)).thenReturn(true);
-
- assertTrue(andFn.accept(1));
- }
-
- @Test
- public void testCleanup() {
- andFn.cleanup(mock(Emitter.class));
-
- verify(fnA).cleanup();
- verify(fnB).cleanup();
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/CombineFnTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/CombineFnTest.java b/crunch/src/test/java/org/apache/crunch/CombineFnTest.java
deleted file mode 100644
index 39548e2..0000000
--- a/crunch/src/test/java/org/apache/crunch/CombineFnTest.java
+++ /dev/null
@@ -1,222 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.apache.crunch.CombineFn.MAX_BIGINTS;
-import static org.apache.crunch.CombineFn.MAX_DOUBLES;
-import static org.apache.crunch.CombineFn.MAX_FLOATS;
-import static org.apache.crunch.CombineFn.MAX_INTS;
-import static org.apache.crunch.CombineFn.MAX_LONGS;
-import static org.apache.crunch.CombineFn.MIN_BIGINTS;
-import static org.apache.crunch.CombineFn.MIN_DOUBLES;
-import static org.apache.crunch.CombineFn.MIN_FLOATS;
-import static org.apache.crunch.CombineFn.MIN_INTS;
-import static org.apache.crunch.CombineFn.MIN_LONGS;
-import static org.apache.crunch.CombineFn.SUM_BIGINTS;
-import static org.apache.crunch.CombineFn.SUM_DOUBLES;
-import static org.apache.crunch.CombineFn.SUM_FLOATS;
-import static org.apache.crunch.CombineFn.SUM_INTS;
-import static org.apache.crunch.CombineFn.SUM_LONGS;
-import static org.junit.Assert.assertEquals;
-
-import java.math.BigInteger;
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.crunch.CombineFn.Aggregator;
-import org.apache.crunch.CombineFn.AggregatorFactory;
-import org.apache.crunch.CombineFn.FirstNAggregator;
-import org.apache.crunch.CombineFn.LastNAggregator;
-import org.apache.crunch.CombineFn.MaxNAggregator;
-import org.apache.crunch.CombineFn.MinNAggregator;
-import org.apache.crunch.CombineFn.PairAggregator;
-import org.apache.crunch.CombineFn.QuadAggregator;
-import org.apache.crunch.CombineFn.StringConcatAggregator;
-import org.apache.crunch.CombineFn.TripAggregator;
-import org.apache.crunch.CombineFn.TupleNAggregator;
-import org.junit.Test;
-
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Iterables;
-
-public class CombineFnTest {
-
- private <T> Iterable<T> applyAggregator(AggregatorFactory<T> a, Iterable<T> values) {
- return applyAggregator(a.create(), values);
- }
-
- private <T> Iterable<T> applyAggregator(Aggregator<T> a, Iterable<T> values) {
- a.reset();
- for (T value : values) {
- a.update(value);
- }
- return a.results();
- }
-
- @Test
- public void testSums() {
- assertEquals(ImmutableList.of(1775L), applyAggregator(SUM_LONGS, ImmutableList.of(29L, 17L, 1729L)));
-
- assertEquals(ImmutableList.of(1765L), applyAggregator(SUM_LONGS, ImmutableList.of(29L, 7L, 1729L)));
-
- assertEquals(ImmutableList.of(1775), applyAggregator(SUM_INTS, ImmutableList.of(29, 17, 1729)));
-
- assertEquals(ImmutableList.of(1775.0f), applyAggregator(SUM_FLOATS, ImmutableList.of(29f, 17f, 1729f)));
-
- assertEquals(ImmutableList.of(1775.0), applyAggregator(SUM_DOUBLES, ImmutableList.of(29.0, 17.0, 1729.0)));
-
- assertEquals(
- ImmutableList.of(new BigInteger("1775")),
- applyAggregator(SUM_BIGINTS,
- ImmutableList.of(new BigInteger("29"), new BigInteger("17"), new BigInteger("1729"))));
- }
-
- @Test
- public void testMax() {
- assertEquals(ImmutableList.of(1729L), applyAggregator(MAX_LONGS, ImmutableList.of(29L, 17L, 1729L)));
-
- assertEquals(ImmutableList.of(1729), applyAggregator(MAX_INTS, ImmutableList.of(29, 17, 1729)));
-
- assertEquals(ImmutableList.of(1729.0f), applyAggregator(MAX_FLOATS, ImmutableList.of(29f, 17f, 1729f)));
-
- assertEquals(ImmutableList.of(1729.0), applyAggregator(MAX_DOUBLES, ImmutableList.of(29.0, 17.0, 1729.0)));
-
- assertEquals(ImmutableList.of(1745.0f), applyAggregator(MAX_FLOATS, ImmutableList.of(29f, 1745f, 17f, 1729f)));
-
- assertEquals(
- ImmutableList.of(new BigInteger("1729")),
- applyAggregator(MAX_BIGINTS,
- ImmutableList.of(new BigInteger("29"), new BigInteger("17"), new BigInteger("1729"))));
- }
-
- @Test
- public void testMin() {
- assertEquals(ImmutableList.of(17L), applyAggregator(MIN_LONGS, ImmutableList.of(29L, 17L, 1729L)));
-
- assertEquals(ImmutableList.of(17), applyAggregator(MIN_INTS, ImmutableList.of(29, 17, 1729)));
-
- assertEquals(ImmutableList.of(17.0f), applyAggregator(MIN_FLOATS, ImmutableList.of(29f, 17f, 1729f)));
-
- assertEquals(ImmutableList.of(17.0), applyAggregator(MIN_DOUBLES, ImmutableList.of(29.0, 17.0, 1729.0)));
-
- assertEquals(ImmutableList.of(29), applyAggregator(MIN_INTS, ImmutableList.of(29, 170, 1729)));
-
- assertEquals(
- ImmutableList.of(new BigInteger("17")),
- applyAggregator(MIN_BIGINTS,
- ImmutableList.of(new BigInteger("29"), new BigInteger("17"), new BigInteger("1729"))));
- }
-
- @Test
- public void testMaxN() {
- assertEquals(ImmutableList.of(98, 1009),
- applyAggregator(new MaxNAggregator<Integer>(2), ImmutableList.of(17, 34, 98, 29, 1009)));
- }
-
- @Test
- public void testMinN() {
- assertEquals(ImmutableList.of(17, 29),
- applyAggregator(new MinNAggregator<Integer>(2), ImmutableList.of(17, 34, 98, 29, 1009)));
- }
-
- @Test
- public void testFirstN() {
- assertEquals(ImmutableList.of(17, 34),
- applyAggregator(new FirstNAggregator<Integer>(2), ImmutableList.of(17, 34, 98, 29, 1009)));
- }
-
- @Test
- public void testLastN() {
- assertEquals(ImmutableList.of(29, 1009),
- applyAggregator(new LastNAggregator<Integer>(2), ImmutableList.of(17, 34, 98, 29, 1009)));
- }
-
- @Test
- public void testPairs() {
- List<Pair<Long, Double>> input = ImmutableList.of(Pair.of(1720L, 17.29), Pair.of(9L, -3.14));
- Aggregator<Pair<Long, Double>> a = new PairAggregator<Long, Double>(SUM_LONGS.create(), MIN_DOUBLES.create());
- assertEquals(Pair.of(1729L, -3.14), Iterables.getOnlyElement(applyAggregator(a, input)));
- }
-
- @Test
- public void testPairsTwoLongs() {
- List<Pair<Long, Long>> input = ImmutableList.of(Pair.of(1720L, 1L), Pair.of(9L, 19L));
- Aggregator<Pair<Long, Long>> a = new PairAggregator<Long, Long>(SUM_LONGS.create(), SUM_LONGS.create());
- assertEquals(Pair.of(1729L, 20L), Iterables.getOnlyElement(applyAggregator(a, input)));
- }
-
- @Test
- public void testTrips() {
- List<Tuple3<Float, Double, Double>> input = ImmutableList.of(Tuple3.of(17.29f, 12.2, 0.1),
- Tuple3.of(3.0f, 1.2, 3.14), Tuple3.of(-1.0f, 14.5, -0.98));
- Aggregator<Tuple3<Float, Double, Double>> a = new TripAggregator<Float, Double, Double>(MAX_FLOATS.create(),
- MAX_DOUBLES.create(), MIN_DOUBLES.create());
- assertEquals(Tuple3.of(17.29f, 14.5, -0.98), Iterables.getOnlyElement(applyAggregator(a, input)));
- }
-
- @Test
- public void testQuads() {
- List<Tuple4<Float, Double, Double, Integer>> input = ImmutableList.of(Tuple4.of(17.29f, 12.2, 0.1, 1),
- Tuple4.of(3.0f, 1.2, 3.14, 2), Tuple4.of(-1.0f, 14.5, -0.98, 3));
- Aggregator<Tuple4<Float, Double, Double, Integer>> a = new QuadAggregator<Float, Double, Double, Integer>(
- MAX_FLOATS.create(), MAX_DOUBLES.create(), MIN_DOUBLES.create(), SUM_INTS.create());
- assertEquals(Tuple4.of(17.29f, 14.5, -0.98, 6), Iterables.getOnlyElement(applyAggregator(a, input)));
- }
-
- @Test
- public void testTupleN() {
- List<TupleN> input = ImmutableList.of(new TupleN(1, 3.0, 1, 2.0, 4L), new TupleN(4, 17.0, 1, 9.7, 12L));
- Aggregator<TupleN> a = new TupleNAggregator(MIN_INTS.create(), SUM_DOUBLES.create(), MAX_INTS.create(),
- MIN_DOUBLES.create(), MAX_LONGS.create());
- assertEquals(new TupleN(1, 20.0, 1, 2.0, 12L), Iterables.getOnlyElement(applyAggregator(a, input)));
- }
-
- @Test
- public void testConcatenation() {
- String[] arrayNull = new String[] { null, "" };
- assertEquals(ImmutableList.of("foofoobarbar"), applyAggregator(
- new StringConcatAggregator("", true), ImmutableList.of("foo", "foobar", "bar")));
- assertEquals(ImmutableList.of("foo/foobar/bar"), applyAggregator(
- new StringConcatAggregator("/", false), ImmutableList.of("foo", "foobar", "bar")));
- assertEquals(ImmutableList.of(" "), applyAggregator(
- new StringConcatAggregator(" ", true), ImmutableList.of(" ", "")));
- assertEquals(ImmutableList.of(""), applyAggregator(
- new StringConcatAggregator(" ", true), Arrays.asList(arrayNull)));
- assertEquals(ImmutableList.of("foo bar"), applyAggregator(
- new StringConcatAggregator(" ", true, 20, 3), ImmutableList.of("foo", "foobar", "bar")));
- assertEquals(ImmutableList.of("foo foobar"), applyAggregator(
- new StringConcatAggregator(" ", true, 10, 6), ImmutableList.of("foo", "foobar", "bar")));
- assertEquals(ImmutableList.of("foo bar"), applyAggregator(
- new StringConcatAggregator(" ", true, 9, 6), ImmutableList.of("foo", "foobar", "bar")));
- }
-
- @Test
- public void testConcatenationReset() {
- StringConcatAggregator a = new StringConcatAggregator(" ", true, 10, 6);
-
- assertEquals(ImmutableList.of("foo foobar"), applyAggregator(a, ImmutableList.of("foo", "foobar", "bar")));
- assertEquals(ImmutableList.of("foo foobar"), applyAggregator(a, ImmutableList.of("foo", "foobar", "bar")));
- }
-
- @Test(expected = NullPointerException.class)
- public void testConcatenationNullException() {
- String[] arrayNull = new String[] { null, "" };
- assertEquals(ImmutableList.of(""), applyAggregator(
- new StringConcatAggregator(" ", false), Arrays.asList(arrayNull)));
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/NotFnTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/NotFnTest.java b/crunch/src/test/java/org/apache/crunch/NotFnTest.java
deleted file mode 100644
index 8af17a2..0000000
--- a/crunch/src/test/java/org/apache/crunch/NotFnTest.java
+++ /dev/null
@@ -1,72 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.junit.Assert.*;
-import static org.junit.Assert.fail;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.verify;
-import static org.mockito.Mockito.when;
-
-import org.apache.crunch.FilterFn.NotFn;
-import org.apache.hadoop.mapreduce.TaskInputOutputContext;
-import org.junit.Before;
-import org.junit.Test;
-
-public class NotFnTest {
-
- private FilterFn<Integer> base;
- private NotFn<Integer> notFn;
-
- @Before
- public void setUp() {
- base = mock(FilterFn.class);
- notFn = new NotFn(base);
- }
-
- @Test
- public void testSetContext() {
- TaskInputOutputContext<?, ?, ?, ?> context = mock(TaskInputOutputContext.class);
-
- notFn.setContext(context);
-
- verify(base).setContext(context);
- }
-
- @Test
- public void testAccept_True() {
- when(base.accept(1)).thenReturn(true);
-
- assertFalse(notFn.accept(1));
- }
-
- @Test
- public void testAccept_False() {
- when(base.accept(1)).thenReturn(false);
-
- assertTrue(notFn.accept(1));
- }
-
- @Test
- public void testCleanupEmitterOfT() {
- notFn.cleanup(mock(Emitter.class));
-
- verify(base).cleanup();
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/OrFnTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/OrFnTest.java b/crunch/src/test/java/org/apache/crunch/OrFnTest.java
deleted file mode 100644
index fde2376..0000000
--- a/crunch/src/test/java/org/apache/crunch/OrFnTest.java
+++ /dev/null
@@ -1,78 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.verify;
-import static org.mockito.Mockito.when;
-
-import org.apache.crunch.FilterFn.OrFn;
-import org.apache.hadoop.mapreduce.TaskInputOutputContext;
-import org.junit.Before;
-import org.junit.Test;
-
-public class OrFnTest {
-
- private FilterFn<Integer> fnA;
- private FilterFn<Integer> fnB;
- private OrFn<Integer> orFn;
-
- @Before
- public void setUp() {
- fnA = mock(FilterFn.class);
- fnB = mock(FilterFn.class);
- orFn = new OrFn(fnA, fnB);
- }
-
- @Test
- public void testSetContext() {
- TaskInputOutputContext<?, ?, ?, ?> context = mock(TaskInputOutputContext.class);
-
- orFn.setContext(context);
-
- verify(fnA).setContext(context);
- verify(fnB).setContext(context);
- }
-
- @Test
- public void testAccept_True() {
- when(fnA.accept(1)).thenReturn(false);
- when(fnB.accept(1)).thenReturn(true);
-
- assertTrue(orFn.accept(1));
- }
-
- @Test
- public void testAccept_False() {
- when(fnA.accept(1)).thenReturn(false);
- when(fnB.accept(1)).thenReturn(false);
-
- assertFalse(orFn.accept(1));
- }
-
- @Test
- public void testCleanupEmitterOfT() {
- orFn.cleanup(mock(Emitter.class));
-
- verify(fnA).cleanup();
- verify(fnB).cleanup();
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/PairTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/PairTest.java b/crunch/src/test/java/org/apache/crunch/PairTest.java
deleted file mode 100644
index 106413c..0000000
--- a/crunch/src/test/java/org/apache/crunch/PairTest.java
+++ /dev/null
@@ -1,66 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
-import org.junit.Test;
-
-public class PairTest {
-
- @Test
- public void testPairConstructor() {
- Pair<String, Integer> pair = new Pair<String, Integer>("brock", 45);
- test(pair);
- }
-
- @Test
- public void testPairOf() {
- Pair<String, Integer> pair = Pair.of("brock", 45);
- test(pair);
- }
-
- protected void test(Pair<String, Integer> pair) {
- assertTrue(pair.size() == 2);
-
- assertEquals("brock", pair.first());
- assertEquals(new Integer(45), pair.second());
- assertEquals(Pair.of("brock", 45), pair);
-
- assertEquals("brock", pair.get(0));
- assertEquals(new Integer(45), pair.get(1));
-
- try {
- pair.get(-1);
- fail();
- } catch (IndexOutOfBoundsException e) {
- // expected
- }
- }
-
- @Test
- public void testPairComparisons() {
- assertEquals(0, Pair.of(null, null).compareTo(Pair.of(null, null)));
- assertEquals(0, Pair.of(1, 2).compareTo(Pair.of(1, 2)));
- assertTrue(Pair.of(2, "a").compareTo(Pair.of(1, "a")) > 0);
- assertTrue(Pair.of("a", 2).compareTo(Pair.of("a", 1)) > 0);
- assertTrue(Pair.of(null, 17).compareTo(Pair.of(null, 29)) < 0);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/TupleTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/TupleTest.java b/crunch/src/test/java/org/apache/crunch/TupleTest.java
deleted file mode 100644
index b07ec3f..0000000
--- a/crunch/src/test/java/org/apache/crunch/TupleTest.java
+++ /dev/null
@@ -1,139 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
-import org.apache.crunch.types.TupleFactory;
-import org.junit.Test;
-
-public class TupleTest {
- private String first = "foo";
- private Integer second = 1729;
- private Double third = 64.2;
- private Boolean fourth = false;
- private Float fifth = 17.29f;
-
- @Test
- public void testTuple3() {
- Tuple3<String, Integer, Double> t = new Tuple3<String, Integer, Double>(first, second, third);
- assertEquals(3, t.size());
- assertEquals(first, t.first());
- assertEquals(second, t.second());
- assertEquals(third, t.third());
- assertEquals(first, t.get(0));
- assertEquals(second, t.get(1));
- assertEquals(third, t.get(2));
- try {
- t.get(-1);
- fail();
- } catch (IndexOutOfBoundsException e) {
- // expected
- }
- }
-
- @Test
- public void testTuple3Equality() {
- Tuple3<String, Integer, Double> t = new Tuple3<String, Integer, Double>(first, second, third);
- assertTrue(t.equals(new Tuple3(first, second, third)));
- assertFalse(t.equals(new Tuple3(first, null, third)));
- assertFalse((new Tuple3(null, null, null)).equals(t));
- assertTrue((new Tuple3(first, null, null)).equals(new Tuple3(first, null, null)));
- }
-
- @Test
- public void testTuple4() {
- Tuple4<String, Integer, Double, Boolean> t = new Tuple4<String, Integer, Double, Boolean>(first, second, third,
- fourth);
- assertEquals(4, t.size());
- assertEquals(first, t.first());
- assertEquals(second, t.second());
- assertEquals(third, t.third());
- assertEquals(fourth, t.fourth());
- assertEquals(first, t.get(0));
- assertEquals(second, t.get(1));
- assertEquals(third, t.get(2));
- assertEquals(fourth, t.get(3));
- try {
- t.get(-1);
- fail();
- } catch (IndexOutOfBoundsException e) {
- // expected
- }
- }
-
- @Test
- public void testTuple4Equality() {
- Tuple4<String, Integer, Double, Boolean> t = new Tuple4<String, Integer, Double, Boolean>(first, second, third,
- fourth);
- assertFalse(t.equals(new Tuple3(first, second, third)));
- assertFalse(t.equals(new Tuple4(first, null, third, null)));
- assertFalse((new Tuple4(null, null, null, null)).equals(t));
- assertTrue((new Tuple4(first, null, third, null)).equals(new Tuple4(first, null, third, null)));
- }
-
- @Test
- public void testTupleN() {
- TupleN t = new TupleN(first, second, third, fourth, fifth);
- assertEquals(5, t.size());
- assertEquals(first, t.get(0));
- assertEquals(second, t.get(1));
- assertEquals(third, t.get(2));
- assertEquals(fourth, t.get(3));
- assertEquals(fifth, t.get(4));
- try {
- t.get(-1);
- fail();
- } catch (IndexOutOfBoundsException e) {
- // expected
- }
- }
-
- @Test
- public void testTupleNEquality() {
- TupleN t = new TupleN(first, second, third, fourth, fifth);
- assertTrue(t.equals(new TupleN(first, second, third, fourth, fifth)));
- assertFalse(t.equals(new TupleN(first, null, third, null)));
- assertFalse((new TupleN(null, null, null, null, null)).equals(t));
- assertTrue((new TupleN(first, second, third, null, null)).equals(new TupleN(first, second, third, null, null)));
- }
-
- @Test
- public void testTupleFactory() {
- checkTuple(TupleFactory.PAIR.makeTuple("a", "b"), Pair.class, "a", "b");
- checkTuple(TupleFactory.TUPLE3.makeTuple("a", "b", "c"), Tuple3.class, "a", "b", "c");
- checkTuple(TupleFactory.TUPLE4.makeTuple("a", "b", "c", "d"), Tuple4.class, "a", "b", "c", "d");
- checkTuple(TupleFactory.TUPLEN.makeTuple("a", "b", "c", "d", "e"), TupleN.class, "a", "b", "c", "d", "e");
-
- checkTuple(TupleFactory.TUPLEN.makeTuple("a", "b"), TupleN.class, "a", "b");
- checkTuple(TupleFactory.TUPLEN.makeTuple("a", "b", "c"), TupleN.class, "a", "b", "c");
- checkTuple(TupleFactory.TUPLEN.makeTuple("a", "b", "c", "d"), TupleN.class, "a", "b", "c", "d");
- checkTuple(TupleFactory.TUPLEN.makeTuple("a", "b", "c", "d", "e"), TupleN.class, "a", "b", "c", "d", "e");
- }
-
- private void checkTuple(Tuple t, Class<? extends Tuple> type, Object... values) {
- assertEquals(type, t.getClass());
- assertEquals(values.length, t.size());
- for (int i = 0; i < values.length; i++)
- assertEquals(values[i], t.get(i));
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/WriteModeTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/WriteModeTest.java b/crunch/src/test/java/org/apache/crunch/WriteModeTest.java
deleted file mode 100644
index e99ac7b..0000000
--- a/crunch/src/test/java/org/apache/crunch/WriteModeTest.java
+++ /dev/null
@@ -1,103 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.crunch.Target.WriteMode;
-import org.apache.crunch.impl.mem.MemPipeline;
-import org.apache.crunch.io.To;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.avro.Avros;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.ImmutableList;
-
-public class WriteModeTest {
-
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
-
- @Test(expected=CrunchRuntimeException.class)
- public void testDefault() throws Exception {
- run(null, true);
- }
-
- @Test(expected=CrunchRuntimeException.class)
- public void testDefaultNoRun() throws Exception {
- run(null, false);
- }
-
- @Test
- public void testOverwrite() throws Exception {
- Path p = run(WriteMode.OVERWRITE, true);
- PCollection<String> lines = MemPipeline.getInstance().readTextFile(p.toString());
- assertEquals(ImmutableList.of("some", "string", "values"), lines.materialize());
- }
-
- @Test(expected=CrunchRuntimeException.class)
- public void testOverwriteNoRun() throws Exception {
- run(WriteMode.OVERWRITE, false);
- }
-
- @Test
- public void testAppend() throws Exception {
- Path p = run(WriteMode.APPEND, true);
- PCollection<String> lines = MemPipeline.getInstance().readTextFile(p.toString());
- assertEquals(ImmutableList.of("some", "string", "values", "some", "string", "values"),
- lines.materialize());
- }
-
- @Test
- public void testAppendNoRun() throws Exception {
- Path p = run(WriteMode.APPEND, false);
- PCollection<String> lines = MemPipeline.getInstance().readTextFile(p.toString());
- assertEquals(ImmutableList.of("some", "string", "values", "some", "string", "values"),
- lines.materialize());
- }
-
- Path run(WriteMode writeMode, boolean doRun) throws Exception {
- Path output = tmpDir.getPath("existing");
- FileSystem fs = FileSystem.get(tmpDir.getDefaultConfiguration());
- if (fs.exists(output)) {
- fs.delete(output, true);
- }
- Pipeline p = MemPipeline.getInstance();
- PCollection<String> data = MemPipeline.typedCollectionOf(Avros.strings(),
- ImmutableList.of("some", "string", "values"));
- data.write(To.textFile(output));
-
- if (doRun) {
- p.run();
- }
-
- if (writeMode == null) {
- data.write(To.textFile(output));
- } else {
- data.write(To.textFile(output), writeMode);
- }
-
- p.run();
-
- return output;
- }
-}
[11/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/plan/JobPrototype.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/plan/JobPrototype.java b/crunch/src/main/java/org/apache/crunch/impl/mr/plan/JobPrototype.java
deleted file mode 100644
index f22b5a1..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/plan/JobPrototype.java
+++ /dev/null
@@ -1,245 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.plan;
-
-import java.io.IOException;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.Target;
-import org.apache.crunch.hadoop.mapreduce.lib.jobcontrol.CrunchControlledJob;
-import org.apache.crunch.impl.mr.collect.DoTableImpl;
-import org.apache.crunch.impl.mr.collect.PCollectionImpl;
-import org.apache.crunch.impl.mr.collect.PGroupedTableImpl;
-import org.apache.crunch.impl.mr.exec.CrunchJobHooks;
-import org.apache.crunch.impl.mr.run.CrunchCombiner;
-import org.apache.crunch.impl.mr.run.CrunchInputFormat;
-import org.apache.crunch.impl.mr.run.CrunchMapper;
-import org.apache.crunch.impl.mr.run.CrunchReducer;
-import org.apache.crunch.impl.mr.run.NodeContext;
-import org.apache.crunch.impl.mr.run.RTNode;
-import org.apache.crunch.util.DistCache;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.Job;
-
-import com.google.common.collect.HashMultimap;
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.ImmutableSet;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-import com.google.common.collect.Sets;
-
-class JobPrototype {
-
- public static JobPrototype createMapReduceJob(int jobID, PGroupedTableImpl<?, ?> group,
- Set<NodePath> inputs, Path workingPath) {
- return new JobPrototype(jobID, inputs, group, workingPath);
- }
-
- public static JobPrototype createMapOnlyJob(int jobID, HashMultimap<Target, NodePath> mapNodePaths, Path workingPath) {
- return new JobPrototype(jobID, mapNodePaths, workingPath);
- }
-
- private final int jobID; // TODO: maybe stageID sounds better
- private final Set<NodePath> mapNodePaths;
- private final PGroupedTableImpl<?, ?> group;
- private final Set<JobPrototype> dependencies = Sets.newHashSet();
- private final Map<PCollectionImpl<?>, DoNode> nodes = Maps.newHashMap();
- private final Path workingPath;
-
- private HashMultimap<Target, NodePath> targetsToNodePaths;
- private DoTableImpl<?, ?> combineFnTable;
-
- private CrunchControlledJob job;
-
- private JobPrototype(int jobID, Set<NodePath> inputs, PGroupedTableImpl<?, ?> group, Path workingPath) {
- this.jobID = jobID;
- this.mapNodePaths = ImmutableSet.copyOf(inputs);
- this.group = group;
- this.workingPath = workingPath;
- this.targetsToNodePaths = null;
- }
-
- private JobPrototype(int jobID, HashMultimap<Target, NodePath> outputPaths, Path workingPath) {
- this.jobID = jobID;
- this.group = null;
- this.mapNodePaths = null;
- this.workingPath = workingPath;
- this.targetsToNodePaths = outputPaths;
- }
-
- public int getJobID() {
- return jobID;
- }
-
- public boolean isMapOnly() {
- return this.group == null;
- }
-
- Set<NodePath> getMapNodePaths() {
- return mapNodePaths;
- }
-
- PGroupedTableImpl<?, ?> getGroupingTable() {
- return group;
- }
-
- HashMultimap<Target, NodePath> getTargetsToNodePaths() {
- return targetsToNodePaths;
- }
-
- public void addReducePaths(HashMultimap<Target, NodePath> outputPaths) {
- if (group == null) {
- throw new IllegalStateException("Cannot add a reduce phase to a map-only job");
- }
- this.targetsToNodePaths = outputPaths;
- }
-
- public void addDependency(JobPrototype dependency) {
- this.dependencies.add(dependency);
- }
-
- public CrunchControlledJob getCrunchJob(Class<?> jarClass, Configuration conf, Pipeline pipeline) throws IOException {
- if (job == null) {
- job = build(jarClass, conf, pipeline);
- for (JobPrototype proto : dependencies) {
- job.addDependingJob(proto.getCrunchJob(jarClass, conf, pipeline));
- }
- }
- return job;
- }
-
- private CrunchControlledJob build(Class<?> jarClass, Configuration conf, Pipeline pipeline) throws IOException {
- Job job = new Job(conf);
- conf = job.getConfiguration();
- conf.set(PlanningParameters.CRUNCH_WORKING_DIRECTORY, workingPath.toString());
- job.setJarByClass(jarClass);
-
- Set<DoNode> outputNodes = Sets.newHashSet();
- Set<Target> targets = targetsToNodePaths.keySet();
- Path outputPath = new Path(workingPath, "output");
- MSCROutputHandler outputHandler = new MSCROutputHandler(job, outputPath, group == null);
- for (Target target : targets) {
- DoNode node = null;
- for (NodePath nodePath : targetsToNodePaths.get(target)) {
- if (node == null) {
- PCollectionImpl<?> collect = nodePath.tail();
- node = DoNode.createOutputNode(target.toString(), collect.getPType());
- outputHandler.configureNode(node, target);
- }
- outputNodes.add(walkPath(nodePath.descendingIterator(), node));
- }
- }
-
- job.setMapperClass(CrunchMapper.class);
- List<DoNode> inputNodes;
- DoNode reduceNode = null;
- if (group != null) {
- job.setReducerClass(CrunchReducer.class);
- List<DoNode> reduceNodes = Lists.newArrayList(outputNodes);
- serialize(reduceNodes, conf, workingPath, NodeContext.REDUCE);
- reduceNode = reduceNodes.get(0);
-
- if (combineFnTable != null) {
- job.setCombinerClass(CrunchCombiner.class);
- DoNode combinerInputNode = group.createDoNode();
- DoNode combineNode = combineFnTable.createDoNode();
- combineNode.addChild(group.getGroupingNode());
- combinerInputNode.addChild(combineNode);
- serialize(ImmutableList.of(combinerInputNode), conf, workingPath, NodeContext.COMBINE);
- }
-
- group.configureShuffle(job);
-
- DoNode mapOutputNode = group.getGroupingNode();
- Set<DoNode> mapNodes = Sets.newHashSet();
- for (NodePath nodePath : mapNodePaths) {
- // Advance these one step, since we've already configured
- // the grouping node, and the PGroupedTableImpl is the tail
- // of the NodePath.
- Iterator<PCollectionImpl<?>> iter = nodePath.descendingIterator();
- iter.next();
- mapNodes.add(walkPath(iter, mapOutputNode));
- }
- inputNodes = Lists.newArrayList(mapNodes);
- } else { // No grouping
- job.setNumReduceTasks(0);
- inputNodes = Lists.newArrayList(outputNodes);
- }
- serialize(inputNodes, conf, workingPath, NodeContext.MAP);
-
- if (inputNodes.size() == 1) {
- DoNode inputNode = inputNodes.get(0);
- inputNode.getSource().configureSource(job, -1);
- } else {
- for (int i = 0; i < inputNodes.size(); i++) {
- DoNode inputNode = inputNodes.get(i);
- inputNode.getSource().configureSource(job, i);
- }
- job.setInputFormatClass(CrunchInputFormat.class);
- }
- job.setJobName(createJobName(pipeline.getName(), inputNodes, reduceNode));
-
- return new CrunchControlledJob(
- jobID,
- job,
- new CrunchJobHooks.PrepareHook(job),
- new CrunchJobHooks.CompletionHook(job, outputPath, outputHandler.getMultiPaths(), group == null));
- }
-
- private void serialize(List<DoNode> nodes, Configuration conf, Path workingPath, NodeContext context)
- throws IOException {
- List<RTNode> rtNodes = Lists.newArrayList();
- for (DoNode node : nodes) {
- rtNodes.add(node.toRTNode(true, conf, context));
- }
- Path path = new Path(workingPath, context.toString());
- DistCache.write(conf, path, rtNodes);
- }
-
- private String createJobName(String pipelineName, List<DoNode> mapNodes, DoNode reduceNode) {
- JobNameBuilder builder = new JobNameBuilder(pipelineName);
- builder.visit(mapNodes);
- if (reduceNode != null) {
- builder.visit(reduceNode);
- }
- return builder.build();
- }
-
- private DoNode walkPath(Iterator<PCollectionImpl<?>> iter, DoNode working) {
- while (iter.hasNext()) {
- PCollectionImpl<?> collect = iter.next();
- if (combineFnTable != null && !(collect instanceof PGroupedTableImpl)) {
- combineFnTable = null;
- } else if (collect instanceof DoTableImpl && ((DoTableImpl<?, ?>) collect).hasCombineFn()) {
- combineFnTable = (DoTableImpl<?, ?>) collect;
- }
- if (!nodes.containsKey(collect)) {
- nodes.put(collect, collect.createDoNode());
- }
- DoNode parent = nodes.get(collect);
- parent.addChild(working);
- working = parent;
- }
- return working;
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/plan/MSCROutputHandler.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/plan/MSCROutputHandler.java b/crunch/src/main/java/org/apache/crunch/impl/mr/plan/MSCROutputHandler.java
deleted file mode 100644
index 36c565e..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/plan/MSCROutputHandler.java
+++ /dev/null
@@ -1,77 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.plan;
-
-import java.util.Map;
-
-import org.apache.crunch.Target;
-import org.apache.crunch.io.MapReduceTarget;
-import org.apache.crunch.io.OutputHandler;
-import org.apache.crunch.io.PathTarget;
-import org.apache.crunch.types.PType;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.Job;
-
-import com.google.common.collect.Maps;
-
-public class MSCROutputHandler implements OutputHandler {
-
- private final Job job;
- private final Path path;
- private final boolean mapOnlyJob;
-
- private DoNode workingNode;
- private Map<Integer, PathTarget> multiPaths;
- private int jobCount;
-
- public MSCROutputHandler(Job job, Path outputPath, boolean mapOnlyJob) {
- this.job = job;
- this.path = outputPath;
- this.mapOnlyJob = mapOnlyJob;
- this.multiPaths = Maps.newHashMap();
- }
-
- public void configureNode(DoNode node, Target target) {
- workingNode = node;
- target.accept(this, node.getPType());
- }
-
- public boolean configure(Target target, PType<?> ptype) {
- if (target instanceof MapReduceTarget) {
- if (target instanceof PathTarget) {
- multiPaths.put(jobCount, (PathTarget) target);
- }
-
- String name = PlanningParameters.MULTI_OUTPUT_PREFIX + jobCount;
- jobCount++;
- workingNode.setOutputName(name);
- ((MapReduceTarget) target).configureForMapReduce(job, ptype, path, name);
- return true;
- }
-
- return false;
- }
-
- public boolean isMapOnlyJob() {
- return mapOnlyJob;
- }
-
- public Map<Integer, PathTarget> getMultiPaths() {
- return multiPaths;
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/plan/MSCRPlanner.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/plan/MSCRPlanner.java b/crunch/src/main/java/org/apache/crunch/impl/mr/plan/MSCRPlanner.java
deleted file mode 100644
index 3e1de38..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/plan/MSCRPlanner.java
+++ /dev/null
@@ -1,378 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.plan;
-
-import java.io.IOException;
-import java.util.Comparator;
-import java.util.List;
-import java.util.Map;
-import java.util.Map.Entry;
-import java.util.Set;
-import java.util.TreeMap;
-
-import org.apache.crunch.SourceTarget;
-import org.apache.crunch.Target;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.impl.mr.collect.InputCollection;
-import org.apache.crunch.impl.mr.collect.PCollectionImpl;
-import org.apache.crunch.impl.mr.collect.PGroupedTableImpl;
-import org.apache.crunch.impl.mr.exec.MRExecutor;
-import org.apache.crunch.materialize.MaterializableIterable;
-import org.apache.hadoop.conf.Configuration;
-
-import com.google.common.collect.HashMultimap;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-import com.google.common.collect.Multimap;
-import com.google.common.collect.Sets;
-
-public class MSCRPlanner {
-
- private final MRPipeline pipeline;
- private final Map<PCollectionImpl<?>, Set<Target>> outputs;
- private final Map<PCollectionImpl<?>, MaterializableIterable> toMaterialize;
- private int lastJobID = 0;
-
- public MSCRPlanner(MRPipeline pipeline, Map<PCollectionImpl<?>, Set<Target>> outputs,
- Map<PCollectionImpl<?>, MaterializableIterable> toMaterialize) {
- this.pipeline = pipeline;
- this.outputs = new TreeMap<PCollectionImpl<?>, Set<Target>>(DEPTH_COMPARATOR);
- this.outputs.putAll(outputs);
- this.toMaterialize = toMaterialize;
- }
-
- // Used to ensure that we always build pipelines starting from the deepest
- // outputs, which helps ensure that we handle intermediate outputs correctly.
- private static final Comparator<PCollectionImpl<?>> DEPTH_COMPARATOR = new Comparator<PCollectionImpl<?>>() {
- @Override
- public int compare(PCollectionImpl<?> left, PCollectionImpl<?> right) {
- int cmp = right.getDepth() - left.getDepth();
- if (cmp == 0) {
- // Ensure we don't throw away two output collections at the same depth.
- // Using the collection name would be nicer here, but names aren't
- // necessarily unique.
- cmp = new Integer(right.hashCode()).compareTo(left.hashCode());
- }
- return cmp;
- }
- };
-
- public MRExecutor plan(Class<?> jarClass, Configuration conf) throws IOException {
- Map<PCollectionImpl<?>, Set<SourceTarget<?>>> targetDeps = Maps.newTreeMap(DEPTH_COMPARATOR);
- for (PCollectionImpl<?> pcollect : outputs.keySet()) {
- targetDeps.put(pcollect, pcollect.getTargetDependencies());
- }
-
- Multimap<Vertex, JobPrototype> assignments = HashMultimap.create();
- Multimap<PCollectionImpl<?>, Vertex> protoDependency = HashMultimap.create();
- while (!targetDeps.isEmpty()) {
- Set<Target> allTargets = Sets.newHashSet();
- for (PCollectionImpl<?> pcollect : targetDeps.keySet()) {
- allTargets.addAll(outputs.get(pcollect));
- }
- GraphBuilder graphBuilder = new GraphBuilder();
-
- // Walk the current plan tree and build a graph in which the vertices are
- // sources, targets, and GBK operations.
- Set<PCollectionImpl<?>> currentStage = Sets.newHashSet();
- Set<PCollectionImpl<?>> laterStage = Sets.newHashSet();
- for (PCollectionImpl<?> output : targetDeps.keySet()) {
- if (Sets.intersection(allTargets, targetDeps.get(output)).isEmpty()) {
- graphBuilder.visitOutput(output);
- currentStage.add(output);
- } else {
- laterStage.add(output);
- }
- }
-
- Graph baseGraph = graphBuilder.getGraph();
-
- // Create a new graph that splits up up dependent GBK nodes.
- Graph graph = prepareFinalGraph(baseGraph);
-
- // Break the graph up into connected components.
- List<List<Vertex>> components = graph.connectedComponents();
-
- // For each component, we will create one or more job prototypes,
- // depending on its profile.
- // For dependency handling, we only need to care about which
- // job prototype a particular GBK is assigned to.
- for (List<Vertex> component : components) {
- assignments.putAll(constructJobPrototypes(component));
- }
-
- // Add in the job dependency information here.
- for (Map.Entry<Vertex, JobPrototype> e : assignments.entries()) {
- JobPrototype current = e.getValue();
- List<Vertex> parents = graph.getParents(e.getKey());
- for (Vertex parent : parents) {
- for (JobPrototype parentJobProto : assignments.get(parent)) {
- current.addDependency(parentJobProto);
- }
- }
- }
-
- // Add cross-stage dependencies.
- for (PCollectionImpl<?> output : currentStage) {
- Set<Target> targets = outputs.get(output);
- Vertex vertex = graph.getVertexAt(output);
- for (PCollectionImpl<?> later : laterStage) {
- if (!Sets.intersection(targets, targetDeps.get(later)).isEmpty()) {
- protoDependency.put(later, vertex);
- }
- }
- targetDeps.remove(output);
- }
- }
-
- // Cross-job dependencies.
- for (Entry<PCollectionImpl<?>, Vertex> pd : protoDependency.entries()) {
- Vertex d = new Vertex(pd.getKey());
- Vertex dj = pd.getValue();
- for (JobPrototype parent : assignments.get(dj)) {
- for (JobPrototype child : assignments.get(d)) {
- child.addDependency(parent);
- }
- }
- }
-
- // Finally, construct the jobs from the prototypes and return.
- DotfileWriter dotfileWriter = new DotfileWriter();
- MRExecutor exec = new MRExecutor(jarClass, outputs, toMaterialize);
- for (JobPrototype proto : Sets.newHashSet(assignments.values())) {
- dotfileWriter.addJobPrototype(proto);
- exec.addJob(proto.getCrunchJob(jarClass, conf, pipeline));
- }
-
- String planDotFile = dotfileWriter.buildDotfile();
- exec.setPlanDotFile(planDotFile);
- conf.set(PlanningParameters.PIPELINE_PLAN_DOTFILE, planDotFile);
-
- return exec;
- }
-
- private Graph prepareFinalGraph(Graph baseGraph) {
- Graph graph = new Graph();
-
- for (Vertex baseVertex : baseGraph) {
- // Add all of the vertices in the base graph, but no edges (yet).
- graph.addVertex(baseVertex.getPCollection(), baseVertex.isOutput());
- }
-
- for (Edge e : baseGraph.getAllEdges()) {
- // Add back all of the edges where neither vertex is a GBK and we do not
- // have an output feeding into a GBK.
- if (!(e.getHead().isGBK() && e.getTail().isGBK()) &&
- !(e.getHead().isOutput() && e.getTail().isGBK())) {
- Vertex head = graph.getVertexAt(e.getHead().getPCollection());
- Vertex tail = graph.getVertexAt(e.getTail().getPCollection());
- graph.getEdge(head, tail).addAllNodePaths(e.getNodePaths());
- }
- }
-
- for (Vertex baseVertex : baseGraph) {
- if (baseVertex.isGBK()) {
- Vertex vertex = graph.getVertexAt(baseVertex.getPCollection());
- for (Edge e : baseVertex.getIncomingEdges()) {
- if (e.getHead().isOutput()) {
- // Execute an edge split.
- Vertex splitTail = e.getHead();
- PCollectionImpl<?> split = splitTail.getPCollection();
- InputCollection<?> inputNode = handleSplitTarget(split);
- Vertex splitHead = graph.addVertex(inputNode, false);
-
- // Divide up the node paths in the edge between the two GBK nodes so
- // that each node is either owned by GBK1 -> newTail or newHead -> GBK2.
- for (NodePath path : e.getNodePaths()) {
- NodePath headPath = path.splitAt(split, splitHead.getPCollection());
- graph.getEdge(vertex, splitTail).addNodePath(headPath);
- graph.getEdge(splitHead, vertex).addNodePath(path);
- }
-
- // Note the dependency between the vertices in the graph.
- graph.markDependency(splitHead, splitTail);
- } else if (!e.getHead().isGBK()) {
- Vertex newHead = graph.getVertexAt(e.getHead().getPCollection());
- graph.getEdge(newHead, vertex).addAllNodePaths(e.getNodePaths());
- }
- }
- for (Edge e : baseVertex.getOutgoingEdges()) {
- if (!e.getTail().isGBK()) {
- Vertex newTail = graph.getVertexAt(e.getTail().getPCollection());
- graph.getEdge(vertex, newTail).addAllNodePaths(e.getNodePaths());
- } else {
- // Execute an Edge split
- Vertex newGraphTail = graph.getVertexAt(e.getTail().getPCollection());
- PCollectionImpl split = e.getSplit();
- InputCollection<?> inputNode = handleSplitTarget(split);
- Vertex splitTail = graph.addVertex(split, true);
- Vertex splitHead = graph.addVertex(inputNode, false);
-
- // Divide up the node paths in the edge between the two GBK nodes so
- // that each node is either owned by GBK1 -> newTail or newHead -> GBK2.
- for (NodePath path : e.getNodePaths()) {
- NodePath headPath = path.splitAt(split, splitHead.getPCollection());
- graph.getEdge(vertex, splitTail).addNodePath(headPath);
- graph.getEdge(splitHead, newGraphTail).addNodePath(path);
- }
-
- // Note the dependency between the vertices in the graph.
- graph.markDependency(splitHead, splitTail);
- }
- }
- }
- }
-
- return graph;
- }
-
- private Multimap<Vertex, JobPrototype> constructJobPrototypes(List<Vertex> component) {
- Multimap<Vertex, JobPrototype> assignment = HashMultimap.create();
- List<Vertex> gbks = Lists.newArrayList();
- for (Vertex v : component) {
- if (v.isGBK()) {
- gbks.add(v);
- }
- }
-
- if (gbks.isEmpty()) {
- HashMultimap<Target, NodePath> outputPaths = HashMultimap.create();
- for (Vertex v : component) {
- if (v.isInput()) {
- for (Edge e : v.getOutgoingEdges()) {
- for (NodePath nodePath : e.getNodePaths()) {
- PCollectionImpl target = nodePath.tail();
- for (Target t : outputs.get(target)) {
- outputPaths.put(t, nodePath);
- }
- }
- }
- }
- }
- if (outputPaths.isEmpty()) {
- throw new IllegalStateException("No outputs?");
- }
- JobPrototype prototype = JobPrototype.createMapOnlyJob(
- ++lastJobID, outputPaths, pipeline.createTempPath());
- for (Vertex v : component) {
- assignment.put(v, prototype);
- }
- } else {
- Set<Edge> usedEdges = Sets.newHashSet();
- for (Vertex g : gbks) {
- Set<NodePath> inputs = Sets.newHashSet();
- for (Edge e : g.getIncomingEdges()) {
- inputs.addAll(e.getNodePaths());
- usedEdges.add(e);
- }
- JobPrototype prototype = JobPrototype.createMapReduceJob(
- ++lastJobID, (PGroupedTableImpl) g.getPCollection(), inputs, pipeline.createTempPath());
- assignment.put(g, prototype);
- for (Edge e : g.getIncomingEdges()) {
- assignment.put(e.getHead(), prototype);
- usedEdges.add(e);
- }
- HashMultimap<Target, NodePath> outputPaths = HashMultimap.create();
- for (Edge e : g.getOutgoingEdges()) {
- Vertex output = e.getTail();
- for (Target t : outputs.get(output.getPCollection())) {
- outputPaths.putAll(t, e.getNodePaths());
- }
- assignment.put(output, prototype);
- usedEdges.add(e);
- }
- prototype.addReducePaths(outputPaths);
- }
-
- // Check for any un-assigned vertices, which should be map-side outputs
- // that we will need to run in a map-only job.
- HashMultimap<Target, NodePath> outputPaths = HashMultimap.create();
- Set<Vertex> orphans = Sets.newHashSet();
- for (Vertex v : component) {
-
- // Check if this vertex has multiple inputs but only a subset of
- // them have already been assigned
- boolean vertexHasUnassignedIncomingEdges = false;
- if (v.isOutput()) {
- for (Edge e : v.getIncomingEdges()) {
- if (!usedEdges.contains(e)) {
- vertexHasUnassignedIncomingEdges = true;
- }
- }
- }
-
- if (v.isOutput() && (vertexHasUnassignedIncomingEdges || !assignment.containsKey(v))) {
- orphans.add(v);
- for (Edge e : v.getIncomingEdges()) {
- if (vertexHasUnassignedIncomingEdges && usedEdges.contains(e)) {
- // We've already dealt with this incoming edge
- continue;
- }
- orphans.add(e.getHead());
- for (NodePath nodePath : e.getNodePaths()) {
- PCollectionImpl target = nodePath.tail();
- for (Target t : outputs.get(target)) {
- outputPaths.put(t, nodePath);
- }
- }
- }
- }
-
- }
- if (!outputPaths.isEmpty()) {
- JobPrototype prototype = JobPrototype.createMapOnlyJob(
- ++lastJobID, outputPaths, pipeline.createTempPath());
- for (Vertex orphan : orphans) {
- assignment.put(orphan, prototype);
- }
- }
- }
-
- return assignment;
- }
-
- private InputCollection<?> handleSplitTarget(PCollectionImpl<?> splitTarget) {
- if (!outputs.containsKey(splitTarget)) {
- outputs.put(splitTarget, Sets.<Target> newHashSet());
- }
-
- SourceTarget srcTarget = null;
- Target targetToReplace = null;
- for (Target t : outputs.get(splitTarget)) {
- if (t instanceof SourceTarget) {
- srcTarget = (SourceTarget<?>) t;
- break;
- } else {
- srcTarget = t.asSourceTarget(splitTarget.getPType());
- if (srcTarget != null) {
- targetToReplace = t;
- break;
- }
- }
- }
- if (targetToReplace != null) {
- outputs.get(splitTarget).remove(targetToReplace);
- } else if (srcTarget == null) {
- srcTarget = pipeline.createIntermediateOutput(splitTarget.getPType());
- }
- outputs.get(splitTarget).add(srcTarget);
- splitTarget.materializeAt(srcTarget);
-
- return (InputCollection<?>) pipeline.read(srcTarget);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/plan/NodePath.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/plan/NodePath.java b/crunch/src/main/java/org/apache/crunch/impl/mr/plan/NodePath.java
deleted file mode 100644
index a090d93..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/plan/NodePath.java
+++ /dev/null
@@ -1,124 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.plan;
-
-import java.util.Iterator;
-import java.util.LinkedList;
-
-import org.apache.crunch.impl.mr.collect.PCollectionImpl;
-
-import com.google.common.collect.Lists;
-
-class NodePath implements Iterable<PCollectionImpl<?>> {
- private LinkedList<PCollectionImpl<?>> path;
-
- public NodePath() {
- this.path = Lists.newLinkedList();
- }
-
- public NodePath(PCollectionImpl<?> tail) {
- this.path = Lists.newLinkedList();
- this.path.add(tail);
- }
-
- public NodePath(NodePath other) {
- this.path = Lists.newLinkedList(other.path);
- }
-
- public void push(PCollectionImpl<?> stage) {
- this.path.push((PCollectionImpl<?>) stage);
- }
-
- public NodePath close(PCollectionImpl<?> head) {
- this.path.push(head);
- return this;
- }
-
- public Iterator<PCollectionImpl<?>> iterator() {
- return path.iterator();
- }
-
- public Iterator<PCollectionImpl<?>> descendingIterator() {
- return path.descendingIterator();
- }
-
- public PCollectionImpl<?> get(int index) {
- return path.get(index);
- }
-
- public PCollectionImpl<?> head() {
- return path.peekFirst();
- }
-
- public PCollectionImpl<?> tail() {
- return path.peekLast();
- }
-
- @Override
- public boolean equals(Object other) {
- if (other == null || !(other instanceof NodePath)) {
- return false;
- }
- NodePath nodePath = (NodePath) other;
- return path.equals(nodePath.path);
- }
-
- @Override
- public int hashCode() {
- return 17 + 37 * path.hashCode();
- }
-
- @Override
- public String toString() {
- StringBuilder sb = new StringBuilder();
- for (PCollectionImpl<?> collect : path) {
- sb.append(collect.getName() + "|");
- }
- sb.deleteCharAt(sb.length() - 1);
- return sb.toString();
- }
-
- public NodePath splitAt(int splitIndex, PCollectionImpl<?> newHead) {
- NodePath top = new NodePath();
- for (int i = 0; i <= splitIndex; i++) {
- top.path.add(path.get(i));
- }
- LinkedList<PCollectionImpl<?>> nextPath = Lists.newLinkedList();
- nextPath.add(newHead);
- nextPath.addAll(path.subList(splitIndex + 1, path.size()));
- path = nextPath;
- return top;
- }
-
- public NodePath splitAt(PCollectionImpl split, PCollectionImpl<?> newHead) {
- NodePath top = new NodePath();
- int splitIndex = 0;
- for (PCollectionImpl p : path) {
- top.path.add(p);
- if (p == split) {
- break;
- }
- splitIndex++;
- }
- LinkedList<PCollectionImpl<?>> nextPath = Lists.newLinkedList();
- nextPath.add(newHead);
- nextPath.addAll(path.subList(splitIndex + 1, path.size()));
- path = nextPath;
- return top;
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/plan/PlanningParameters.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/plan/PlanningParameters.java b/crunch/src/main/java/org/apache/crunch/impl/mr/plan/PlanningParameters.java
deleted file mode 100644
index b90a911..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/plan/PlanningParameters.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.plan;
-
-/**
- * Collection of Configuration keys and various constants used when planning MapReduce jobs for a
- * pipeline.
- */
-public class PlanningParameters {
-
- public static final String MULTI_OUTPUT_PREFIX = "out";
-
- public static final String CRUNCH_WORKING_DIRECTORY = "crunch.work.dir";
-
- /**
- * Configuration key under which a <a href="http://www.graphviz.org">DOT</a> file containing the
- * pipeline job graph is stored by the planner.
- */
- public static final String PIPELINE_PLAN_DOTFILE = "crunch.planner.dotfile";
-
- private PlanningParameters() {
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/plan/Vertex.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/plan/Vertex.java b/crunch/src/main/java/org/apache/crunch/impl/mr/plan/Vertex.java
deleted file mode 100644
index f4aa668..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/plan/Vertex.java
+++ /dev/null
@@ -1,126 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.plan;
-
-import java.util.List;
-import java.util.Set;
-
-import org.apache.commons.lang.builder.ReflectionToStringBuilder;
-import org.apache.commons.lang.builder.ToStringStyle;
-import org.apache.crunch.Source;
-import org.apache.crunch.impl.mr.collect.InputCollection;
-import org.apache.crunch.impl.mr.collect.PCollectionImpl;
-import org.apache.crunch.impl.mr.collect.PGroupedTableImpl;
-
-import com.google.common.collect.Lists;
-import com.google.common.collect.Sets;
-
-/**
- *
- */
-class Vertex {
- private final PCollectionImpl impl;
-
- private boolean output;
- private Set<Edge> incoming;
- private Set<Edge> outgoing;
-
- public Vertex(PCollectionImpl impl) {
- this.impl = impl;
- this.incoming = Sets.newHashSet();
- this.outgoing = Sets.newHashSet();
- }
-
- public PCollectionImpl getPCollection() {
- return impl;
- }
-
- public boolean isInput() {
- return impl instanceof InputCollection;
- }
-
- public boolean isGBK() {
- return impl instanceof PGroupedTableImpl;
- }
-
- public void setOutput() {
- this.output = true;
- }
-
- public boolean isOutput() {
- return output;
- }
-
- public Source getSource() {
- if (isInput()) {
- return ((InputCollection) impl).getSource();
- }
- return null;
- }
-
- public void addIncoming(Edge edge) {
- this.incoming.add(edge);
- }
-
- public void addOutgoing(Edge edge) {
- this.outgoing.add(edge);
- }
-
- public List<Vertex> getAllNeighbors() {
- List<Vertex> n = Lists.newArrayList();
- for (Edge e : incoming) {
- n.add(e.getHead());
- }
- for (Edge e : outgoing) {
- n.add(e.getTail());
- }
- return n;
- }
-
- public Set<Edge> getAllEdges() {
- return Sets.union(incoming, outgoing);
- }
-
- public Set<Edge> getIncomingEdges() {
- return incoming;
- }
-
- public Set<Edge> getOutgoingEdges() {
- return outgoing;
- }
-
- @Override
- public boolean equals(Object obj) {
- if (obj == null || !(obj instanceof Vertex)) {
- return false;
- }
- Vertex other = (Vertex) obj;
- return impl.equals(other.impl);
- }
-
- @Override
- public int hashCode() {
- return 17 + 37 * impl.hashCode();
- }
-
- @Override
- public String toString() {
- return new ReflectionToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE).setExcludeFieldNames(
- new String[] { "outgoing", "incoming" }).toString();
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchCombiner.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchCombiner.java b/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchCombiner.java
deleted file mode 100644
index 47a3ded..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchCombiner.java
+++ /dev/null
@@ -1,27 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.run;
-
-public class CrunchCombiner extends CrunchReducer {
-
- @Override
- protected NodeContext getNodeContext() {
- return NodeContext.COMBINE;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchInputFormat.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchInputFormat.java b/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchInputFormat.java
deleted file mode 100644
index eb5dd8a..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchInputFormat.java
+++ /dev/null
@@ -1,78 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.run;
-
-import java.io.IOException;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.crunch.io.CrunchInputs;
-import org.apache.crunch.io.FormatBundle;
-import org.apache.hadoop.conf.Configurable;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.InputFormat;
-import org.apache.hadoop.mapreduce.InputSplit;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.JobContext;
-import org.apache.hadoop.mapreduce.RecordReader;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-import org.apache.hadoop.util.ReflectionUtils;
-
-import com.google.common.collect.Lists;
-
-public class CrunchInputFormat<K, V> extends InputFormat<K, V> {
-
- @Override
- public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
- List<InputSplit> splits = Lists.newArrayList();
- Configuration base = job.getConfiguration();
- Map<FormatBundle, Map<Integer, List<Path>>> formatNodeMap = CrunchInputs.getFormatNodeMap(job);
-
- // First, build a map of InputFormats to Paths
- for (Map.Entry<FormatBundle, Map<Integer, List<Path>>> entry : formatNodeMap.entrySet()) {
- FormatBundle inputBundle = entry.getKey();
- Configuration conf = new Configuration(base);
- inputBundle.configure(conf);
- Job jobCopy = new Job(conf);
- InputFormat<?, ?> format = (InputFormat<?, ?>) ReflectionUtils.newInstance(inputBundle.getFormatClass(),
- jobCopy.getConfiguration());
- for (Map.Entry<Integer, List<Path>> nodeEntry : entry.getValue().entrySet()) {
- Integer nodeIndex = nodeEntry.getKey();
- List<Path> paths = nodeEntry.getValue();
- FileInputFormat.setInputPaths(jobCopy, paths.toArray(new Path[paths.size()]));
-
- // Get splits for each input path and tag with InputFormat
- // and Mapper types by wrapping in a TaggedInputSplit.
- List<InputSplit> pathSplits = format.getSplits(jobCopy);
- for (InputSplit pathSplit : pathSplits) {
- splits.add(new CrunchInputSplit(pathSplit, inputBundle.getFormatClass(),
- nodeIndex, jobCopy.getConfiguration()));
- }
- }
- }
- return splits;
- }
-
- @Override
- public RecordReader<K, V> createRecordReader(InputSplit inputSplit, TaskAttemptContext context) throws IOException,
- InterruptedException {
- return new CrunchRecordReader<K, V>(inputSplit, context);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchInputSplit.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchInputSplit.java b/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchInputSplit.java
deleted file mode 100644
index b41062b..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchInputSplit.java
+++ /dev/null
@@ -1,116 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.run;
-
-import java.io.DataInput;
-import java.io.DataInputStream;
-import java.io.DataOutput;
-import java.io.DataOutputStream;
-import java.io.IOException;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.io.serializer.Deserializer;
-import org.apache.hadoop.io.serializer.SerializationFactory;
-import org.apache.hadoop.io.serializer.Serializer;
-import org.apache.hadoop.mapreduce.InputFormat;
-import org.apache.hadoop.mapreduce.InputSplit;
-import org.apache.hadoop.util.ReflectionUtils;
-
-class CrunchInputSplit extends InputSplit implements Writable {
-
- private InputSplit inputSplit;
- private Class<? extends InputFormat<?, ?>> inputFormatClass;
- private int nodeIndex;
- private Configuration conf;
-
- public CrunchInputSplit() {
- // default constructor
- }
-
- public CrunchInputSplit(
- InputSplit inputSplit,
- Class<? extends InputFormat<?, ?>> inputFormatClass,
- int nodeIndex,
- Configuration conf) {
- this.inputSplit = inputSplit;
- this.inputFormatClass = inputFormatClass;
- this.nodeIndex = nodeIndex;
- this.conf = conf;
- }
-
- public Configuration getConf() {
- return conf;
- }
-
- public int getNodeIndex() {
- return nodeIndex;
- }
-
- public InputSplit getInputSplit() {
- return inputSplit;
- }
-
- public Class<? extends InputFormat<?, ?>> getInputFormatClass() {
- return inputFormatClass;
- }
-
- @Override
- public long getLength() throws IOException, InterruptedException {
- return inputSplit.getLength();
- }
-
- @Override
- public String[] getLocations() throws IOException, InterruptedException {
- return inputSplit.getLocations();
- }
-
- public void readFields(DataInput in) throws IOException {
- nodeIndex = in.readInt();
- conf = new Configuration();
- conf.readFields(in);
- inputFormatClass = (Class<? extends InputFormat<?, ?>>) readClass(in);
- Class<? extends InputSplit> inputSplitClass = (Class<? extends InputSplit>) readClass(in);
- inputSplit = (InputSplit) ReflectionUtils.newInstance(inputSplitClass, conf);
- SerializationFactory factory = new SerializationFactory(conf);
- Deserializer deserializer = factory.getDeserializer(inputSplitClass);
- deserializer.open((DataInputStream) in);
- inputSplit = (InputSplit) deserializer.deserialize(inputSplit);
- }
-
- private Class<?> readClass(DataInput in) throws IOException {
- String className = Text.readString(in);
- try {
- return conf.getClassByName(className);
- } catch (ClassNotFoundException e) {
- throw new RuntimeException("readObject can't find class", e);
- }
- }
-
- public void write(DataOutput out) throws IOException {
- out.writeInt(nodeIndex);
- conf.write(out);
- Text.writeString(out, inputFormatClass.getName());
- Text.writeString(out, inputSplit.getClass().getName());
- SerializationFactory factory = new SerializationFactory(conf);
- Serializer serializer = factory.getSerializer(inputSplit.getClass());
- serializer.open((DataOutputStream) out);
- serializer.serialize(inputSplit);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchMapper.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchMapper.java b/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchMapper.java
deleted file mode 100644
index 70f0b01..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchMapper.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.run;
-
-import java.io.IOException;
-import java.util.List;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.crunch.CrunchRuntimeException;
-import org.apache.hadoop.mapreduce.Mapper;
-
-public class CrunchMapper extends Mapper<Object, Object, Object, Object> {
-
- private static final Log LOG = LogFactory.getLog(CrunchMapper.class);
-
- private RTNode node;
- private CrunchTaskContext ctxt;
- private boolean debug;
-
- @Override
- protected void setup(Mapper<Object, Object, Object, Object>.Context context) {
- List<RTNode> nodes;
- this.ctxt = new CrunchTaskContext(context, NodeContext.MAP);
- try {
- nodes = ctxt.getNodes();
- } catch (IOException e) {
- LOG.info("Crunch deserialization error", e);
- throw new CrunchRuntimeException(e);
- }
- if (nodes.size() == 1) {
- this.node = nodes.get(0);
- } else {
- CrunchInputSplit split = (CrunchInputSplit) context.getInputSplit();
- this.node = nodes.get(split.getNodeIndex());
- }
- this.debug = ctxt.isDebugRun();
- }
-
- @Override
- protected void map(Object k, Object v, Mapper<Object, Object, Object, Object>.Context context) {
- if (debug) {
- try {
- node.process(k, v);
- } catch (Exception e) {
- LOG.error("Mapper exception", e);
- }
- } else {
- node.process(k, v);
- }
- }
-
- @Override
- protected void cleanup(Mapper<Object, Object, Object, Object>.Context context) {
- node.cleanup();
- ctxt.cleanup();
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchRecordReader.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchRecordReader.java b/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchRecordReader.java
deleted file mode 100644
index fc8fb32..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchRecordReader.java
+++ /dev/null
@@ -1,75 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.run;
-
-import java.io.IOException;
-
-import org.apache.crunch.hadoop.mapreduce.TaskAttemptContextFactory;
-import org.apache.hadoop.mapreduce.InputFormat;
-import org.apache.hadoop.mapreduce.InputSplit;
-import org.apache.hadoop.mapreduce.RecordReader;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.util.ReflectionUtils;
-
-class CrunchRecordReader<K, V> extends RecordReader<K, V> {
-
- private final RecordReader<K, V> delegate;
-
- public CrunchRecordReader(InputSplit inputSplit, final TaskAttemptContext context) throws IOException,
- InterruptedException {
- CrunchInputSplit crunchSplit = (CrunchInputSplit) inputSplit;
- InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils.newInstance(crunchSplit.getInputFormatClass(),
- crunchSplit.getConf());
- this.delegate = inputFormat.createRecordReader(crunchSplit.getInputSplit(),
- TaskAttemptContextFactory.create(crunchSplit.getConf(), context.getTaskAttemptID()));
- }
-
- @Override
- public void close() throws IOException {
- delegate.close();
- }
-
- @Override
- public K getCurrentKey() throws IOException, InterruptedException {
- return delegate.getCurrentKey();
- }
-
- @Override
- public V getCurrentValue() throws IOException, InterruptedException {
- return delegate.getCurrentValue();
- }
-
- @Override
- public float getProgress() throws IOException, InterruptedException {
- return delegate.getProgress();
- }
-
- @Override
- public void initialize(InputSplit inputSplit, TaskAttemptContext context) throws IOException, InterruptedException {
- CrunchInputSplit crunchSplit = (CrunchInputSplit) inputSplit;
- InputSplit delegateSplit = crunchSplit.getInputSplit();
- delegate.initialize(delegateSplit,
- TaskAttemptContextFactory.create(crunchSplit.getConf(), context.getTaskAttemptID()));
- }
-
- @Override
- public boolean nextKeyValue() throws IOException, InterruptedException {
- return delegate.nextKeyValue();
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchReducer.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchReducer.java b/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchReducer.java
deleted file mode 100644
index e5ddbd2..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchReducer.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.run;
-
-import java.io.IOException;
-import java.util.List;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.crunch.CrunchRuntimeException;
-import org.apache.crunch.impl.SingleUseIterable;
-import org.apache.hadoop.mapreduce.Reducer;
-
-public class CrunchReducer extends Reducer<Object, Object, Object, Object> {
-
- private static final Log LOG = LogFactory.getLog(CrunchReducer.class);
-
- private RTNode node;
- private CrunchTaskContext ctxt;
- private boolean debug;
-
- protected NodeContext getNodeContext() {
- return NodeContext.REDUCE;
- }
-
- @Override
- protected void setup(Reducer<Object, Object, Object, Object>.Context context) {
- this.ctxt = new CrunchTaskContext(context, getNodeContext());
- try {
- List<RTNode> nodes = ctxt.getNodes();
- this.node = nodes.get(0);
- } catch (IOException e) {
- LOG.info("Crunch deserialization error", e);
- throw new CrunchRuntimeException(e);
- }
- this.debug = ctxt.isDebugRun();
- }
-
- @Override
- protected void reduce(Object key, Iterable<Object> values, Reducer<Object, Object, Object, Object>.Context context) {
- values = new SingleUseIterable<Object>(values);
- if (debug) {
- try {
- node.processIterable(key, values);
- } catch (Exception e) {
- LOG.error("Reducer exception", e);
- }
- } else {
- node.processIterable(key, values);
- }
- }
-
- @Override
- protected void cleanup(Reducer<Object, Object, Object, Object>.Context context) {
- node.cleanup();
- ctxt.cleanup();
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchTaskContext.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchTaskContext.java b/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchTaskContext.java
deleted file mode 100644
index c4f2873..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/run/CrunchTaskContext.java
+++ /dev/null
@@ -1,86 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.run;
-
-import java.io.IOException;
-import java.util.List;
-
-import org.apache.crunch.CrunchRuntimeException;
-import org.apache.crunch.impl.mr.plan.PlanningParameters;
-import org.apache.crunch.io.CrunchOutputs;
-import org.apache.crunch.util.DistCache;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.TaskInputOutputContext;
-
-class CrunchTaskContext {
-
- private final TaskInputOutputContext<Object, Object, Object, Object> taskContext;
- private final NodeContext nodeContext;
- private CrunchOutputs<Object, Object> multipleOutputs;
-
- public CrunchTaskContext(TaskInputOutputContext<Object, Object, Object, Object> taskContext, NodeContext nodeContext) {
- this.taskContext = taskContext;
- this.nodeContext = nodeContext;
- }
-
- public TaskInputOutputContext<Object, Object, Object, Object> getContext() {
- return taskContext;
- }
-
- public NodeContext getNodeContext() {
- return nodeContext;
- }
-
- public List<RTNode> getNodes() throws IOException {
- Configuration conf = taskContext.getConfiguration();
- Path path = new Path(new Path(conf.get(PlanningParameters.CRUNCH_WORKING_DIRECTORY)), nodeContext.toString());
- @SuppressWarnings("unchecked")
- List<RTNode> nodes = (List<RTNode>) DistCache.read(conf, path);
- if (nodes != null) {
- for (RTNode node : nodes) {
- node.initialize(this);
- }
- }
- return nodes;
- }
-
- public boolean isDebugRun() {
- Configuration conf = taskContext.getConfiguration();
- return conf.getBoolean(RuntimeParameters.DEBUG, false);
- }
-
- public void cleanup() {
- if (multipleOutputs != null) {
- try {
- multipleOutputs.close();
- } catch (IOException e) {
- throw new CrunchRuntimeException(e);
- } catch (InterruptedException e) {
- throw new CrunchRuntimeException(e);
- }
- }
- }
-
- public CrunchOutputs<Object, Object> getMultipleOutputs() {
- if (multipleOutputs == null) {
- multipleOutputs = new CrunchOutputs<Object, Object>(taskContext);
- }
- return multipleOutputs;
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/run/NodeContext.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/run/NodeContext.java b/crunch/src/main/java/org/apache/crunch/impl/mr/run/NodeContext.java
deleted file mode 100644
index ffc9e7c..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/run/NodeContext.java
+++ /dev/null
@@ -1,35 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.run;
-
-import org.apache.crunch.impl.mr.plan.DoNode;
-
-/**
- * Enum that is associated with a serialized {@link DoNode} instance, so we know
- * how to use it within the context of a particular MR job.
- *
- */
-public enum NodeContext {
- MAP,
- REDUCE,
- COMBINE;
-
- public String getConfigurationKey() {
- return "crunch.donode." + toString().toLowerCase();
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/run/RTNode.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/run/RTNode.java b/crunch/src/main/java/org/apache/crunch/impl/mr/run/RTNode.java
deleted file mode 100644
index ce7b795..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/run/RTNode.java
+++ /dev/null
@@ -1,124 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.run;
-
-import java.io.Serializable;
-import java.util.List;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.crunch.CrunchRuntimeException;
-import org.apache.crunch.DoFn;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.impl.mr.emit.IntermediateEmitter;
-import org.apache.crunch.impl.mr.emit.MultipleOutputEmitter;
-import org.apache.crunch.impl.mr.emit.OutputEmitter;
-import org.apache.crunch.types.Converter;
-import org.apache.crunch.types.PType;
-
-public class RTNode implements Serializable {
-
- private static final Log LOG = LogFactory.getLog(RTNode.class);
-
- private final String nodeName;
- private DoFn<Object, Object> fn;
- private PType<Object> outputPType;
- private final List<RTNode> children;
- private final Converter inputConverter;
- private final Converter outputConverter;
- private final String outputName;
-
- private transient Emitter<Object> emitter;
-
- public RTNode(DoFn<Object, Object> fn, PType<Object> outputPType, String name, List<RTNode> children,
- Converter inputConverter,
- Converter outputConverter, String outputName) {
- this.fn = fn;
- this.outputPType = outputPType;
- this.nodeName = name;
- this.children = children;
- this.inputConverter = inputConverter;
- this.outputConverter = outputConverter;
- this.outputName = outputName;
- }
-
- public void initialize(CrunchTaskContext ctxt) {
- if (emitter != null) {
- // Already initialized
- return;
- }
-
- fn.setContext(ctxt.getContext());
- fn.initialize();
- for (RTNode child : children) {
- child.initialize(ctxt);
- }
-
- if (outputConverter != null) {
- if (outputName != null) {
- this.emitter = new MultipleOutputEmitter(outputConverter, ctxt.getMultipleOutputs(),
- outputName);
- } else {
- this.emitter = new OutputEmitter(outputConverter, ctxt.getContext());
- }
- } else if (!children.isEmpty()) {
- this.emitter = new IntermediateEmitter(outputPType, children,
- ctxt.getContext().getConfiguration());
- } else {
- throw new CrunchRuntimeException("Invalid RTNode config: no emitter for: " + nodeName);
- }
- }
-
- public boolean isLeafNode() {
- return outputConverter != null && children.isEmpty();
- }
-
- public void process(Object input) {
- try {
- fn.process(input, emitter);
- } catch (CrunchRuntimeException e) {
- if (!e.wasLogged()) {
- LOG.info(String.format("Crunch exception in '%s' for input: %s", nodeName, input.toString()), e);
- e.markLogged();
- }
- throw e;
- }
- }
-
- public void process(Object key, Object value) {
- process(inputConverter.convertInput(key, value));
- }
-
- public void processIterable(Object key, Iterable values) {
- process(inputConverter.convertIterableInput(key, values));
- }
-
- public void cleanup() {
- fn.cleanup(emitter);
- emitter.flush();
- for (RTNode child : children) {
- child.cleanup();
- }
- }
-
- @Override
- public String toString() {
- return "RTNode [nodeName=" + nodeName + ", fn=" + fn + ", children=" + children + ", inputConverter="
- + inputConverter + ", outputConverter=" + outputConverter + ", outputName=" + outputName + "]";
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/run/RuntimeParameters.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/run/RuntimeParameters.java b/crunch/src/main/java/org/apache/crunch/impl/mr/run/RuntimeParameters.java
deleted file mode 100644
index 604c49c..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/run/RuntimeParameters.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.run;
-
-/**
- * Parameters used during the runtime execution.
- */
-public class RuntimeParameters {
-
- public static final String AGGREGATOR_BUCKETS = "crunch.aggregator.buckets";
-
- public static final String DEBUG = "crunch.debug";
-
- public static final String TMP_DIR = "crunch.tmp.dir";
-
- public static final String LOG_JOB_PROGRESS = "crunch.log.job.progress";
-
- public static final String CREATE_DIR = "mapreduce.jobcontrol.createdir.ifnotexist";
-
- // Not instantiated
- private RuntimeParameters() {
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/At.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/At.java b/crunch/src/main/java/org/apache/crunch/io/At.java
deleted file mode 100644
index a6f0782..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/At.java
+++ /dev/null
@@ -1,257 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io;
-
-import org.apache.avro.specific.SpecificRecord;
-import org.apache.crunch.SourceTarget;
-import org.apache.crunch.TableSourceTarget;
-import org.apache.crunch.io.avro.AvroFileSourceTarget;
-import org.apache.crunch.io.seq.SeqFileSourceTarget;
-import org.apache.crunch.io.seq.SeqFileTableSourceTarget;
-import org.apache.crunch.io.text.TextFileSourceTarget;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.crunch.types.avro.AvroType;
-import org.apache.crunch.types.avro.Avros;
-import org.apache.crunch.types.writable.Writables;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Writable;
-
-/**
- * <p>Static factory methods for creating common {@link SourceTarget} types, which may be treated as both a {@code Source}
- * and a {@code Target}.</p>
- *
- * <p>The {@code At} methods are analogous to the {@link From} and {@link To} factory methods, but are used for
- * storing intermediate outputs that need to be passed from one run of a MapReduce pipeline to another run. The
- * {@code SourceTarget} object acts as both a {@code Source} and a {@code Target}, which enables it to provide this
- * functionality.
- *
- * <code>
- * Pipeline pipeline = new MRPipeline(this.getClass());
- * // Create our intermediate storage location
- * SourceTarget<String> intermediate = At.textFile("/temptext");
- * ...
- * // Write out the output of the first phase of a pipeline.
- * pipeline.write(phase1, intermediate);
- *
- * // Explicitly call run to kick off the pipeline.
- * pipeline.run();
- *
- * // And then kick off a second phase by consuming the output
- * // from the first phase.
- * PCollection<String> phase2Input = pipeline.read(intermediate);
- * ...
- * </code>
- * </p>
- *
- * <p>The {@code SourceTarget} abstraction is useful when we care about reading the intermediate
- * outputs of a pipeline as well as the final results.</p>
- */
-public class At {
-
-  /**
-   * Builds a {@code SourceTarget<T>} for the Avro file(s) found under the named path.
-   *
-   * @param pathName The name of the path to the data on the filesystem
-   * @param avroClass The {@code SpecificRecord} subclass stored in the Avro file
-   * @return A new {@code SourceTarget<T>} instance
-   */
-  public static <T extends SpecificRecord> SourceTarget<T> avroFile(String pathName, Class<T> avroClass) {
-    Path location = new Path(pathName);
-    return avroFile(location, avroClass);
-  }
-
-  /**
-   * Builds a {@code SourceTarget<T>} for the Avro file(s) found under the given {@code Path}.
-   *
-   * @param path The {@code Path} to the data
-   * @param avroClass The {@code SpecificRecord} subclass stored in the Avro file
-   * @return A new {@code SourceTarget<T>} instance
-   */
-  public static <T extends SpecificRecord> SourceTarget<T> avroFile(Path path, Class<T> avroClass) {
-    AvroType<T> recordType = Avros.specifics(avroClass);
-    return avroFile(path, recordType);
-  }
-
-  /**
-   * Builds a {@code SourceTarget<T>} for the Avro file(s) found under the named path.
-   *
-   * @param pathName The name of the path to the data on the filesystem
-   * @param avroType The {@code AvroType} describing the Avro records
-   * @return A new {@code SourceTarget<T>} instance
-   */
-  public static <T> SourceTarget<T> avroFile(String pathName, AvroType<T> avroType) {
-    Path location = new Path(pathName);
-    return avroFile(location, avroType);
-  }
-
-  /**
-   * Builds a {@code SourceTarget<T>} for the Avro file(s) found under the given {@code Path}.
-   *
-   * @param path The {@code Path} to the data
-   * @param avroType The {@code AvroType} describing the Avro records
-   * @return A new {@code SourceTarget<T>} instance
-   */
-  public static <T> SourceTarget<T> avroFile(Path path, AvroType<T> avroType) {
-    return new AvroFileSourceTarget<T>(path, avroType);
-  }
-
-  /**
-   * Builds a {@code SourceTarget<T>} over the value field of every key-value pair in the
-   * SequenceFile(s) found under the named path.
-   *
-   * @param pathName The name of the path to the data on the filesystem
-   * @param valueClass The {@code Writable} type of the SequenceFile entry's value
-   * @return A new {@code SourceTarget<T>} instance
-   */
-  public static <T extends Writable> SourceTarget<T> sequenceFile(String pathName, Class<T> valueClass) {
-    Path location = new Path(pathName);
-    return sequenceFile(location, valueClass);
-  }
-
-  /**
-   * Builds a {@code SourceTarget<T>} over the value field of every key-value pair in the
-   * SequenceFile(s) found under the given {@code Path}.
-   *
-   * @param path The {@code Path} to the data
-   * @param valueClass The {@code Writable} type of the SequenceFile entry's value
-   * @return A new {@code SourceTarget<T>} instance
-   */
-  public static <T extends Writable> SourceTarget<T> sequenceFile(Path path, Class<T> valueClass) {
-    PType<T> valueType = Writables.writables(valueClass);
-    return sequenceFile(path, valueType);
-  }
-
-  /**
-   * Builds a {@code SourceTarget<T>} over the value field of every key-value pair in the
-   * SequenceFile(s) found under the named path.
-   *
-   * @param pathName The name of the path to the data on the filesystem
-   * @param ptype The {@code PType} of the SequenceFile entry's value
-   * @return A new {@code SourceTarget<T>} instance
-   */
-  public static <T> SourceTarget<T> sequenceFile(String pathName, PType<T> ptype) {
-    Path location = new Path(pathName);
-    return sequenceFile(location, ptype);
-  }
-
-  /**
-   * Builds a {@code SourceTarget<T>} over the value field of every key-value pair in the
-   * SequenceFile(s) found under the given {@code Path}.
-   *
-   * @param path The {@code Path} to the data
-   * @param ptype The {@code PType} of the SequenceFile entry's value
-   * @return A new {@code SourceTarget<T>} instance
-   */
-  public static <T> SourceTarget<T> sequenceFile(Path path, PType<T> ptype) {
-    return new SeqFileSourceTarget<T>(path, ptype);
-  }
-
-  /**
-   * Builds a {@code TableSourceTarget<K, V>} over the key-value pairs of the SequenceFile(s)
-   * found under the named path.
-   *
-   * @param pathName The name of the path to the data on the filesystem
-   * @param keyClass The {@code Writable} type of the SequenceFile entry's key
-   * @param valueClass The {@code Writable} type of the SequenceFile entry's value
-   * @return A new {@code TableSourceTarget<K, V>} instance
-   */
-  public static <K extends Writable, V extends Writable> TableSourceTarget<K, V> sequenceFile(
-      String pathName, Class<K> keyClass, Class<V> valueClass) {
-    Path location = new Path(pathName);
-    return sequenceFile(location, keyClass, valueClass);
-  }
-
-  /**
-   * Builds a {@code TableSourceTarget<K, V>} over the key-value pairs of the SequenceFile(s)
-   * found under the given {@code Path}.
-   *
-   * @param path The {@code Path} to the data
-   * @param keyClass The {@code Writable} type of the SequenceFile entry's key
-   * @param valueClass The {@code Writable} type of the SequenceFile entry's value
-   * @return A new {@code TableSourceTarget<K, V>} instance
-   */
-  public static <K extends Writable, V extends Writable> TableSourceTarget<K, V> sequenceFile(
-      Path path, Class<K> keyClass, Class<V> valueClass) {
-    PType<K> keyType = Writables.writables(keyClass);
-    PType<V> valueType = Writables.writables(valueClass);
-    return sequenceFile(path, keyType, valueType);
-  }
-
-  /**
-   * Builds a {@code TableSourceTarget<K, V>} over the key-value pairs of the SequenceFile(s)
-   * found under the named path.
-   *
-   * @param pathName The name of the path to the data on the filesystem
-   * @param keyType The {@code PType} of the SequenceFile entry's key
-   * @param valueType The {@code PType} of the SequenceFile entry's value
-   * @return A new {@code TableSourceTarget<K, V>} instance
-   */
-  public static <K, V> TableSourceTarget<K, V> sequenceFile(String pathName, PType<K> keyType, PType<V> valueType) {
-    Path location = new Path(pathName);
-    return sequenceFile(location, keyType, valueType);
-  }
-
-  /**
-   * Builds a {@code TableSourceTarget<K, V>} over the key-value pairs of the SequenceFile(s)
-   * found under the given {@code Path}. The table type is obtained from the key type's family.
-   *
-   * @param path The {@code Path} to the data
-   * @param keyType The {@code PType} of the SequenceFile entry's key
-   * @param valueType The {@code PType} of the SequenceFile entry's value
-   * @return A new {@code TableSourceTarget<K, V>} instance
-   */
-  public static <K, V> TableSourceTarget<K, V> sequenceFile(Path path, PType<K> keyType, PType<V> valueType) {
-    PTypeFamily family = keyType.getFamily();
-    return new SeqFileTableSourceTarget<K, V>(path, family.tableOf(keyType, valueType));
-  }
-
-  /**
-   * Builds a {@code SourceTarget<String>} for the text file(s) found under the named path.
-   *
-   * @param pathName The name of the path to the data on the filesystem
-   * @return A new {@code SourceTarget<String>} instance
-   */
-  public static SourceTarget<String> textFile(String pathName) {
-    Path location = new Path(pathName);
-    return textFile(location);
-  }
-
-  /**
-   * Builds a {@code SourceTarget<String>} for the text file(s) found under the given {@code Path}.
-   *
-   * @param path The {@code Path} to the data
-   * @return A new {@code SourceTarget<String>} instance
-   */
-  public static SourceTarget<String> textFile(Path path) {
-    PType<String> lineType = Writables.strings();
-    return textFile(path, lineType);
-  }
-
-  /**
-   * Builds a {@code SourceTarget<T>} for the text file(s) found under the named path, with the
-   * supplied {@code PType<T>} converting the input text.
-   *
-   * @param pathName The name of the path to the data on the filesystem
-   * @param ptype The {@code PType<T>} used to process the input text
-   * @return A new {@code SourceTarget<T>} instance
-   */
-  public static <T> SourceTarget<T> textFile(String pathName, PType<T> ptype) {
-    Path location = new Path(pathName);
-    return textFile(location, ptype);
-  }
-
-  /**
-   * Builds a {@code SourceTarget<T>} for the text file(s) found under the given {@code Path}, with
-   * the supplied {@code PType<T>} converting the input text.
-   *
-   * @param path The {@code Path} to the data
-   * @param ptype The {@code PType<T>} used to process the input text
-   * @return A new {@code SourceTarget<T>} instance
-   */
-  public static <T> SourceTarget<T> textFile(Path path, PType<T> ptype) {
-    return new TextFileSourceTarget<T>(path, ptype);
-  }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/CompositePathIterable.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/CompositePathIterable.java b/crunch/src/main/java/org/apache/crunch/io/CompositePathIterable.java
deleted file mode 100644
index a4723e9..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/CompositePathIterable.java
+++ /dev/null
@@ -1,102 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io;
-
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.util.Collections;
-import java.util.Iterator;
-
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PathFilter;
-
-import com.google.common.collect.UnmodifiableIterator;
-
-/**
- * An {@code Iterable} that chains together the records of several files, reading each file
- * through the supplied {@code FileReaderFactory}.
- */
-public class CompositePathIterable<T> implements Iterable<T> {
-
-  private final FileStatus[] stati;
-  private final FileSystem fs;
-  private final FileReaderFactory<T> readerFactory;
-
-  // Rejects entries whose name starts with an underscore.
-  private static final PathFilter FILTER = new PathFilter() {
-    @Override
-    public boolean accept(Path path) {
-      return !path.getName().startsWith("_");
-    }
-  };
-
-  /**
-   * Lists the entries under {@code path} (minus those rejected by the filter) and returns an
-   * {@code Iterable} over their combined contents.
-   *
-   * @param fs The filesystem that holds {@code path}
-   * @param path The location to list
-   * @param readerFactory Opens each listed file as an iterator of records
-   * @return An empty list when nothing passes the filter, otherwise a composite iterable
-   * @throws IOException if {@code path} does not exist or cannot be listed
-   */
-  public static <S> Iterable<S> create(FileSystem fs, Path path, FileReaderFactory<S> readerFactory) throws IOException {
-
-    if (!fs.exists(path)) {
-      throw new IOException("No files found to materialize at: " + path);
-    }
-
-    FileStatus[] stati;
-    try {
-      stati = fs.listStatus(path, FILTER);
-    } catch (FileNotFoundException e) {
-      // Same failure as before, but keep the original exception as the cause for diagnosis.
-      throw new IOException("No files found to materialize at: " + path, e);
-    }
-    if (stati == null) {
-      throw new IOException("No files found to materialize at: " + path);
-    }
-
-    if (stati.length == 0) {
-      return Collections.emptyList();
-    }
-    return new CompositePathIterable<S>(stati, fs, readerFactory);
-  }
-
-  // Only reached through create(), which never passes an empty array.
-  private CompositePathIterable(FileStatus[] stati, FileSystem fs, FileReaderFactory<T> readerFactory) {
-    this.stati = stati;
-    this.fs = fs;
-    this.readerFactory = readerFactory;
-  }
-
-  /**
-   * Returns an iterator that walks the files in listing order. The first file is opened as soon
-   * as the iterator is created; later files are opened when the current one is exhausted.
-   */
-  @Override
-  public Iterator<T> iterator() {
-
-    return new UnmodifiableIterator<T>() {
-      private int index = 0;
-      private Iterator<T> iter = readerFactory.read(fs, stati[index++].getPath());
-
-      @Override
-      public boolean hasNext() {
-        // Move on through the remaining files until one of them yields an element.
-        while (!iter.hasNext()) {
-          if (index >= stati.length) {
-            return false;
-          }
-          iter = readerFactory.read(fs, stati[index++].getPath());
-        }
-        return true;
-      }
-
-      @Override
-      public T next() {
-        // hasNext() performs the hop to the next file, so next() stays correct even when the
-        // caller does not call hasNext() first.
-        if (!hasNext()) {
-          throw new java.util.NoSuchElementException();
-        }
-        return iter.next();
-      }
-    };
-  }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/CrunchInputs.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/CrunchInputs.java b/crunch/src/main/java/org/apache/crunch/io/CrunchInputs.java
deleted file mode 100644
index d154db2..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/CrunchInputs.java
+++ /dev/null
@@ -1,71 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io;
-
-import java.util.List;
-import java.util.Map;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.InputFormat;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.JobContext;
-
-import com.google.common.base.Joiner;
-import com.google.common.base.Splitter;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-
-/**
- * Helper functions for configuring multiple {@code InputFormat} instances within a single
- * Crunch MapReduce job.
- */
-public class CrunchInputs {
-  public static final String CRUNCH_INPUTS = "crunch.inputs.dir";
-
-  // Each record is "bundle;nodeIndex;path"; records are joined with ','.
-  // NOTE(review): nothing is escaped, so a path (or serialized bundle) containing ',' or ';'
-  // would not survive the round trip through getFormatNodeMap.
-  private static final char RECORD_SEP = ',';
-  private static final char FIELD_SEP = ';';
-  private static final Joiner JOINER = Joiner.on(FIELD_SEP);
-  private static final Splitter SPLITTER = Splitter.on(FIELD_SEP);
-
-  /**
-   * Appends one input record to the {@link #CRUNCH_INPUTS} property of the job's configuration.
-   *
-   * @param job The job whose configuration is updated
-   * @param path The input path to register
-   * @param inputBundle The format bundle to record for the path
-   * @param nodeIndex The node index to record for the path
-   */
-  public static void addInputPath(Job job, Path path, FormatBundle inputBundle, int nodeIndex) {
-    Configuration conf = job.getConfiguration();
-    String inputs = JOINER.join(inputBundle.serialize(), String.valueOf(nodeIndex), path.toString());
-    String existing = conf.get(CRUNCH_INPUTS);
-    conf.set(CRUNCH_INPUTS, existing == null ? inputs : existing + RECORD_SEP + inputs);
-  }
-
-  /**
-   * Parses the {@link #CRUNCH_INPUTS} property back into a map from format bundle to node index
-   * to the paths registered for that pair.
-   *
-   * @param job The job context whose configuration is read
-   * @return The parsed map; empty when the property is unset or empty
-   */
-  public static Map<FormatBundle, Map<Integer, List<Path>>> getFormatNodeMap(JobContext job) {
-    Map<FormatBundle, Map<Integer, List<Path>>> formatNodeMap = Maps.newHashMap();
-    String serialized = job.getConfiguration().get(CRUNCH_INPUTS);
-    if (serialized == null || serialized.isEmpty()) {
-      // No input was registered: Splitter.split(null) would throw, and "" has no fields to read.
-      return formatNodeMap;
-    }
-    for (String input : Splitter.on(RECORD_SEP).split(serialized)) {
-      List<String> fields = Lists.newArrayList(SPLITTER.split(input));
-      FormatBundle<InputFormat> inputBundle = FormatBundle.fromSerialized(fields.get(0), InputFormat.class);
-      Integer nodeIndex = Integer.valueOf(fields.get(1));
-
-      Map<Integer, List<Path>> nodeMap = formatNodeMap.get(inputBundle);
-      if (nodeMap == null) {
-        nodeMap = Maps.newHashMap();
-        formatNodeMap.put(inputBundle, nodeMap);
-      }
-      List<Path> paths = nodeMap.get(nodeIndex);
-      if (paths == null) {
-        paths = Lists.newLinkedList();
-        nodeMap.put(nodeIndex, paths);
-      }
-      paths.add(new Path(fields.get(2)));
-    }
-    return formatNodeMap;
-  }
-
-}
[39/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/resources/maugham.txt
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/resources/maugham.txt b/crunch-core/src/it/resources/maugham.txt
new file mode 100644
index 0000000..16c45e8
--- /dev/null
+++ b/crunch-core/src/it/resources/maugham.txt
@@ -0,0 +1,29112 @@
+The Project Gutenberg EBook of Of Human Bondage, by W. Somerset Maugham
+
+This eBook is for the use of anyone anywhere at no cost and with
+almost no restrictions whatsoever. You may copy it, give it away or
+re-use it under the terms of the Project Gutenberg License included
+with this eBook or online at www.gutenberg.net
+
+
+Title: Of Human Bondage
+
+Author: W. Somerset Maugham
+
+Release Date: May 6, 2008 [EBook #351]
+
+Language: English
+
+
+*** START OF THIS PROJECT GUTENBERG EBOOK OF HUMAN BONDAGE ***
+
+
+
+
+
+
+
+
+
+
+
+
+OF HUMAN BONDAGE
+
+
+BY
+
+W. SOMERSET MAUGHAM
+
+
+
+
+I
+
+The day broke gray and dull. The clouds hung heavily, and there was a
+rawness in the air that suggested snow. A woman servant came into a room
+in which a child was sleeping and drew the curtains. She glanced
+mechanically at the house opposite, a stucco house with a portico, and
+went to the child's bed.
+
+"Wake up, Philip," she said.
+
+She pulled down the bed-clothes, took him in her arms, and carried him
+downstairs. He was only half awake.
+
+"Your mother wants you," she said.
+
+She opened the door of a room on the floor below and took the child over
+to a bed in which a woman was lying. It was his mother. She stretched out
+her arms, and the child nestled by her side. He did not ask why he had
+been awakened. The woman kissed his eyes, and with thin, small hands felt
+the warm body through his white flannel nightgown. She pressed him closer
+to herself.
+
+"Are you sleepy, darling?" she said.
+
+Her voice was so weak that it seemed to come already from a great
+distance. The child did not answer, but smiled comfortably. He was very
+happy in the large, warm bed, with those soft arms about him. He tried to
+make himself smaller still as he cuddled up against his mother, and he
+kissed her sleepily. In a moment he closed his eyes and was fast asleep.
+The doctor came forwards and stood by the bed-side.
+
+"Oh, don't take him away yet," she moaned.
+
+The doctor, without answering, looked at her gravely. Knowing she would
+not be allowed to keep the child much longer, the woman kissed him again;
+and she passed her hand down his body till she came to his feet; she held
+the right foot in her hand and felt the five small toes; and then slowly
+passed her hand over the left one. She gave a sob.
+
+"What's the matter?" said the doctor. "You're tired."
+
+She shook her head, unable to speak, and the tears rolled down her cheeks.
+The doctor bent down.
+
+"Let me take him."
+
+She was too weak to resist his wish, and she gave the child up. The doctor
+handed him back to his nurse.
+
+"You'd better put him back in his own bed."
+
+"Very well, sir." The little boy, still sleeping, was taken away. His
+mother sobbed now broken-heartedly.
+
+"What will happen to him, poor child?"
+
+The monthly nurse tried to quiet her, and presently, from exhaustion, the
+crying ceased. The doctor walked to a table on the other side of the room,
+upon which, under a towel, lay the body of a still-born child. He lifted
+the towel and looked. He was hidden from the bed by a screen, but the
+woman guessed what he was doing.
+
+"Was it a girl or a boy?" she whispered to the nurse.
+
+"Another boy."
+
+The woman did not answer. In a moment the child's nurse came back. She
+approached the bed.
+
+"Master Philip never woke up," she said. There was a pause. Then the
+doctor felt his patient's pulse once more.
+
+"I don't think there's anything I can do just now," he said. "I'll call
+again after breakfast."
+
+"I'll show you out, sir," said the child's nurse.
+
+They walked downstairs in silence. In the hall the doctor stopped.
+
+"You've sent for Mrs. Carey's brother-in-law, haven't you?"
+
+"Yes, sir."
+
+"D'you know at what time he'll be here?"
+
+"No, sir, I'm expecting a telegram."
+
+"What about the little boy? I should think he'd be better out of the way."
+
+"Miss Watkin said she'd take him, sir."
+
+"Who's she?"
+
+"She's his godmother, sir. D'you think Mrs. Carey will get over it, sir?"
+
+The doctor shook his head.
+
+
+
+II
+
+It was a week later. Philip was sitting on the floor in the drawing-room
+at Miss Watkin's house in Onslow gardens. He was an only child and used to
+amusing himself. The room was filled with massive furniture, and on each
+of the sofas were three big cushions. There was a cushion too in each
+arm-chair. All these he had taken and, with the help of the gilt rout
+chairs, light and easy to move, had made an elaborate cave in which he
+could hide himself from the Red Indians who were lurking behind the
+curtains. He put his ear to the floor and listened to the herd of
+buffaloes that raced across the prairie. Presently, hearing the door open,
+he held his breath so that he might not be discovered; but a violent hand
+piled away a chair and the cushions fell down.
+
+"You naughty boy, Miss Watkin WILL be cross with you."
+
+"Hulloa, Emma!" he said.
+
+The nurse bent down and kissed him, then began to shake out the cushions,
+and put them back in their places.
+
+"Am I to come home?" he asked.
+
+"Yes, I've come to fetch you."
+
+"You've got a new dress on."
+
+It was in eighteen-eighty-five, and she wore a bustle. Her gown was of
+black velvet, with tight sleeves and sloping shoulders, and the skirt had
+three large flounces. She wore a black bonnet with velvet strings. She
+hesitated. The question she had expected did not come, and so she could
+not give the answer she had prepared.
+
+"Aren't you going to ask how your mamma is?" she said at length.
+
+"Oh, I forgot. How is mamma?"
+
+Now she was ready.
+
+"Your mamma is quite well and happy."
+
+"Oh, I am glad."
+
+"Your mamma's gone away. You won't ever see her any more." Philip did not
+know what she meant.
+
+"Why not?"
+
+"Your mamma's in heaven."
+
+She began to cry, and Philip, though he did not quite understand, cried
+too. Emma was a tall, big-boned woman, with fair hair and large features.
+She came from Devonshire and, notwithstanding her many years of service in
+London, had never lost the breadth of her accent. Her tears increased her
+emotion, and she pressed the little boy to her heart. She felt vaguely the
+pity of that child deprived of the only love in the world that is quite
+unselfish. It seemed dreadful that he must be handed over to strangers.
+But in a little while she pulled herself together.
+
+"Your Uncle William is waiting in to see you," she said. "Go and say
+good-bye to Miss Watkin, and we'll go home."
+
+"I don't want to say good-bye," he answered, instinctively anxious to hide
+his tears.
+
+"Very well, run upstairs and get your hat."
+
+He fetched it, and when he came down Emma was waiting for him in the hall.
+He heard the sound of voices in the study behind the dining-room. He
+paused. He knew that Miss Watkin and her sister were talking to friends,
+and it seemed to him--he was nine years old--that if he went in they would
+be sorry for him.
+
+"I think I'll go and say good-bye to Miss Watkin."
+
+"I think you'd better," said Emma.
+
+"Go in and tell them I'm coming," he said.
+
+He wished to make the most of his opportunity. Emma knocked at the door
+and walked in. He heard her speak.
+
+"Master Philip wants to say good-bye to you, miss."
+
+There was a sudden hush of the conversation, and Philip limped in.
+Henrietta Watkin was a stout woman, with a red face and dyed hair. In
+those days to dye the hair excited comment, and Philip had heard much
+gossip at home when his godmother's changed colour. She lived with an
+elder sister, who had resigned herself contentedly to old age. Two ladies,
+whom Philip did not know, were calling, and they looked at him curiously.
+
+"My poor child," said Miss Watkin, opening her arms.
+
+She began to cry. Philip understood now why she had not been in to
+luncheon and why she wore a black dress. She could not speak.
+
+"I've got to go home," said Philip, at last.
+
+He disengaged himself from Miss Watkin's arms, and she kissed him again.
+Then he went to her sister and bade her good-bye too. One of the strange
+ladies asked if she might kiss him, and he gravely gave her permission.
+Though crying, he keenly enjoyed the sensation he was causing; he would
+have been glad to stay a little longer to be made much of, but felt they
+expected him to go, so he said that Emma was waiting for him. He went out
+of the room. Emma had gone downstairs to speak with a friend in the
+basement, and he waited for her on the landing. He heard Henrietta
+Watkin's voice.
+
+"His mother was my greatest friend. I can't bear to think that she's
+dead."
+
+"You oughtn't to have gone to the funeral, Henrietta," said her sister. "I
+knew it would upset you."
+
+Then one of the strangers spoke.
+
+"Poor little boy, it's dreadful to think of him quite alone in the world.
+I see he limps."
+
+"Yes, he's got a club-foot. It was such a grief to his mother."
+
+Then Emma came back. They called a hansom, and she told the driver where
+to go.
+
+
+
+III
+
+
+When they reached the house Mrs. Carey had died in--it was in a dreary,
+respectable street between Notting Hill Gate and High Street,
+Kensington--Emma led Philip into the drawing-room. His uncle was writing
+letters of thanks for the wreaths which had been sent. One of them, which
+had arrived too late for the funeral, lay in its cardboard box on the
+hall-table.
+
+"Here's Master Philip," said Emma.
+
+Mr. Carey stood up slowly and shook hands with the little boy. Then on
+second thoughts he bent down and kissed his forehead. He was a man of
+somewhat less than average height, inclined to corpulence, with his hair,
+worn long, arranged over the scalp so as to conceal his baldness. He was
+clean-shaven. His features were regular, and it was possible to imagine
+that in his youth he had been good-looking. On his watch-chain he wore a
+gold cross.
+
+"You're going to live with me now, Philip," said Mr. Carey. "Shall you
+like that?"
+
+Two years before Philip had been sent down to stay at the vicarage after
+an attack of chicken-pox; but there remained with him a recollection of an
+attic and a large garden rather than of his uncle and aunt.
+
+"Yes."
+
+"You must look upon me and your Aunt Louisa as your father and mother."
+
+The child's mouth trembled a little, he reddened, but did not answer.
+
+"Your dear mother left you in my charge."
+
+Mr. Carey had no great ease in expressing himself. When the news came that
+his sister-in-law was dying, he set off at once for London, but on the way
+thought of nothing but the disturbance in his life that would be caused if
+her death forced him to undertake the care of her son. He was well over
+fifty, and his wife, to whom he had been married for thirty years, was
+childless; he did not look forward with any pleasure to the presence of a
+small boy who might be noisy and rough. He had never much liked his
+sister-in-law.
+
+"I'm going to take you down to Blackstable tomorrow," he said.
+
+"With Emma?"
+
+The child put his hand in hers, and she pressed it.
+
+"I'm afraid Emma must go away," said Mr. Carey.
+
+"But I want Emma to come with me."
+
+Philip began to cry, and the nurse could not help crying too. Mr. Carey
+looked at them helplessly.
+
+"I think you'd better leave me alone with Master Philip for a moment."
+
+"Very good, sir."
+
+Though Philip clung to her, she released herself gently. Mr. Carey took
+the boy on his knee and put his arm round him.
+
+"You mustn't cry," he said. "You're too old to have a nurse now. We must
+see about sending you to school."
+
+"I want Emma to come with me," the child repeated.
+
+"It costs too much money, Philip. Your father didn't leave very much, and
+I don't know what's become of it. You must look at every penny you spend."
+
+Mr. Carey had called the day before on the family solicitor. Philip's
+father was a surgeon in good practice, and his hospital appointments
+suggested an established position; so that it was a surprise on his sudden
+death from blood-poisoning to find that he had left his widow little more
+than his life insurance and what could be got for the lease of their house
+in Bruton Street. This was six months ago; and Mrs. Carey, already in
+delicate health, finding herself with child, had lost her head and
+accepted for the lease the first offer that was made. She stored her
+furniture, and, at a rent which the parson thought outrageous, took a
+furnished house for a year, so that she might suffer from no inconvenience
+till her child was born. But she had never been used to the management of
+money, and was unable to adapt her expenditure to her altered
+circumstances. The little she had slipped through her fingers in one way
+and another, so that now, when all expenses were paid, not much more than
+two thousand pounds remained to support the boy till he was able to earn
+his own living. It was impossible to explain all this to Philip and he was
+sobbing still.
+
+"You'd better go to Emma," Mr. Carey said, feeling that she could console
+the child better than anyone.
+
+Without a word Philip slipped off his uncle's knee, but Mr. Carey stopped
+him.
+
+"We must go tomorrow, because on Saturday I've got to prepare my sermon,
+and you must tell Emma to get your things ready today. You can bring all
+your toys. And if you want anything to remember your father and mother by
+you can take one thing for each of them. Everything else is going to be
+sold."
+
+The boy slipped out of the room. Mr. Carey was unused to work, and he
+turned to his correspondence with resentment. On one side of the desk was
+a bundle of bills, and these filled him with irritation. One especially
+seemed preposterous. Immediately after Mrs. Carey's death Emma had ordered
+from the florist masses of white flowers for the room in which the dead
+woman lay. It was sheer waste of money. Emma took far too much upon
+herself. Even if there had been no financial necessity, he would have
+dismissed her.
+
+But Philip went to her, and hid his face in her bosom, and wept as though
+his heart would break. And she, feeling that he was almost her own
+son--she had taken him when he was a month old--consoled him with soft
+words. She promised that she would come and see him sometimes, and that
+she would never forget him; and she told him about the country he was
+going to and about her own home in Devonshire--her father kept a turnpike
+on the high-road that led to Exeter, and there were pigs in the sty, and
+there was a cow, and the cow had just had a calf--till Philip forgot his
+tears and grew excited at the thought of his approaching journey.
+Presently she put him down, for there was much to be done, and he helped
+her to lay out his clothes on the bed. She sent him into the nursery to
+gather up his toys, and in a little while he was playing happily.
+
+But at last he grew tired of being alone and went back to the bed-room, in
+which Emma was now putting his things into a big tin box; he remembered
+then that his uncle had said he might take something to remember his
+father and mother by. He told Emma and asked her what he should take.
+
+"You'd better go into the drawing-room and see what you fancy."
+
+"Uncle William's there."
+
+"Never mind that. They're your own things now."
+
+Philip went downstairs slowly and found the door open. Mr. Carey had left
+the room. Philip walked slowly round. They had been in the house so short
+a time that there was little in it that had a particular interest to him.
+It was a stranger's room, and Philip saw nothing that struck his fancy.
+But he knew which were his mother's things and which belonged to the
+landlord, and presently fixed on a little clock that he had once heard his
+mother say she liked. With this he walked again rather disconsolately
+upstairs. Outside the door of his mother's bed-room he stopped and
+listened. Though no one had told him not to go in, he had a feeling that
+it would be wrong to do so; he was a little frightened, and his heart beat
+uncomfortably; but at the same time something impelled him to turn the
+handle. He turned it very gently, as if to prevent anyone within from
+hearing, and then slowly pushed the door open. He stood on the threshold
+for a moment before he had the courage to enter. He was not frightened
+now, but it seemed strange. He closed the door behind him. The blinds were
+drawn, and the room, in the cold light of a January afternoon, was dark.
+On the dressing-table were Mrs. Carey's brushes and the hand mirror. In a
+little tray were hairpins. There was a photograph of himself on the
+chimney-piece and one of his father. He had often been in the room when
+his mother was not in it, but now it seemed different. There was something
+curious in the look of the chairs. The bed was made as though someone were
+going to sleep in it that night, and in a case on the pillow was a
+night-dress.
+
+Philip opened a large cupboard filled with dresses and, stepping in, took
+as many of them as he could in his arms and buried his face in them. They
+smelt of the scent his mother used. Then he pulled open the drawers,
+filled with his mother's things, and looked at them: there were lavender
+bags among the linen, and their scent was fresh and pleasant. The
+strangeness of the room left it, and it seemed to him that his mother had
+just gone out for a walk. She would be in presently and would come
+upstairs to have nursery tea with him. And he seemed to feel her kiss on
+his lips.
+
+It was not true that he would never see her again. It was not true simply
+because it was impossible. He climbed up on the bed and put his head on
+the pillow. He lay there quite still.
+
+
+
+IV
+
+
+Philip parted from Emma with tears, but the journey to Blackstable amused
+him, and, when they arrived, he was resigned and cheerful. Blackstable was
+sixty miles from London. Giving their luggage to a porter, Mr. Carey set
+out to walk with Philip to the vicarage; it took them little more than
+five minutes, and, when they reached it, Philip suddenly remembered the
+gate. It was red and five-barred: it swung both ways on easy hinges; and
+it was possible, though forbidden, to swing backwards and forwards on it.
+They walked through the garden to the front-door. This was only used by
+visitors and on Sundays, and on special occasions, as when the Vicar went
+up to London or came back. The traffic of the house took place through a
+side-door, and there was a back door as well for the gardener and for
+beggars and tramps. It was a fairly large house of yellow brick, with a
+red roof, built about five and twenty years before in an ecclesiastical
+style. The front-door was like a church porch, and the drawing-room
+windows were gothic.
+
+Mrs. Carey, knowing by what train they were coming, waited in the
+drawing-room and listened for the click of the gate. When she heard it she
+went to the door.
+
+"There's Aunt Louisa," said Mr. Carey, when he saw her. "Run and give her
+a kiss."
+
+Philip started to run, awkwardly, trailing his club-foot, and then
+stopped. Mrs. Carey was a little, shrivelled woman of the same age as her
+husband, with a face extraordinarily filled with deep wrinkles, and pale
+blue eyes. Her gray hair was arranged in ringlets according to the fashion
+of her youth. She wore a black dress, and her only ornament was a gold
+chain, from which hung a cross. She had a shy manner and a gentle voice.
+
+"Did you walk, William?" she said, almost reproachfully, as she kissed her
+husband.
+
+"I didn't think of it," he answered, with a glance at his nephew.
+
+"It didn't hurt you to walk, Philip, did it?" she asked the child.
+
+"No. I always walk."
+
+He was a little surprised at their conversation. Aunt Louisa told him to
+come in, and they entered the hall. It was paved with red and yellow
+tiles, on which alternately were a Greek Cross and the Lamb of God. An
+imposing staircase led out of the hall. It was of polished pine, with a
+peculiar smell, and had been put in because fortunately, when the church
+was reseated, enough wood remained over. The balusters were decorated with
+emblems of the Four Evangelists.
+
+"I've had the stove lighted as I thought you'd be cold after your
+journey," said Mrs. Carey.
+
+It was a large black stove that stood in the hall and was only lighted if
+the weather was very bad and the Vicar had a cold. It was not lighted if
+Mrs. Carey had a cold. Coal was expensive. Besides, Mary Ann, the maid,
+didn't like fires all over the place. If they wanted all them fires they
+must keep a second girl. In the winter Mr. and Mrs. Carey lived in the
+dining-room so that one fire should do, and in the summer they could not
+get out of the habit, so the drawing-room was used only by Mr. Carey on
+Sunday afternoons for his nap. But every Saturday he had a fire in the
+study so that he could write his sermon.
+
+Aunt Louisa took Philip upstairs and showed him into a tiny bed-room that
+looked out on the drive. Immediately in front of the window was a large
+tree, which Philip remembered now because the branches were so low that it
+was possible to climb quite high up it.
+
+"A small room for a small boy," said Mrs. Carey. "You won't be frightened
+at sleeping alone?"
+
+"Oh, no."
+
+On his first visit to the vicarage he had come with his nurse, and Mrs.
+Carey had had little to do with him. She looked at him now with some
+uncertainty.
+
+"Can you wash your own hands, or shall I wash them for you?"
+
+"I can wash myself," he answered firmly.
+
+"Well, I shall look at them when you come down to tea," said Mrs. Carey.
+
+She knew nothing about children. After it was settled that Philip should
+come down to Blackstable, Mrs. Carey had thought much how she should treat
+him; she was anxious to do her duty; but now he was there she found
+herself just as shy of him as he was of her. She hoped he would not be
+noisy and rough, because her husband did not like rough and noisy boys.
+Mrs. Carey made an excuse to leave Philip alone, but in a moment came back
+and knocked at the door; she asked him, without coming in, if he could
+pour out the water himself. Then she went downstairs and rang the bell for
+tea.
+
+The dining-room, large and well-proportioned, had windows on two sides of
+it, with heavy curtains of red rep; there was a big table in the middle;
+and at one end an imposing mahogany sideboard with a looking-glass in it.
+In one corner stood a harmonium. On each side of the fireplace were chairs
+covered in stamped leather, each with an antimacassar; one had arms and
+was called the husband, and the other had none and was called the wife.
+Mrs. Carey never sat in the arm-chair: she said she preferred a chair that
+was not too comfortable; there was always a lot to do, and if her chair
+had had arms she might not be so ready to leave it.
+
+Mr. Carey was making up the fire when Philip came in, and he pointed out
+to his nephew that there were two pokers. One was large and bright and
+polished and unused, and was called the Vicar; and the other, which was
+much smaller and had evidently passed through many fires, was called the
+Curate.
+
+"What are we waiting for?" said Mr. Carey.
+
+"I told Mary Ann to make you an egg. I thought you'd be hungry after your
+journey."
+
+Mrs. Carey thought the journey from London to Blackstable very tiring. She
+seldom travelled herself, for the living was only three hundred a year,
+and, when her husband wanted a holiday, since there was not money for two,
+he went by himself. He was very fond of Church Congresses and usually
+managed to go up to London once a year; and once he had been to Paris for
+the exhibition, and two or three times to Switzerland. Mary Ann brought in
+the egg, and they sat down. The chair was much too low for Philip, and for
+a moment neither Mr. Carey nor his wife knew what to do.
+
+"I'll put some books under him," said Mary Ann.
+
+She took from the top of the harmonium the large Bible and the prayer-book
+from which the Vicar was accustomed to read prayers, and put them on
+Philip's chair.
+
+"Oh, William, he can't sit on the Bible," said Mrs. Carey, in a shocked
+tone. "Couldn't you get him some books out of the study?"
+
+Mr. Carey considered the question for an instant.
+
+"I don't think it matters this once if you put the prayer-book on the top,
+Mary Ann," he said. "The book of Common Prayer is the composition of men
+like ourselves. It has no claim to divine authorship."
+
+"I hadn't thought of that, William," said Aunt Louisa.
+
+Philip perched himself on the books, and the Vicar, having said grace, cut
+the top off his egg.
+
+"There," he said, handing it to Philip, "you can eat my top if you like."
+
+Philip would have liked an egg to himself, but he was not offered one, so
+took what he could.
+
+"How have the chickens been laying since I went away?" asked the Vicar.
+
+"Oh, they've been dreadful, only one or two a day."
+
+"How did you like that top, Philip?" asked his uncle.
+
+"Very much, thank you."
+
+"You shall have another one on Sunday afternoon."
+
+Mr. Carey always had a boiled egg at tea on Sunday, so that he might be
+fortified for the evening service.
+
+
+
+V
+
+
+Philip came gradually to know the people he was to live with, and by
+fragments of conversation, some of it not meant for his ears, learned a
+good deal both about himself and about his dead parents. Philip's father
+had been much younger than the Vicar of Blackstable. After a brilliant
+career at St. Luke's Hospital he was put on the staff, and presently began
+to earn money in considerable sums. He spent it freely. When the parson
+set about restoring his church and asked his brother for a subscription,
+he was surprised by receiving a couple of hundred pounds: Mr. Carey,
+thrifty by inclination and economical by necessity, accepted it with
+mingled feelings; he was envious of his brother because he could afford to
+give so much, pleased for the sake of his church, and vaguely irritated by
+a generosity which seemed almost ostentatious. Then Henry Carey married a
+patient, a beautiful girl but penniless, an orphan with no near relations,
+but of good family; and there was an array of fine friends at the wedding.
+The parson, on his visits to her when he came to London, held himself with
+reserve. He felt shy with her and in his heart he resented her great
+beauty: she dressed more magnificently than became the wife of a
+hardworking surgeon; and the charming furniture of her house, the flowers
+among which she lived even in winter, suggested an extravagance which he
+deplored. He heard her talk of entertainments she was going to; and, as he
+told his wife on getting home again, it was impossible to accept
+hospitality without making some return. He had seen grapes in the
+dining-room that must have cost at least eight shillings a pound; and at
+luncheon he had been given asparagus two months before it was ready in the
+vicarage garden. Now all he had anticipated was come to pass: the Vicar
+felt the satisfaction of the prophet who saw fire and brimstone consume
+the city which would not mend its way to his warning. Poor Philip was
+practically penniless, and what was the good of his mother's fine friends
+now? He heard that his father's extravagance was really criminal, and it
+was a mercy that Providence had seen fit to take his dear mother to
+itself: she had no more idea of money than a child.
+
+When Philip had been a week at Blackstable an incident happened which
+seemed to irritate his uncle very much. One morning he found on the
+breakfast table a small packet which had been sent on by post from the
+late Mrs. Carey's house in London. It was addressed to her. When the
+parson opened it he found a dozen photographs of Mrs. Carey. They showed
+the head and shoulders only, and her hair was more plainly done than
+usual, low on the forehead, which gave her an unusual look; the face was
+thin and worn, but no illness could impair the beauty of her features.
+There was in the large dark eyes a sadness which Philip did not remember.
+The first sight of the dead woman gave Mr. Carey a little shock, but this
+was quickly followed by perplexity. The photographs seemed quite recent,
+and he could not imagine who had ordered them.
+
+"D'you know anything about these, Philip?" he asked.
+
+"I remember mamma said she'd been taken," he answered. "Miss Watkin
+scolded her.... She said: I wanted the boy to have something to remember
+me by when he grows up."
+
+Mr. Carey looked at Philip for an instant. The child spoke in a clear
+treble. He recalled the words, but they meant nothing to him.
+
+"You'd better take one of the photographs and keep it in your room," said
+Mr. Carey. "I'll put the others away."
+
+He sent one to Miss Watkin, and she wrote and explained how they came to
+be taken.
+
+One day Mrs. Carey was lying in bed, but she was feeling a little better
+than usual, and the doctor in the morning had seemed hopeful; Emma had
+taken the child out, and the maids were downstairs in the basement:
+suddenly Mrs. Carey felt desperately alone in the world. A great fear
+seized her that she would not recover from the confinement which she was
+expecting in a fortnight. Her son was nine years old. How could he be
+expected to remember her? She could not bear to think that he would grow
+up and forget, forget her utterly; and she had loved him so passionately,
+because he was weakly and deformed, and because he was her child. She had
+no photographs of herself taken since her marriage, and that was ten years
+before. She wanted her son to know what she looked like at the end. He
+could not forget her then, not forget utterly. She knew that if she called
+her maid and told her she wanted to get up, the maid would prevent her,
+and perhaps send for the doctor, and she had not the strength now to
+struggle or argue. She got out of bed and began to dress herself. She had
+been on her back so long that her legs gave way beneath her, and then the
+soles of her feet tingled so that she could hardly bear to put them to the
+ground. But she went on. She was unused to doing her own hair and, when
+she raised her arms and began to brush it, she felt faint. She could never
+do it as her maid did. It was beautiful hair, very fine, and of a deep
+rich gold. Her eyebrows were straight and dark. She put on a black skirt,
+but chose the bodice of the evening dress which she liked best: it was of
+a white damask which was fashionable in those days. She looked at herself
+in the glass. Her face was very pale, but her skin was clear: she had
+never had much colour, and this had always made the redness of her
+beautiful mouth emphatic. She could not restrain a sob. But she could not
+afford to be sorry for herself; she was feeling already desperately tired;
+and she put on the furs which Henry had given her the Christmas
+before--she had been so proud of them and so happy then--and slipped
+downstairs with beating heart. She got safely out of the house and drove
+to a photographer. She paid for a dozen photographs. She was obliged to
+ask for a glass of water in the middle of the sitting; and the assistant,
+seeing she was ill, suggested that she should come another day, but she
+insisted on staying till the end. At last it was finished, and she drove
+back again to the dingy little house in Kensington which she hated with
+all her heart. It was a horrible house to die in.
+
+She found the front door open, and when she drove up the maid and Emma ran
+down the steps to help her. They had been frightened when they found her
+room empty. At first they thought she must have gone to Miss Watkin, and
+the cook was sent round. Miss Watkin came back with her and was waiting
+anxiously in the drawing-room. She came downstairs now full of anxiety and
+reproaches; but the exertion had been more than Mrs. Carey was fit for,
+and when the occasion for firmness no longer existed she gave way. She
+fell heavily into Emma's arms and was carried upstairs. She remained
+unconscious for a time that seemed incredibly long to those that watched
+her, and the doctor, hurriedly sent for, did not come. It was next day,
+when she was a little better, that Miss Watkin got some explanation out of
+her. Philip was playing on the floor of his mother's bed-room, and neither
+of the ladies paid attention to him. He only understood vaguely what they
+were talking about, and he could not have said why those words remained in
+his memory.
+
+"I wanted the boy to have something to remember me by when he grows up."
+
+"I can't make out why she ordered a dozen," said Mr. Carey. "Two would
+have done."
+
+
+
+VI
+
+
+One day was very like another at the vicarage.
+
+Soon after breakfast Mary Ann brought in The Times. Mr. Carey shared it
+with two neighbours. He had it from ten till one, when the gardener took
+it over to Mr. Ellis at the Limes, with whom it remained till seven; then
+it was taken to Miss Brooks at the Manor House, who, since she got it
+late, had the advantage of keeping it. In summer Mrs. Carey, when she was
+making jam, often asked her for a copy to cover the pots with. When the
+Vicar settled down to his paper his wife put on her bonnet and went out to
+do the shopping. Philip accompanied her. Blackstable was a fishing
+village. It consisted of a high street in which were the shops, the bank,
+the doctor's house, and the houses of two or three coalship owners; round
+the little harbour were shabby streets in which lived fishermen and poor
+people; but since they went to chapel they were of no account. When Mrs.
+Carey passed the dissenting ministers in the street she stepped over to
+the other side to avoid meeting them, but if there was not time for this
+fixed her eyes on the pavement. It was a scandal to which the Vicar had
+never resigned himself that there were three chapels in the High Street:
+he could not help feeling that the law should have stepped in to prevent
+their erection. Shopping in Blackstable was not a simple matter; for
+dissent, helped by the fact that the parish church was two miles from the
+town, was very common; and it was necessary to deal only with churchgoers;
+Mrs. Carey knew perfectly that the vicarage custom might make all the
+difference to a tradesman's faith. There were two butchers who went to
+church, and they would not understand that the Vicar could not deal with
+both of them at once; nor were they satisfied with his simple plan of
+going for six months to one and for six months to the other. The butcher
+who was not sending meat to the vicarage constantly threatened not to come
+to church, and the Vicar was sometimes obliged to make a threat: it was
+very wrong of him not to come to church, but if he carried iniquity
+further and actually went to chapel, then of course, excellent as his meat
+was, Mr. Carey would be forced to leave him for ever. Mrs. Carey often
+stopped at the bank to deliver a message to Josiah Graves, the manager,
+who was choir-master, treasurer, and churchwarden. He was a tall, thin man
+with a sallow face and a long nose; his hair was very white, and to Philip
+he seemed extremely old. He kept the parish accounts, arranged the treats
+for the choir and the schools; though there was no organ in the parish
+church, it was generally considered (in Blackstable) that the choir he led
+was the best in Kent; and when there was any ceremony, such as a visit
+from the Bishop for confirmation or from the Rural Dean to preach at the
+Harvest Thanksgiving, he made the necessary preparations. But he had no
+hesitation in doing all manner of things without more than a perfunctory
+consultation with the Vicar, and the Vicar, though always ready to be
+saved trouble, much resented the churchwarden's managing ways. He really
+seemed to look upon himself as the most important person in the parish.
+Mr. Carey constantly told his wife that if Josiah Graves did not take care
+he would give him a good rap over the knuckles one day; but Mrs. Carey
+advised him to bear with Josiah Graves: he meant well, and it was not his
+fault if he was not quite a gentleman. The Vicar, finding his comfort in
+the practice of a Christian virtue, exercised forbearance; but he revenged
+himself by calling the churchwarden Bismarck behind his back.
+
+Once there had been a serious quarrel between the pair, and Mrs. Carey
+still thought of that anxious time with dismay. The Conservative candidate
+had announced his intention of addressing a meeting at Blackstable; and
+Josiah Graves, having arranged that it should take place in the Mission
+Hall, went to Mr. Carey and told him that he hoped he would say a few
+words. It appeared that the candidate had asked Josiah Graves to take the
+chair. This was more than Mr. Carey could put up with. He had firm views
+upon the respect which was due to the cloth, and it was ridiculous for a
+churchwarden to take the chair at a meeting when the Vicar was there. He
+reminded Josiah Graves that parson meant person, that is, the vicar was
+the person of the parish. Josiah Graves answered that he was the first to
+recognise the dignity of the church, but this was a matter of politics,
+and in his turn he reminded the Vicar that their Blessed Saviour had
+enjoined upon them to render unto Caesar the things that were Caesar's. To
+this Mr. Carey replied that the devil could quote scripture to his
+purpose, himself had sole authority over the Mission Hall, and if he were
+not asked to be chairman he would refuse the use of it for a political
+meeting. Josiah Graves told Mr. Carey that he might do as he chose, and
+for his part he thought the Wesleyan Chapel would be an equally suitable
+place. Then Mr. Carey said that if Josiah Graves set foot in what was
+little better than a heathen temple he was not fit to be churchwarden in
+a Christian parish. Josiah Graves thereupon resigned all his offices, and
+that very evening sent to the church for his cassock and surplice. His
+sister, Miss Graves, who kept house for him, gave up her secretaryship of
+the Maternity Club, which provided the pregnant poor with flannel, baby
+linen, coals, and five shillings. Mr. Carey said he was at last master in
+his own house. But soon he found that he was obliged to see to all sorts
+of things that he knew nothing about; and Josiah Graves, after the first
+moment of irritation, discovered that he had lost his chief interest in
+life. Mrs. Carey and Miss Graves were much distressed by the quarrel; they
+met after a discreet exchange of letters, and made up their minds to put
+the matter right: they talked, one to her husband, the other to her
+brother, from morning till night; and since they were persuading these
+gentlemen to do what in their hearts they wanted, after three weeks of
+anxiety a reconciliation was effected. It was to both their interests, but
+they ascribed it to a common love for their Redeemer. The meeting was held
+at the Mission Hall, and the doctor was asked to be chairman. Mr. Carey
+and Josiah Graves both made speeches.
+
+When Mrs. Carey had finished her business with the banker, she generally
+went upstairs to have a little chat with his sister; and while the ladies
+talked of parish matters, the curate or the new bonnet of Mrs. Wilson--Mr.
+Wilson was the richest man in Blackstable, he was thought to have at least
+five hundred a year, and he had married his cook--Philip sat demurely in
+the stiff parlour, used only to receive visitors, and busied himself with
+the restless movements of goldfish in a bowl. The windows were never
+opened except to air the room for a few minutes in the morning, and it had
+a stuffy smell which seemed to Philip to have a mysterious connection with
+banking.
+
+Then Mrs. Carey remembered that she had to go to the grocer, and they
+continued their way. When the shopping was done they often went down a
+side street of little houses, mostly of wood, in which fishermen dwelt
+(and here and there a fisherman sat on his doorstep mending his nets, and
+nets hung to dry upon the doors), till they came to a small beach, shut in
+on each side by warehouses, but with a view of the sea. Mrs. Carey stood
+for a few minutes and looked at it, it was turbid and yellow, [and who
+knows what thoughts passed through her mind?] while Philip searched for
+flat stones to play ducks and drakes. Then they walked slowly back. They
+looked into the post office to get the right time, nodded to Mrs. Wigram
+the doctor's wife, who sat at her window sewing, and so got home.
+
+Dinner was at one o'clock; and on Monday, Tuesday, and Wednesday it
+consisted of beef, roast, hashed, and minced, and on Thursday, Friday, and
+Saturday of mutton. On Sunday they ate one of their own chickens. In the
+afternoon Philip did his lessons. He was taught Latin and mathematics by
+his uncle who knew neither, and French and the piano by his aunt. Of
+French she was ignorant, but she knew the piano well enough to accompany
+the old-fashioned songs she had sung for thirty years. Uncle William used
+to tell Philip that when he was a curate his wife had known twelve songs
+by heart, which she could sing at a moment's notice whenever she was
+asked. She often sang still when there was a tea-party at the vicarage.
+There were few people whom the Careys cared to ask there, and their
+parties consisted always of the curate, Josiah Graves with his sister, Dr.
+Wigram and his wife. After tea Miss Graves played one or two of
+Mendelssohn's "Songs without Words," and Mrs. Carey sang "When the
+Swallows Homeward Fly," or "Trot, Trot, My Pony."
+
+But the Careys did not give tea-parties often; the preparations upset
+them, and when their guests were gone they felt themselves exhausted. They
+preferred to have tea by themselves, and after tea they played backgammon.
+Mrs. Carey arranged that her husband should win, because he did not like
+losing. They had cold supper at eight. It was a scrappy meal because Mary
+Ann resented getting anything ready after tea, and Mrs. Carey helped to
+clear away. Mrs. Carey seldom ate more than bread and butter, with a
+little stewed fruit to follow, but the Vicar had a slice of cold meat.
+Immediately after supper Mrs. Carey rang the bell for prayers, and then
+Philip went to bed. He rebelled against being undressed by Mary Ann and
+after a while succeeded in establishing his right to dress and undress
+himself. At nine o'clock Mary Ann brought in the eggs and the plate. Mrs.
+Carey wrote the date on each egg and put the number down in a book. She
+then took the plate-basket on her arm and went upstairs. Mr. Carey
+continued to read one of his old books, but as the clock struck ten he got
+up, put out the lamps, and followed his wife to bed.
+
+When Philip arrived there was some difficulty in deciding on which evening
+he should have his bath. It was never easy to get plenty of hot water,
+since the kitchen boiler did not work, and it was impossible for two
+persons to have a bath on the same day. The only man who had a bathroom in
+Blackstable was Mr. Wilson, and it was thought ostentatious of him. Mary
+Ann had her bath in the kitchen on Monday night, because she liked to
+begin the week clean. Uncle William could not have his on Saturday,
+because he had a heavy day before him and he was always a little tired
+after a bath, so he had it on Friday. Mrs. Carey had hers on Thursday for
+the same reason. It looked as though Saturday were naturally indicated for
+Philip, but Mary Ann said she couldn't keep the fire up on Saturday night:
+what with all the cooking on Sunday, having to make pastry and she didn't
+know what all, she did not feel up to giving the boy his bath on Saturday
+night; and it was quite clear that he could not bath himself. Mrs. Carey
+was shy about bathing a boy, and of course the Vicar had his sermon. But
+the Vicar insisted that Philip should be clean and sweet for the Lord's
+Day. Mary Ann said she would rather go than be put upon--and after
+eighteen years she didn't expect to have more work given her, and they
+might show some consideration--and Philip said he didn't want anyone to
+bath him, but could very well bath himself. This settled it. Mary Ann said
+she was quite sure he wouldn't bath himself properly, and rather than he
+should go dirty--and not because he was going into the presence of the
+Lord, but because she couldn't abide a boy who wasn't properly
+washed--she'd work herself to the bone even if it was Saturday night.
+
+
+
+VII
+
+
+Sunday was a day crowded with incident. Mr. Carey was accustomed to say
+that he was the only man in his parish who worked seven days a week.
+
+The household got up half an hour earlier than usual. No lying abed for a
+poor parson on the day of rest, Mr. Carey remarked as Mary Ann knocked at
+the door punctually at eight. It took Mrs. Carey longer to dress, and she
+got down to breakfast at nine, a little breathless, only just before her
+husband. Mr. Carey's boots stood in front of the fire to warm. Prayers
+were longer than usual, and the breakfast more substantial. After
+breakfast the Vicar cut thin slices of bread for the communion, and Philip
+was privileged to cut off the crust. He was sent to the study to fetch a
+marble paperweight, with which Mr. Carey pressed the bread till it was
+thin and pulpy, and then it was cut into small squares. The amount was
+regulated by the weather. On a very bad day few people came to church, and
+on a very fine one, though many came, few stayed for communion. There were
+most when it was dry enough to make the walk to church pleasant, but not
+so fine that people wanted to hurry away.
+
+Then Mrs. Carey brought the communion plate out of the safe, which stood
+in the pantry, and the Vicar polished it with a chamois leather. At ten
+the fly drove up, and Mr. Carey got into his boots. Mrs. Carey took
+several minutes to put on her bonnet, during which the Vicar, in a
+voluminous cloak, stood in the hall with just such an expression on his
+face as would have become an early Christian about to be led into the
+arena. It was extraordinary that after thirty years of marriage his wife
+could not be ready in time on Sunday morning. At last she came, in black
+satin; the Vicar did not like colours in a clergyman's wife at any time,
+but on Sundays he was determined that she should wear black; now and then,
+in conspiracy with Miss Graves, she ventured a white feather or a pink
+rose in her bonnet, but the Vicar insisted that it should disappear; he
+said he would not go to church with the scarlet woman: Mrs. Carey sighed
+as a woman but obeyed as a wife. They were about to step into the carriage
+when the Vicar remembered that no one had given him his egg. They knew
+that he must have an egg for his voice, there were two women in the house,
+and no one had the least regard for his comfort. Mrs. Carey scolded Mary
+Ann, and Mary Ann answered that she could not think of everything. She
+hurried away to fetch an egg, and Mrs. Carey beat it up in a glass of
+sherry. The Vicar swallowed it at a gulp. The communion plate was stowed
+in the carriage, and they set off.
+
+The fly came from The Red Lion and had a peculiar smell of stale straw.
+They drove with both windows closed so that the Vicar should not catch
+cold. The sexton was waiting at the porch to take the communion plate, and
+while the Vicar went to the vestry Mrs. Carey and Philip settled
+themselves in the vicarage pew. Mrs. Carey placed in front of her the
+sixpenny bit she was accustomed to put in the plate, and gave Philip
+threepence for the same purpose. The church filled up gradually and the
+service began.
+
+Philip grew bored during the sermon, but if he fidgeted Mrs. Carey put a
+gentle hand on his arm and looked at him reproachfully. He regained
+interest when the final hymn was sung and Mr. Graves passed round with the
+plate.
+
+When everyone had gone Mrs. Carey went into Miss Graves' pew to have a few
+words with her while they were waiting for the gentlemen, and Philip went
+to the vestry. His uncle, the curate, and Mr. Graves were still in their
+surplices. Mr. Carey gave him the remains of the consecrated bread and
+told him he might eat it. He had been accustomed to eat it himself, as it
+seemed blasphemous to throw it away, but Philip's keen appetite relieved
+him from the duty. Then they counted the money. It consisted of pennies,
+sixpences and threepenny bits. There were always two single shillings, one
+put in the plate by the Vicar and the other by Mr. Graves; and sometimes
+there was a florin. Mr. Graves told the Vicar who had given this. It was
+always a stranger to Blackstable, and Mr. Carey wondered who he was. But
+Miss Graves had observed the rash act and was able to tell Mrs. Carey that
+the stranger came from London, was married and had children. During the
+drive home Mrs. Carey passed the information on, and the Vicar made up his
+mind to call on him and ask for a subscription to the Additional Curates
+Society. Mr. Carey asked if Philip had behaved properly; and Mrs. Carey
+remarked that Mrs. Wigram had a new mantle, Mr. Cox was not in church, and
+somebody thought that Miss Phillips was engaged. When they reached the
+vicarage they all felt that they deserved a substantial dinner.
+
+When this was over Mrs. Carey went to her room to rest, and Mr. Carey lay
+down on the sofa in the drawing-room for forty winks.
+
+They had tea at five, and the Vicar ate an egg to support himself for
+evensong. Mrs. Carey did not go to this so that Mary Ann might, but she
+read the service through and the hymns. Mr. Carey walked to church in the
+evening, and Philip limped along by his side. The walk through the
+darkness along the country road strangely impressed him, and the church
+with all its lights in the distance, coming gradually nearer, seemed very
+friendly. At first he was shy with his uncle, but little by little grew
+used to him, and he would slip his hand in his uncle's and walk more
+easily for the feeling of protection.
+
+They had supper when they got home. Mr. Carey's slippers were waiting for
+him on a footstool in front of the fire and by their side Philip's, one
+the shoe of a small boy, the other misshapen and odd. He was dreadfully
+tired when he went up to bed, and he did not resist when Mary Ann
+undressed him. She kissed him after she tucked him up, and he began to
+love her.
+
+
+
+VIII
+
+
+Philip had led always the solitary life of an only child, and his
+loneliness at the vicarage was no greater than it had been when his mother
+lived. He made friends with Mary Ann. She was a chubby little person of
+thirty-five, the daughter of a fisherman, and had come to the vicarage at
+eighteen; it was her first place and she had no intention of leaving it;
+but she held a possible marriage as a rod over the timid heads of her
+master and mistress. Her father and mother lived in a little house off
+Harbour Street, and she went to see them on her evenings out. Her stories
+of the sea touched Philip's imagination, and the narrow alleys round the
+harbour grew rich with the romance which his young fancy lent them. One
+evening he asked whether he might go home with her; but his aunt was
+afraid that he might catch something, and his uncle said that evil
+communications corrupted good manners. He disliked the fisher folk, who
+were rough, uncouth, and went to chapel. But Philip was more comfortable
+in the kitchen than in the dining-room, and, whenever he could, he took
+his toys and played there. His aunt was not sorry. She did not like
+disorder, and though she recognised that boys must be expected to be
+untidy she preferred that he should make a mess in the kitchen. If he
+fidgeted his uncle was apt to grow restless and say it was high time he
+went to school. Mrs. Carey thought Philip very young for this, and her
+heart went out to the motherless child; but her attempts to gain his
+affection were awkward, and the boy, feeling shy, received her
+demonstrations with so much sullenness that she was mortified. Sometimes
+she heard his shrill voice raised in laughter in the kitchen, but when she
+went in, he grew suddenly silent, and he flushed darkly when Mary Ann
+explained the joke. Mrs. Carey could not see anything amusing in what she
+heard, and she smiled with constraint.
+
+"He seems happier with Mary Ann than with us, William," she said, when she
+returned to her sewing.
+
+"One can see he's been very badly brought up. He wants licking into
+shape."
+
+On the second Sunday after Philip arrived an unlucky incident occurred.
+Mr. Carey had retired as usual after dinner for a little snooze in the
+drawing-room, but he was in an irritable mood and could not sleep. Josiah
+Graves that morning had objected strongly to some candlesticks with which
+the Vicar had adorned the altar. He had bought them second-hand in
+Tercanbury, and he thought they looked very well. But Josiah Graves said
+they were popish. This was a taunt that always aroused the Vicar. He had
+been at Oxford during the movement which ended in the secession from the
+Established Church of Edward Manning, and he felt a certain sympathy for
+the Church of Rome. He would willingly have made the service more ornate
+than had been usual in the low-church parish of Blackstable, and in his
+secret soul he yearned for processions and lighted candles. He drew the
+line at incense. He hated the word protestant. He called himself a
+Catholic. He was accustomed to say that Papists required an epithet, they
+were Roman Catholic; but the Church of England was Catholic in the best,
+the fullest, and the noblest sense of the term. He was pleased to think
+that his shaven face gave him the look of a priest, and in his youth he
+had possessed an ascetic air which added to the impression. He often
+related that on one of his holidays in Boulogne, one of those holidays
+upon which his wife for economy's sake did not accompany him, when he was
+sitting in a church, the curé had come up to him and invited him to
+preach a sermon. He dismissed his curates when they married, having
+decided views on the celibacy of the unbeneficed clergy. But when at an
+election the Liberals had written on his garden fence in large blue
+letters: This way to Rome, he had been very angry, and threatened to
+prosecute the leaders of the Liberal party in Blackstable. He made up his
+mind now that nothing Josiah Graves said would induce him to remove the
+candlesticks from the altar, and he muttered Bismarck to himself once or
+twice irritably.
+
+Suddenly he heard an unexpected noise. He pulled the handkerchief off his
+face, got up from the sofa on which he was lying, and went into the
+dining-room. Philip was seated on the table with all his bricks around
+him. He had built a monstrous castle, and some defect in the foundation
+had just brought the structure down in noisy ruin.
+
+"What are you doing with those bricks, Philip? You know you're not allowed
+to play games on Sunday."
+
+Philip stared at him for a moment with frightened eyes, and, as his habit
+was, flushed deeply.
+
+"I always used to play at home," he answered.
+
+"I'm sure your dear mamma never allowed you to do such a wicked thing as
+that."
+
+Philip did not know it was wicked; but if it was, he did not wish it to be
+supposed that his mother had consented to it. He hung his head and did not
+answer.
+
+"Don't you know it's very, very wicked to play on Sunday? What d'you
+suppose it's called the day of rest for? You're going to church tonight,
+and how can you face your Maker when you've been breaking one of His laws
+in the afternoon?"
+
+Mr. Carey told him to put the bricks away at once, and stood over him
+while Philip did so.
+
+"You're a very naughty boy," he repeated. "Think of the grief you're
+causing your poor mother in heaven."
+
+Philip felt inclined to cry, but he had an instinctive disinclination to
+letting other people see his tears, and he clenched his teeth to prevent
+the sobs from escaping. Mr. Carey sat down in his arm-chair and began to
+turn over the pages of a book. Philip stood at the window. The vicarage
+was set back from the highroad to Tercanbury, and from the dining-room one
+saw a semicircular strip of lawn and then as far as the horizon green
+fields. Sheep were grazing in them. The sky was forlorn and gray. Philip
+felt infinitely unhappy.
+
+Presently Mary Ann came in to lay the tea, and Aunt Louisa descended the
+stairs.
+
+"Have you had a nice little nap, William?" she asked.
+
+"No," he answered. "Philip made so much noise that I couldn't sleep a
+wink."
+
+This was not quite accurate, for he had been kept awake by his own
+thoughts; and Philip, listening sullenly, reflected that he had only made
+a noise once, and there was no reason why his uncle should not have slept
+before or after. When Mrs. Carey asked for an explanation the Vicar
+narrated the facts.
+
+"He hasn't even said he was sorry," he finished.
+
+"Oh, Philip, I'm sure you're sorry," said Mrs. Carey, anxious that the
+child should not seem wickeder to his uncle than need be.
+
+Philip did not reply. He went on munching his bread and butter. He did not
+know what power it was in him that prevented him from making any
+expression of regret. He felt his ears tingling, he was a little inclined
+to cry, but no word would issue from his lips.
+
+"You needn't make it worse by sulking," said Mr. Carey.
+
+Tea was finished in silence. Mrs. Carey looked at Philip surreptitiously
+now and then, but the Vicar elaborately ignored him. When Philip saw his
+uncle go upstairs to get ready for church he went into the hall and got
+his hat and coat, but when the Vicar came downstairs and saw him, he said:
+
+"I don't wish you to go to church tonight, Philip. I don't think you're in
+a proper frame of mind to enter the House of God."
+
+Philip did not say a word. He felt it was a deep humiliation that was
+placed upon him, and his cheeks reddened. He stood silently watching his
+uncle put on his broad hat and his voluminous cloak. Mrs. Carey as usual
+went to the door to see him off. Then she turned to Philip.
+
+"Never mind, Philip, you won't be a naughty boy next Sunday, will you, and
+then your uncle will take you to church with him in the evening."
+
+She took off his hat and coat, and led him into the dining-room.
+
+"Shall you and I read the service together, Philip, and we'll sing the
+hymns at the harmonium. Would you like that?"
+
+Philip shook his head decidedly. Mrs. Carey was taken aback. If he would
+not read the evening service with her she did not know what to do with
+him.
+
+"Then what would you like to do until your uncle comes back?" she asked
+helplessly.
+
+Philip broke his silence at last.
+
+"I want to be left alone," he said.
+
+"Philip, how can you say anything so unkind? Don't you know that your
+uncle and I only want your good? Don't you love me at all?"
+
+"I hate you. I wish you was dead."
+
+Mrs. Carey gasped. He said the words so savagely that it gave her quite a
+start. She had nothing to say. She sat down in her husband's chair; and as
+she thought of her desire to love the friendless, crippled boy and her
+eager wish that he should love her--she was a barren woman and, even
+though it was clearly God's will that she should be childless, she could
+scarcely bear to look at little children sometimes, her heart ached
+so--the tears rose to her eyes and one by one, slowly, rolled down her
+cheeks. Philip watched her in amazement. She took out her handkerchief,
+and now she cried without restraint. Suddenly Philip realised that she was
+crying because of what he had said, and he was sorry. He went up to her
+silently and kissed her. It was the first kiss he had ever given her
+without being asked. And the poor lady, so small in her black satin,
+shrivelled up and sallow, with her funny corkscrew curls, took the little
+boy on her lap and put her arms around him and wept as though her heart
+would break. But her tears were partly tears of happiness, for she felt
+that the strangeness between them was gone. She loved him now with a new
+love because he had made her suffer.
+
+
+
+IX
+
+
+On the following Sunday, when the Vicar was making his preparations to go
+into the drawing-room for his nap--all the actions of his life were
+conducted with ceremony--and Mrs. Carey was about to go upstairs, Philip
+asked:
+
+"What shall I do if I'm not allowed to play?"
+
+"Can't you sit still for once and be quiet?"
+
+"I can't sit still till tea-time."
+
+Mr. Carey looked out of the window, but it was cold and raw, and he could
+not suggest that Philip should go into the garden.
+
+"I know what you can do. You can learn by heart the collect for the day."
+
+He took the prayer-book which was used for prayers from the harmonium, and
+turned the pages till he came to the place he wanted.
+
+"It's not a long one. If you can say it without a mistake when I come in
+to tea you shall have the top of my egg."
+
+Mrs. Carey drew up Philip's chair to the dining-room table--they had
+bought him a high chair by now--and placed the book in front of him.
+
+"The devil finds work for idle hands to do," said Mr. Carey.
+
+He put some more coals on the fire so that there should be a cheerful
+blaze when he came in to tea, and went into the drawing-room. He loosened
+his collar, arranged the cushions, and settled himself comfortably on the
+sofa. But thinking the drawing-room a little chilly, Mrs. Carey brought
+him a rug from the hall; she put it over his legs and tucked it round his
+feet. She drew the blinds so that the light should not offend his eyes,
+and since he had closed them already went out of the room on tiptoe. The
+Vicar was at peace with himself today, and in ten minutes he was asleep.
+He snored softly.
+
+It was the Sixth Sunday after Epiphany, and the collect began with the
+words: O God, whose blessed Son was manifested that he might destroy the
+works of the devil, and make us the sons of God, and heirs of Eternal
+life. Philip read it through. He could make no sense of it. He began
+saying the words aloud to himself, but many of them were unknown to him,
+and the construction of the sentence was strange. He could not get more
+than two lines in his head. And his attention was constantly wandering:
+there were fruit trees trained on the walls of the vicarage, and a long
+twig beat now and then against the windowpane; sheep grazed stolidly in
+the field beyond the garden. It seemed as though there were knots inside
+his brain. Then panic seized him that he would not know the words by
+tea-time, and he kept on whispering them to himself quickly; he did not
+try to understand, but merely to get them parrot-like into his memory.
+
+Mrs. Carey could not sleep that afternoon, and by four o'clock she was so
+wide awake that she came downstairs. She thought she would hear Philip his
+collect so that he should make no mistakes when he said it to his uncle.
+His uncle then would be pleased; he would see that the boy's heart was in
+the right place. But when Mrs. Carey came to the dining-room and was about
+to go in, she heard a sound that made her stop suddenly. Her heart gave a
+little jump. She turned away and quietly slipped out of the front-door.
+She walked round the house till she came to the dining-room window and
+then cautiously looked in. Philip was still sitting on the chair she had
+put him in, but his head was on the table buried in his arms, and he was
+sobbing desperately. She saw the convulsive movement of his shoulders.
+Mrs. Carey was frightened. A thing that had always struck her about the
+child was that he seemed so collected. She had never seen him cry. And now
+she realised that his calmness was some instinctive shame of showing his
+feelings: he hid himself to weep.
+
+Without thinking that her husband disliked being wakened suddenly, she
+burst into the drawing-room.
+
+"William, William," she said. "The boy's crying as though his heart would
+break."
+
+Mr. Carey sat up and disentangled himself from the rug about his legs.
+
+"What's he got to cry about?"
+
+"I don't know.... Oh, William, we can't let the boy be unhappy. D'you
+think it's our fault? If we'd had children we'd have known what to do."
+
+Mr. Carey looked at her in perplexity. He felt extraordinarily helpless.
+
+"He can't be crying because I gave him the collect to learn. It's not more
+than ten lines."
+
+"Don't you think I might take him some picture books to look at, William?
+There are some of the Holy Land. There couldn't be anything wrong in
+that."
+
+"Very well, I don't mind."
+
+Mrs. Carey went into the study. To collect books was Mr. Carey's only
+passion, and he never went into Tercanbury without spending an hour or two
+in the second-hand shop; he always brought back four or five musty
+volumes. He never read them, for he had long lost the habit of reading,
+but he liked to turn the pages, look at the illustrations if they were
+illustrated, and mend the bindings. He welcomed wet days because on them
+he could stay at home without pangs of conscience and spend the afternoon
+with white of egg and a glue-pot, patching up the Russia leather of some
+battered quarto. He had many volumes of old travels, with steel
+engravings, and Mrs. Carey quickly found two which described Palestine.
+She coughed elaborately at the door so that Philip should have time to
+compose himself, she felt that he would be humiliated if she came upon him
+in the midst of his tears, then she rattled the door handle. When she went
+in Philip was poring over the prayer-book, hiding his eyes with his hands
+so that she might not see he had been crying.
+
+"Do you know the collect yet?" she said.
+
+He did not answer for a moment, and she felt that he did not trust his
+voice. She was oddly embarrassed.
+
+"I can't learn it by heart," he said at last, with a gasp.
+
+"Oh, well, never mind," she said. "You needn't. I've got some picture
+books for you to look at. Come and sit on my lap, and we'll look at them
+together."
+
+Philip slipped off his chair and limped over to her. He looked down so
+that she should not see his eyes. She put her arms round him.
+
+"Look," she said, "that's the place where our blessed Lord was born."
+
+She showed him an Eastern town with flat roofs and cupolas and minarets.
+In the foreground was a group of palm-trees, and under them were resting
+two Arabs and some camels. Philip passed his hand over the picture as if
+he wanted to feel the houses and the loose habiliments of the nomads.
+
+"Read what it says," he asked.
+
+Mrs. Carey in her even voice read the opposite page. It was a romantic
+narrative of some Eastern traveller of the thirties, pompous maybe, but
+fragrant with the emotion with which the East came to the generation that
+followed Byron and Chateaubriand. In a moment or two Philip interrupted
+her.
+
+"I want to see another picture."
+
+When Mary Ann came in and Mrs. Carey rose to help her lay the cloth,
+Philip took the book in his hands and hurried through the illustrations.
+It was with difficulty that his aunt induced him to put the book down for
+tea. He had forgotten his horrible struggle to get the collect by heart;
+he had forgotten his tears. Next day it was raining, and he asked for the
+book again. Mrs. Carey gave it him joyfully. Talking over his future with
+her husband she had found that both desired him to take orders, and this
+eagerness for the book which described places hallowed by the presence of
+Jesus seemed a good sign. It looked as though the boy's mind addressed
+itself naturally to holy things. But in a day or two he asked for more
+books. Mr. Carey took him into his study, showed him the shelf in which he
+kept illustrated works, and chose for him one that dealt with Rome. Philip
+took it greedily. The pictures led him to a new amusement. He began to
+read the page before and the page after each engraving to find out what it
+was about, and soon he lost all interest in his toys.
+
+Then, when no one was near, he took out books for himself; and perhaps
+because the first impression on his mind was made by an Eastern town, he
+found his chief amusement in those which described the Levant. His heart
+beat with excitement at the pictures of mosques and rich palaces; but
+there was one, in a book on Constantinople, which peculiarly stirred his
+imagination. It was called the Hall of the Thousand Columns. It was a
+Byzantine cistern, which the popular fancy had endowed with fantastic
+vastness; and the legend which he read told that a boat was always moored
+at the entrance to tempt the unwary, but no traveller venturing into the
+darkness had ever been seen again. And Philip wondered whether the boat
+went on for ever through one pillared alley after another or came at last
+to some strange mansion.
+
+One day a good fortune befell him, for he hit upon Lane's translation of
+The Thousand Nights and a Night. He was captured first by the
+illustrations, and then he began to read, to start with, the stories that
+dealt with magic, and then the others; and those he liked he read again
+and again. He could think of nothing else. He forgot the life about him.
+He had to be called two or three times before he would come to his dinner.
+Insensibly he formed the most delightful habit in the world, the habit of
+reading: he did not know that thus he was providing himself with a refuge
+from all the distress of life; he did not know either that he was creating
+for himself an unreal world which would make the real world of every day
+a source of bitter disappointment. Presently he began to read other
+things. His brain was precocious. His uncle and aunt, seeing that he
+occupied himself and neither worried nor made a noise, ceased to trouble
+themselves about him. Mr. Carey had so many books that he did not know
+them, and as he read little he forgot the odd lots he had bought at one
+time and another because they were cheap. Haphazard among the sermons and
+homilies, the travels, the lives of the Saints, the Fathers, the histories
+of the church, were old-fashioned novels; and these Philip at last
+discovered. He chose them by their titles, and the first he read was The
+Lancashire Witches, and then he read The Admirable Crichton, and then
+many more. Whenever he started a book with two solitary travellers riding
+along the brink of a desperate ravine he knew he was safe.
+
+The summer was come now, and the gardener, an old sailor, made him a
+hammock and fixed it up for him in the branches of a weeping willow. And
+here for long hours he lay, hidden from anyone who might come to the
+vicarage, reading, reading passionately. Time passed and it was July;
+August came: on Sundays the church was crowded with strangers, and the
+collection at the offertory often amounted to two pounds. Neither the
+Vicar nor Mrs. Carey went out of the garden much during this period; for
+they disliked strange faces, and they looked upon the visitors from London
+with aversion. The house opposite was taken for six weeks by a gentleman
+who had two little boys, and he sent in to ask if Philip would like to go
+and play with them; but Mrs. Carey returned a polite refusal. She was
+afraid that Philip would be corrupted by little boys from London. He was
+going to be a clergyman, and it was necessary that he should be preserved
+from contamination. She liked to see in him an infant Samuel.
+
+
+
+X
+
+
+The Careys made up their minds to send Philip to King's School at
+Tercanbury. The neighbouring clergy sent their sons there. It was united
+by long tradition to the Cathedral: its headmaster was an honorary Canon,
+and a past headmaster was the Archdeacon. Boys were encouraged there to
+aspire to Holy Orders, and the education was such as might prepare an
+honest lad to spend his life in God's service. A preparatory school was
+attached to it, and to this it was arranged that Philip should go. Mr.
+Carey took him into Tercanbury one Thursday afternoon towards the end of
+September. All day Philip had been excited and rather frightened. He knew
+little of school life but what he had read in the stories of The Boy's
+Own Paper. He had also read Eric, or Little by Little.
+
+When they got out of the train at Tercanbury, Philip felt sick with
+apprehension, and during the drive in to the town sat pale and silent. The
+high brick wall in front of the school gave it the look of a prison. There
+was a little door in it, which opened on their ringing; and a clumsy,
+untidy man came out and fetched Philip's tin trunk and his play-box. They
+were shown into the drawing-room; it was filled with massive, ugly
+furniture, and the chairs of the suite were placed round the walls with a
+forbidding rigidity. They waited for the headmaster.
+
+"What's Mr. Watson like?" asked Philip, after a while.
+
+"You'll see for yourself."
+
+There was another pause. Mr. Carey wondered why the headmaster did not
+come. Presently Philip made an effort and spoke again.
+
+"Tell him I've got a club-foot," he said.
+
+Before Mr. Carey could speak the door burst open and Mr. Watson swept into
+the room. To Philip he seemed gigantic. He was a man of over six feet
+high, and broad, with enormous hands and a great red beard; he talked
+loudly in a jovial manner; but his aggressive cheerfulness struck terror
+in Philip's heart. He shook hands with Mr. Carey, and then took Philip's
+small hand in his.
+
+"Well, young fellow, are you glad to come to school?" he shouted.
+
+Philip reddened and found no word to answer.
+
+"How old are you?"
+
+"Nine," said Philip.
+
+"You must say sir," said his uncle.
+
+"I expect you've got a good lot to learn," the headmaster bellowed
+cheerily.
+
+To give the boy confidence he began to tickle him with rough fingers.
+Philip, feeling shy and uncomfortable, squirmed under his touch.
+
+"I've put him in the small dormitory for the present.... You'll like that,
+won't you?" he added to Philip. "Only eight of you in there. You won't
+feel so strange."
+
+Then the door opened, and Mrs. Watson came in. She was a dark woman with
+black hair, neatly parted in the middle. She had curiously thick lips and
+a small round nose. Her eyes were large and black. There was a singular
+coldness in her appearance. She seldom spoke and smiled more seldom still.
+Her husband introduced Mr. Carey to her, and then gave Philip a friendly
+push towards her.
+
+"This is a new boy, Helen. His name's Carey."
+
+Without a word she shook hands with Philip and then sat down, not
+speaking, while the headmaster asked Mr. Carey how much Philip knew and
+what books he had been working with. The Vicar of Blackstable was a little
+embarrassed by Mr. Watson's boisterous heartiness, and in a moment or two
+got up.
+
+"I think I'd better leave Philip with you now."
+
+"That's all right," said Mr. Watson. "He'll be safe with me. He'll get on
+like a house on fire. Won't you, young fellow?"
+
+Without waiting for an answer from Philip the big man burst into a great
+bellow of laughter. Mr. Carey kissed Philip on the forehead and went away.
+
+"Come along, young fellow," shouted Mr. Watson. "I'll show you the
+school-room."
+
+He swept out of the drawing-room with giant strides, and Philip hurriedly
+limped behind him. He was taken into a long, bare room with two tables
+that ran along its whole length; on each side of them were wooden forms.
+
+"Nobody much here yet," said Mr. Watson. "I'll just show you the
+playground, and then I'll leave you to shift for yourself."
+
+Mr. Watson led the way. Philip found himself in a large playground with
+high brick walls on three sides of it. On the fourth side was an iron
+railing through which you saw a vast lawn and beyond this some of the
+buildings of King's School. One small boy was wandering disconsolately,
+kicking up the gravel as he walked.
+
+"Hulloa, Venning," shouted Mr. Watson. "When did you turn up?"
+
+The small boy came forward and shook hands.
+
+"Here's a new boy. He's older and bigger than you, so don't you bully
+him."
+
+The headmaster glared amicably at the two children, filling them with fear
+by the roar of his voice, and then with a guffaw left them.
+
+"What's your name?"
+
+"Carey."
+
+"What's your father?"
+
+"He's dead."
+
+"Oh! Does your mother wash?"
+
+"My mother's dead, too."
+
+Philip thought this answer would cause the boy a certain awkwardness, but
+Venning was not to be turned from his facetiousness for so little.
+
+"Well, did she wash?" he went on.
+
+"Yes," said Philip indignantly.
+
+"She was a washerwoman then?"
+
+"No, she wasn't."
+
+"Then she didn't wash."
+
+The little boy crowed with delight at the success of his dialectic. Then
+he caught sight of Philip's feet.
+
+"What's the matter with your foot?"
+
+Philip instinctively tried to withdraw it from sight. He hid it behind the
+one which was whole.
+
+"I've got a club-foot," he answered.
+
+"How did you get it?"
+
+"I've always had it."
+
+"Let's have a look."
+
+"No."
+
+"Don't then."
+
+The little boy accompanied the words with a sharp kick on Philip's shin,
+which Philip did not expect and thus could not guard against. The pain was
+so great that it made him gasp, but greater than the pain was the
+surprise. He did not know why Venning kicked him. He had not the presence
+of mind to give him a black eye. Besides, the boy was smaller than he, and
+he had read in The Boy's Own Paper that it was a mean thing to hit
+anyone smaller than yourself. While Philip was nursing his shin a third
+boy appeared, and his tormentor left him. In a little while he noticed
+that the pair were talking about him, and he felt they were looking at his
+feet. He grew hot and uncomfortable.
+
+But others arrived, a dozen together, and then more, and they began to
+talk about their doings during the holidays, where they had been, and what
+wonderful cricket they had played. A few new boys appeared, and with these
+presently Philip found himself talking. He was shy and nervous. He was
+anxious to make himself pleasant, but he could not think of anything to
+say. He was asked a great many questions and answered them all quite
+willingly. One boy asked him whether he could play cricket.
+
+"No," answered Philip. "I've got a club-foot."
+
+The boy looked down quickly and reddened. Philip saw that he felt he had
+asked an unseemly question. He was too shy to apologise and looked at
+Philip awkwardly.
+
+
+
+XI
+
+
+Next morning when the clanging of a bell awoke Philip he looked round his
+cubicle in astonishment. Then a voice sang out, and he remembered where he
+was.
+
+"Are you awake, Singer?"
+
+The partitions of the cubicle were of polished pitch-pine, and there was
+a green curtain in front. In those days there was little thought of
+ventilation, and the windows were closed except when the dormitory was
+aired in the morning.
+
+Philip got up and knelt down to say his prayers. It was a cold morning,
+and he shivered a little; but he had been taught by his uncle that his
+prayers were more acceptable to God if he said them in his nightshirt than
+if he waited till he was dressed. This did not surprise him, for he was
+beginning to realise that he was the creature of a God who appreciated the
+discomfort of his worshippers. Then he washed. There were two baths for
+the fifty boarders, and each boy had a bath once a week. The rest of his
+washing was done in a small basin on a wash-stand, which with the bed and
+a chair, made up the furniture of each cubicle. The boys chatted gaily
+while they dressed. Philip was all ears. Then another bell sounded, and
+they ran downstairs. They took their seats on the forms on each side of
+the two long tables in the school-room; and Mr. Watson, followed by his
+wife and the servants, came in and sat down. Mr. Watson read prayers in an
+impressive manner, and the supplications thundered out in his loud voice
+as though they were threats personally addressed to each boy. Philip
+listened with anxiety. Then Mr. Watson read a chapter from the Bible, and
+the servants trooped out. In a moment the untidy youth brought in two
+large pots of tea and on a second journey immense dishes of bread and
+butter.
+
+Philip had a squeamish appetite, and the thick slabs of poor butter on the
+bread turned his stomach, but he saw other boys scraping it off and
+followed their example. They all had potted meats and such like, which
+they had brought in their play-boxes; and some had 'extras,' eggs or
+bacon, upon which Mr. Watson made a profit. When he had asked Mr. Carey
+whether Philip was to have these, Mr. Carey replied that he did not think
+boys should be spoilt. Mr. Watson quite agreed with him--he considered
+nothing was better than bread and butter for growing lads--but some
+parents, unduly pampering their offspring, insisted on it.
+
+Philip noticed that 'extras' gave boys a certain consideration and made up
+his mind, when he wrote to Aunt Louisa, to ask for them.
+
+After breakfast the boys wandered out into the play-ground. Here the
+day-boys were gradually assembling. They were sons of the local clergy, of
+the officers at the Depot, and of such manufacturers or men of business as
+the old town possessed. Presently a bell rang, and they all trooped into
+school. This consisted of a large, long room at opposite ends of which two
+under-masters conducted the second and third forms, and of a smaller one,
+leading out of it, used by Mr. Watson, who taught the first form. To
+attach the preparatory to the senior school these three classes were known
+officially, on speech days and in reports, as upper, middle, and lower
+second. Philip was put in the last. The master, a red-faced man with a
+pleasant voice, was called Rice; he had a jolly manner with boys, and the
+time passed quickly. Philip was surprised when it was a quarter to eleven
+and they were let out for ten minutes' rest.
+
+The whole school rushed noisily into the play-ground. The new boys were
+told to go into the middle, while the others stationed themselves along
+opposite walls. They began to play Pig in the Middle. The old boys ran
+from wall to wall while the new boys tried to catch them: when one was
+seized and the mystic words said--one, two, three, and a pig for me--he
+became a prisoner and, turning sides, helped to catch those who were still
+free. Philip saw a boy running past and tried to catch him, but his limp
+gave him no chance; and the runners, taking their opportunity, made
+straight for the ground he covered. Then one of them had the brilliant
+idea of imitating Philip's clumsy run. Other boys saw it and began to
+laugh; then they all copied the first; and they ran round Philip, limping
+grotesquely, screaming in their treble voices with shrill laughter. They
+lost their heads with the delight of their new amusement, and choked with
+helpless merriment. One of them tripped Philip up and he fell, heavily as
+he always fell, and cut his knee. They laughed all the louder when he got
+up. A boy pushed him from behind, and he would have fallen again if
+another had not caught him. The game was forgotten in the entertainment of
+Philip's deformity. One of them invented an odd, rolling limp that struck
+the rest as supremely ridiculous, and several of the boys lay down on the
+ground and rolled about in laughter: Philip was completely scared. He
+could not make out why they were laughing at him. His heart beat so that
+he could hardly breathe, and he was more frightened than he had ever been
+in his life. He stood still stupidly while the boys ran round him,
+mimicking and laughing; they shouted to him to try and catch them; but he
+did not move. He did not want them to see him run any more. He was using
+all his strength to prevent himself from crying.
+
+Suddenly the bell rang, and they all trooped back to school. Philip's knee
+was bleeding, and he was dusty and dishevelled. For some minutes Mr. Rice
+could not control his form. They were excited still by the strange
+novelty, and Philip saw one or two of them furtively looking down at his
+feet. He tucked them under the bench.
+
+In the afternoon they went up to play football, but Mr. Watson stopped
+Philip on the way out after dinner.
+
+"I suppose you can't play football, Carey?" he asked him.
+
+Philip blushed self-consciously.
+
+"No, sir."
+
+"Very well. You'd better go up to the field. You can walk as far as that,
+can't you?"
+
+Philip had no idea where the field was, but he answered all the same.
+
+"Yes, sir."
+
+The boys went in charge of Mr. Rice, who glanced at Philip and seeing he
+had not changed, asked why he was not going to play.
+
+"Mr. Watson said I needn't, sir," said Philip.
+
+"Why?"
+
+There were boys all round him, looking at him curiously, and a feeling of
+shame came over Philip. He looked down without answering. Others gave the
+reply.
+
+"He's got a club-foot, sir."
+
+"Oh, I see."
+
+Mr. Rice was quite young; he had only taken his degree a year before; and
+he was suddenly embarrassed. His instinct was to beg the boy's pardon, but
+he was too shy to do so. He made his voice gruff and loud.
+
+"Now then, you boys, what are you waiting about for? Get on with you."
+
+Some of them had already started and those that were left now set off, in
+groups of two or three.
+
+"You'd better come along with me, Carey," said the master. "You don't know
+the way, do you?"
+
+Philip guessed the kindness, and a sob came to his throat.
+
+"I can't go very fast, sir."
+
+"Then I'll go very slow," said the master, with a smile.
+
+Philip's heart went out to the red-faced, commonplace young man who said
+a gentle word to him. He suddenly felt less unhappy.
+
+But at night when they went up to bed and were undressing, the boy who was
+called Singer came out of his cubicle and put his head in Philip's.
+
+"I say, let's look at your foot," he said.
+
+"No," answered Philip.
+
+He jumped into bed quickly.
+
+"Don't say no to me," said Singer. "Come on, Mason."
+
+The boy in the next cubicle was looking round the corner, and at the words
+he slipped in. They made for Philip and tried to tear the bed-clothes off
+him, but he held them tightly.
+
+"Why can't you leave me alone?" he cried.
+
+Singer seized a brush and with the back of it beat Philip's hands clenched
+on the blanket. Philip cried out.
+
+"Why don't you show us your foot quietly?"
+
+"I won't."
+
+In desperation Philip clenched his fist and hit the boy who tormented him,
+but he was at a disadvantage, and the boy seized his arm. He began to turn
+it.
+
+"Oh, don't, don't," said Philip. "You'll break my arm."
+
+"Stop still then and put out your foot."
+
+Philip gave a sob and a gasp. The boy gave the arm another wrench. The
+pain was unendurable.
+
+"All right. I'll do it," said Philip.
+
+He put out his foot. Singer still kept his hand on Philip's wrist. He
+looked curiously at the deformity.
+
+"Isn't it beastly?" said Mason.
+
+Another came in and looked too.
+
+"Ugh," he said, in disgust.
+
+"My word, it is rum," said Singer, making a face. "Is it hard?"
+
+He touched it with the tip of his forefinger, cautiously
<TRUNCATED>
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/resources/orders.txt
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/resources/orders.txt b/crunch-core/src/it/resources/orders.txt
new file mode 100644
index 0000000..2f1383f
--- /dev/null
+++ b/crunch-core/src/it/resources/orders.txt
@@ -0,0 +1,4 @@
+222|Toilet plunger
+333|Toilet brush
+222|Toilet paper
+111|Corn flakes
\ No newline at end of file
[17/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/resources/org/apache/crunch/UnionITData/src1.txt
----------------------------------------------------------------------
diff --git a/crunch/src/it/resources/org/apache/crunch/UnionITData/src1.txt b/crunch/src/it/resources/org/apache/crunch/UnionITData/src1.txt
deleted file mode 100644
index a92974b..0000000
--- a/crunch/src/it/resources/org/apache/crunch/UnionITData/src1.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-a1
-b2
-a1
-a1
-b2
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/resources/org/apache/crunch/UnionITData/src2.txt
----------------------------------------------------------------------
diff --git a/crunch/src/it/resources/org/apache/crunch/UnionITData/src2.txt b/crunch/src/it/resources/org/apache/crunch/UnionITData/src2.txt
deleted file mode 100644
index 9363398..0000000
--- a/crunch/src/it/resources/org/apache/crunch/UnionITData/src2.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-c3
-a1
-c3
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/resources/org/apache/crunch/fn/AggregatorsITData/ints.txt
----------------------------------------------------------------------
diff --git a/crunch/src/it/resources/org/apache/crunch/fn/AggregatorsITData/ints.txt b/crunch/src/it/resources/org/apache/crunch/fn/AggregatorsITData/ints.txt
deleted file mode 100644
index 680cb09..0000000
--- a/crunch/src/it/resources/org/apache/crunch/fn/AggregatorsITData/ints.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-a 1 2
-a 3 4
-b 2 3
-a 5 6
-b 9 10
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/resources/org/apache/crunch/lib/CogroupITData/src1.txt
----------------------------------------------------------------------
diff --git a/crunch/src/it/resources/org/apache/crunch/lib/CogroupITData/src1.txt b/crunch/src/it/resources/org/apache/crunch/lib/CogroupITData/src1.txt
deleted file mode 100644
index 9f38eb9..0000000
--- a/crunch/src/it/resources/org/apache/crunch/lib/CogroupITData/src1.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-a,1-1
-b,1-2
-c,1-3
-a,1-4
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/resources/org/apache/crunch/lib/CogroupITData/src2.txt
----------------------------------------------------------------------
diff --git a/crunch/src/it/resources/org/apache/crunch/lib/CogroupITData/src2.txt b/crunch/src/it/resources/org/apache/crunch/lib/CogroupITData/src2.txt
deleted file mode 100644
index ed9524e..0000000
--- a/crunch/src/it/resources/org/apache/crunch/lib/CogroupITData/src2.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-b,2-1
-c,2-2
-c,2-3
-d,2-4
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/resources/secondary_sort_input.txt
----------------------------------------------------------------------
diff --git a/crunch/src/it/resources/secondary_sort_input.txt b/crunch/src/it/resources/secondary_sort_input.txt
deleted file mode 100644
index 3c7be93..0000000
--- a/crunch/src/it/resources/secondary_sort_input.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-one,1,1
-one,2,-3
-two,4,5
-two,2,6
-two,1,7,9
-three,0,-1
-one,-5,10
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/resources/set1.txt
----------------------------------------------------------------------
diff --git a/crunch/src/it/resources/set1.txt b/crunch/src/it/resources/set1.txt
deleted file mode 100644
index 3b67f57..0000000
--- a/crunch/src/it/resources/set1.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-b
-c
-a
-e
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/resources/set2.txt
----------------------------------------------------------------------
diff --git a/crunch/src/it/resources/set2.txt b/crunch/src/it/resources/set2.txt
deleted file mode 100644
index 8169ab5..0000000
--- a/crunch/src/it/resources/set2.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-c
-d
-a
\ No newline at end of file
[33/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/PCollectionImpl.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/PCollectionImpl.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/PCollectionImpl.java
new file mode 100644
index 0000000..6ea9c4c
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/PCollectionImpl.java
@@ -0,0 +1,295 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.collect;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.crunch.DoFn;
+import org.apache.crunch.FilterFn;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PObject;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.ParallelDoOptions;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.SourceTarget;
+import org.apache.crunch.Target;
+import org.apache.crunch.fn.ExtractKeyFn;
+import org.apache.crunch.fn.IdentityFn;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.impl.mr.plan.DoNode;
+import org.apache.crunch.lib.Aggregate;
+import org.apache.crunch.materialize.pobject.CollectionPObject;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.PTypeFamily;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+
+/**
+ * Base class for the {@link PCollection} implementations in this package.
+ *
+ * <p>An instance carries its name, the {@link ParallelDoOptions} it was created with, a
+ * lazily resolved {@link MRPipeline} reference and, optionally, a {@link SourceTarget}
+ * recording where the collection has been materialized. Once {@code materializedAt} is
+ * set, {@link #write(Target)}, {@link #accept(Visitor)} and {@link #getSize()} consult the
+ * materialized copy before (or instead of) this node.
+ *
+ * @param <S> the element type of the collection
+ */
+public abstract class PCollectionImpl<S> implements PCollection<S> {
+
+  private static final Log LOG = LogFactory.getLog(PCollectionImpl.class);
+
+  private final String name;
+  // Filled in on first use by getPipeline() when it is still null at that point.
+  protected MRPipeline pipeline;
+  // Null until materializeAt(...) is called (or a subclass assigns it directly).
+  protected SourceTarget<S> materializedAt;
+  private final ParallelDoOptions options;
+
+  /**
+   * Creates a collection with the given name and default-built {@link ParallelDoOptions}.
+   *
+   * @param name the name reported by {@link #getName()}
+   */
+  public PCollectionImpl(String name) {
+    this(name, ParallelDoOptions.builder().build());
+  }
+
+  /**
+   * Creates a collection with the given name and options.
+   *
+   * @param name the name reported by {@link #getName()}
+   * @param options the options whose source targets seed {@link #getTargetDependencies()}
+   */
+  public PCollectionImpl(String name, ParallelDoOptions options) {
+    this.name = name;
+    this.options = options;
+  }
+
+  @Override
+  public String getName() {
+    return name;
+  }
+
+  /** Returns the same value as {@link #getName()}. */
+  @Override
+  public String toString() {
+    return getName();
+  }
+
+  /**
+   * Unions this collection with a single other collection by delegating to the varargs
+   * overload.
+   */
+  @Override
+  // Java cannot create a generic array, so a raw one-element array is handed to the
+  // varargs overload; it only ever holds the PCollection<S> passed in here.
+  @SuppressWarnings("unchecked")
+  public PCollection<S> union(PCollection<S> other) {
+    return union(new PCollection[] { other });
+  }
+
+  /**
+   * Builds a {@link UnionCollection} over this collection and the given ones.
+   *
+   * <p>This collection is added as-is; every argument is first passed through an identity
+   * {@code parallelDo} using its own {@link PType}, and the result is cast to
+   * {@code PCollectionImpl}.
+   *
+   * @param collections the collections to union with this one
+   * @return the union collection
+   */
+  @Override
+  public PCollection<S> union(PCollection<S>... collections) {
+    List<PCollectionImpl<S>> internal = Lists.newArrayList();
+    internal.add(this);
+    for (PCollection<S> collection : collections) {
+      internal.add((PCollectionImpl<S>) collection.parallelDo(IdentityFn.<S>getInstance(), collection.getPType()));
+    }
+    return new UnionCollection<S>(internal);
+  }
+
+  /**
+   * Applies {@code fn} under a generated name: {@code "S"} followed by the pipeline's next
+   * anonymous stage id.
+   */
+  @Override
+  public <T> PCollection<T> parallelDo(DoFn<S, T> fn, PType<T> type) {
+    MRPipeline pipeline = (MRPipeline) getPipeline();
+    return parallelDo("S" + pipeline.getNextAnonymousStageId(), fn, type);
+  }
+
+  /** Applies {@code fn} to the chaining collection, producing a {@link DoCollectionImpl}. */
+  @Override
+  public <T> PCollection<T> parallelDo(String name, DoFn<S, T> fn, PType<T> type) {
+    return new DoCollectionImpl<T>(name, getChainingCollection(), fn, type);
+  }
+
+  /** Same as the three-argument overload, additionally passing {@code options} through. */
+  @Override
+  public <T> PCollection<T> parallelDo(String name, DoFn<S, T> fn, PType<T> type,
+      ParallelDoOptions options) {
+    return new DoCollectionImpl<T>(name, getChainingCollection(), fn, type, options);
+  }
+
+  /**
+   * Table-producing variant of the anonymous {@code parallelDo}; the stage name is
+   * generated the same way ({@code "S"} plus the next anonymous stage id).
+   */
+  @Override
+  public <K, V> PTable<K, V> parallelDo(DoFn<S, Pair<K, V>> fn, PTableType<K, V> type) {
+    MRPipeline pipeline = (MRPipeline) getPipeline();
+    return parallelDo("S" + pipeline.getNextAnonymousStageId(), fn, type);
+  }
+
+  /** Applies {@code fn} to the chaining collection, producing a {@link DoTableImpl}. */
+  @Override
+  public <K, V> PTable<K, V> parallelDo(String name, DoFn<S, Pair<K, V>> fn, PTableType<K, V> type) {
+    return new DoTableImpl<K, V>(name, getChainingCollection(), fn, type);
+  }
+
+  /** Same as the three-argument table overload, additionally passing {@code options} through. */
+  @Override
+  public <K, V> PTable<K, V> parallelDo(String name, DoFn<S, Pair<K, V>> fn, PTableType<K, V> type,
+      ParallelDoOptions options) {
+    return new DoTableImpl<K, V>(name, getChainingCollection(), fn, type, options);
+  }
+
+  /**
+   * Registers a write of this collection to {@code target} with the pipeline.
+   *
+   * <p>If the collection has already been materialized, an {@link InputCollection} reading
+   * from {@code materializedAt} is written instead of this node.
+   *
+   * @param target where to write
+   * @return this collection
+   */
+  @Override
+  public PCollection<S> write(Target target) {
+    if (materializedAt != null) {
+      getPipeline().write(new InputCollection<S>(materializedAt, (MRPipeline) getPipeline()), target);
+    } else {
+      getPipeline().write(this, target);
+    }
+    return this;
+  }
+
+  /**
+   * Same as {@link #write(Target)}, forwarding the given write mode to the pipeline.
+   *
+   * @param target where to write
+   * @param writeMode the write mode handed to the pipeline
+   * @return this collection
+   */
+  @Override
+  public PCollection<S> write(Target target, Target.WriteMode writeMode) {
+    if (materializedAt != null) {
+      getPipeline().write(new InputCollection<S>(materializedAt, (MRPipeline) getPipeline()), target,
+          writeMode);
+    } else {
+      getPipeline().write(this, target, writeMode);
+    }
+    return this;
+  }
+
+  /**
+   * Materializes this collection through the pipeline.
+   *
+   * <p>When {@link #getSize()} reports zero, a warning is logged and an empty list is
+   * returned without involving the pipeline.
+   */
+  @Override
+  public Iterable<S> materialize() {
+    if (getSize() == 0) {
+      LOG.warn("Materializing an empty PCollection: " + this.getName());
+      return Collections.emptyList();
+    }
+    return getPipeline().materialize(this);
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  public PObject<Collection<S>> asCollection() {
+    return new CollectionPObject<S>(this);
+  }
+
+  /** Returns the recorded materialization location, or {@code null} if none has been set. */
+  public SourceTarget<S> getMaterializedAt() {
+    return materializedAt;
+  }
+
+  /** Records {@code sourceTarget} as the location this collection is materialized at. */
+  public void materializeAt(SourceTarget<S> sourceTarget) {
+    this.materializedAt = sourceTarget;
+  }
+
+  /** Runs {@code filterFn} as an anonymous {@code parallelDo} that keeps this collection's PType. */
+  @Override
+  public PCollection<S> filter(FilterFn<S> filterFn) {
+    return parallelDo(filterFn, getPType());
+  }
+
+  /** Named variant of {@link #filter(FilterFn)}. */
+  @Override
+  public PCollection<S> filter(String name, FilterFn<S> filterFn) {
+    return parallelDo(name, filterFn, getPType());
+  }
+
+  /**
+   * Builds a table from this collection by wrapping {@code mapFn} in an
+   * {@link ExtractKeyFn}; the table type pairs {@code keyType} with this collection's PType.
+   */
+  @Override
+  public <K> PTable<K, S> by(MapFn<S, K> mapFn, PType<K> keyType) {
+    return parallelDo(new ExtractKeyFn<K, S>(mapFn), getTypeFamily().tableOf(keyType, getPType()));
+  }
+
+  /** Named variant of {@link #by(MapFn, PType)}. */
+  @Override
+  public <K> PTable<K, S> by(String name, MapFn<S, K> mapFn, PType<K> keyType) {
+    return parallelDo(name, new ExtractKeyFn<K, S>(mapFn), getTypeFamily().tableOf(keyType, getPType()));
+  }
+
+  /** Delegates to {@link Aggregate#count}. */
+  @Override
+  public PTable<S, Long> count() {
+    return Aggregate.count(this);
+  }
+
+  /** Delegates to {@link Aggregate#length}. */
+  @Override
+  public PObject<Long> length() {
+    return Aggregate.length(this);
+  }
+
+  /** Delegates to {@link Aggregate#max}. */
+  @Override
+  public PObject<S> max() {
+    return Aggregate.max(this);
+  }
+
+  /** Delegates to {@link Aggregate#min}. */
+  @Override
+  public PObject<S> min() {
+    return Aggregate.min(this);
+  }
+
+  /** Returns the type family of this collection's PType. */
+  @Override
+  public PTypeFamily getTypeFamily() {
+    return getPType().getFamily();
+  }
+
+  /** Creates the planner {@link DoNode} for this collection. */
+  public abstract DoNode createDoNode();
+
+  /** Returns the collections this one is derived from. */
+  public abstract List<PCollectionImpl<?>> getParents();
+
+  /**
+   * Returns the single parent of this collection.
+   *
+   * @throws IllegalArgumentException if {@link #getParents()} does not contain exactly one element
+   */
+  public PCollectionImpl<?> getOnlyParent() {
+    List<PCollectionImpl<?>> parents = getParents();
+    if (parents.size() != 1) {
+      throw new IllegalArgumentException("Expected exactly one parent PCollection");
+    }
+    return parents.get(0);
+  }
+
+  /**
+   * Returns the pipeline this collection belongs to.
+   *
+   * <p>If the {@code pipeline} field is still null, it is taken from the first parent and
+   * cached, so this path requires {@link #getParents()} to be non-empty.
+   */
+  @Override
+  public Pipeline getPipeline() {
+    if (pipeline == null) {
+      pipeline = (MRPipeline) getParents().get(0).getPipeline();
+    }
+    return pipeline;
+  }
+
+  /**
+   * Collects the source targets this collection depends on: those of its own options,
+   * combined via {@code Sets.union} with the dependencies of every parent, recursively.
+   */
+  public Set<SourceTarget<?>> getTargetDependencies() {
+    Set<SourceTarget<?>> targetDeps = options.getSourceTargets();
+    for (PCollectionImpl<?> parent : getParents()) {
+      targetDeps = Sets.union(targetDeps, parent.getTargetDependencies());
+    }
+    return targetDeps;
+  }
+
+  /**
+   * Returns one more than the largest depth among the parents; a collection without
+   * parents has depth 1.
+   */
+  public int getDepth() {
+    int parentMax = 0;
+    for (PCollectionImpl<?> parent : getParents()) {
+      parentMax = Math.max(parent.getDepth(), parentMax);
+    }
+    return 1 + parentMax;
+  }
+
+  /** Visitor over the concrete collection kinds in this package. */
+  public interface Visitor {
+    void visitInputCollection(InputCollection<?> collection);
+
+    void visitUnionCollection(UnionCollection<?> collection);
+
+    void visitDoFnCollection(DoCollectionImpl<?> collection);
+
+    void visitDoTable(DoTableImpl<?, ?> collection);
+
+    void visitGroupedTable(PGroupedTableImpl<?, ?> collection);
+  }
+
+  /**
+   * Dispatches to {@code visitor}. A materialized collection is presented as an
+   * {@link InputCollection} over {@code materializedAt}; otherwise the subclass-specific
+   * {@link #acceptInternal(Visitor)} is used.
+   */
+  public void accept(Visitor visitor) {
+    if (materializedAt != null) {
+      visitor.visitInputCollection(new InputCollection<S>(materializedAt, (MRPipeline) getPipeline()));
+    } else {
+      acceptInternal(visitor);
+    }
+  }
+
+  /** Calls the {@link Visitor} method matching the concrete subclass. */
+  protected abstract void acceptInternal(Visitor visitor);
+
+  /**
+   * Returns the size of this collection.
+   *
+   * <p>If the collection is materialized and the materialized size is positive, that value
+   * is returned; in every other case the result of {@link #getSizeInternal()} is used.
+   */
+  @Override
+  public long getSize() {
+    if (materializedAt != null) {
+      long sz = materializedAt.getSize(getPipeline().getConfiguration());
+      if (sz > 0) {
+        return sz;
+      }
+    }
+    return getSizeInternal();
+  }
+
+  /** Returns the subclass-specific size used by {@link #getSize()} as its fallback. */
+  protected abstract long getSizeInternal();
+
+  /**
+   * Retrieve the PCollectionImpl to be used for chaining within PCollectionImpls further down the pipeline.
+   * This base implementation returns {@code this}; subclasses may override it.
+   * @return The PCollectionImpl instance to be chained
+   */
+  protected PCollectionImpl<S> getChainingCollection() {
+    return this;
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/PGroupedTableImpl.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/PGroupedTableImpl.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/PGroupedTableImpl.java
new file mode 100644
index 0000000..ccac5d5
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/PGroupedTableImpl.java
@@ -0,0 +1,144 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.collect;
+
+import java.util.List;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.crunch.Aggregator;
+import org.apache.crunch.CombineFn;
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.GroupingOptions;
+import org.apache.crunch.PGroupedTable;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.SourceTarget;
+import org.apache.crunch.fn.Aggregators;
+import org.apache.crunch.impl.mr.plan.DoNode;
+import org.apache.crunch.types.PGroupedTableType;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.util.PartitionUtils;
+import org.apache.hadoop.mapreduce.Job;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Sets;
+
+public class PGroupedTableImpl<K, V> extends PCollectionImpl<Pair<K, Iterable<V>>> implements PGroupedTable<K, V> {
+
+ private static final Log LOG = LogFactory.getLog(PGroupedTableImpl.class);
+
+ private final PTableBase<K, V> parent;
+ private final GroupingOptions groupingOptions;
+ private final PGroupedTableType<K, V> ptype;
+
+ PGroupedTableImpl(PTableBase<K, V> parent) {
+ this(parent, null);
+ }
+
+ PGroupedTableImpl(PTableBase<K, V> parent, GroupingOptions groupingOptions) {
+ super("GBK");
+ this.parent = parent;
+ this.groupingOptions = groupingOptions;
+ this.ptype = parent.getPTableType().getGroupedTableType();
+ }
+
+ public void configureShuffle(Job job) {
+ ptype.configureShuffle(job, groupingOptions);
+ if (groupingOptions == null || groupingOptions.getNumReducers() <= 0) {
+ int numReduceTasks = PartitionUtils.getRecommendedPartitions(this, getPipeline().getConfiguration());
+ if (numReduceTasks > 0) {
+ job.setNumReduceTasks(numReduceTasks);
+ LOG.info(String.format("Setting num reduce tasks to %d", numReduceTasks));
+ } else {
+ LOG.warn("Attempted to set a negative number of reduce tasks");
+ }
+ }
+ }
+
+ @Override
+ protected long getSizeInternal() {
+ return parent.getSizeInternal();
+ }
+
+ @Override
+ public PType<Pair<K, Iterable<V>>> getPType() {
+ return ptype;
+ }
+
+ @Override
+ public PTable<K, V> combineValues(CombineFn<K, V> combineFn) {
+ return new DoTableImpl<K, V>("combine", getChainingCollection(), combineFn, parent.getPTableType());
+ }
+
+ @Override
+ public PTable<K, V> combineValues(Aggregator<V> agg) {
+ return combineValues(Aggregators.<K, V>toCombineFn(agg));
+ }
+
+ private static class Ungroup<K, V> extends DoFn<Pair<K, Iterable<V>>, Pair<K, V>> {
+ @Override
+ public void process(Pair<K, Iterable<V>> input, Emitter<Pair<K, V>> emitter) {
+ for (V v : input.second()) {
+ emitter.emit(Pair.of(input.first(), v));
+ }
+ }
+ }
+
+ public PTable<K, V> ungroup() {
+ return parallelDo("ungroup", new Ungroup<K, V>(), parent.getPTableType());
+ }
+
+ @Override
+ protected void acceptInternal(PCollectionImpl.Visitor visitor) {
+ visitor.visitGroupedTable(this);
+ }
+
+ @Override
+ public Set<SourceTarget<?>> getTargetDependencies() {
+ Set<SourceTarget<?>> td = Sets.newHashSet(super.getTargetDependencies());
+ if (groupingOptions != null) {
+ td.addAll(groupingOptions.getSourceTargets());
+ }
+ return ImmutableSet.copyOf(td);
+ }
+
+ @Override
+ public List<PCollectionImpl<?>> getParents() {
+ return ImmutableList.<PCollectionImpl<?>> of(parent);
+ }
+
+ @Override
+ public DoNode createDoNode() {
+ return DoNode.createFnNode(getName(), ptype.getInputMapFn(), ptype);
+ }
+
+ public DoNode getGroupingNode() {
+ return DoNode.createGroupingNode("", ptype);
+ }
+
+ @Override
+ protected PCollectionImpl<Pair<K, Iterable<V>>> getChainingCollection() {
+ // Use a copy for chaining to allow sending the output of a single grouped table to multiple outputs
+ // TODO This should be implemented in a cleaner way in the planner
+ return new PGroupedTableImpl<K, V>(parent, groupingOptions);
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/PTableBase.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/PTableBase.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/PTableBase.java
new file mode 100644
index 0000000..3c2393d
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/PTableBase.java
@@ -0,0 +1,169 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.collect;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.crunch.FilterFn;
+import org.apache.crunch.GroupingOptions;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PObject;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.ParallelDoOptions;
+import org.apache.crunch.TableSource;
+import org.apache.crunch.Target;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.lib.Aggregate;
+import org.apache.crunch.lib.Cogroup;
+import org.apache.crunch.lib.Join;
+import org.apache.crunch.lib.PTables;
+import org.apache.crunch.materialize.MaterializableMap;
+import org.apache.crunch.materialize.pobject.MapPObject;
+import org.apache.crunch.types.PType;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Base class for the MapReduce {@link PTable} implementations in this package.
+ * Nearly every operation is a thin delegation to a library helper
+ * ({@link Aggregate}, {@link Join}, {@link Cogroup}, {@link PTables}), to
+ * {@code parallelDo}, or to the owning pipeline.
+ */
+abstract class PTableBase<K, V> extends PCollectionImpl<Pair<K, V>> implements PTable<K, V> {
+
+  public PTableBase(String name) {
+    super(name);
+  }
+
+  public PTableBase(String name, ParallelDoOptions options) {
+    super(name, options);
+  }
+
+  /** Returns the key type taken from this table's table type. */
+  public PType<K> getKeyType() {
+    return getPTableType().getKeyType();
+  }
+
+  /** Returns the value type taken from this table's table type. */
+  public PType<V> getValueType() {
+    return getPTableType().getValueType();
+  }
+
+  /** Groups this table by key without any explicit grouping options. */
+  public PGroupedTableImpl<K, V> groupByKey() {
+    return new PGroupedTableImpl<K, V>(this);
+  }
+
+  /** Groups this table by key, requesting the given number of reducers. */
+  public PGroupedTableImpl<K, V> groupByKey(int numReduceTasks) {
+    GroupingOptions reducerOptions = GroupingOptions.builder().numReducers(numReduceTasks).build();
+    return new PGroupedTableImpl<K, V>(this, reducerOptions);
+  }
+
+  /** Groups this table by key using the supplied grouping options. */
+  public PGroupedTableImpl<K, V> groupByKey(GroupingOptions groupingOptions) {
+    return new PGroupedTableImpl<K, V>(this, groupingOptions);
+  }
+
+  /** Unions this table with one other table by delegating to the varargs overload. */
+  @Override
+  public PTable<K, V> union(PTable<K, V> other) {
+    return union(new PTable[] { other });
+  }
+
+  /**
+   * Unions this table with the given tables. This table comes first in the
+   * resulting {@link UnionTable}; every argument is cast to {@code PTableBase}.
+   */
+  @Override
+  public PTable<K, V> union(PTable<K, V>... others) {
+    List<PTableBase<K, V>> members = Lists.newArrayList();
+    members.add(this);
+    for (int i = 0; i < others.length; i++) {
+      members.add((PTableBase<K, V>) others[i]);
+    }
+    return new UnionTable<K, V>(members);
+  }
+
+  /**
+   * Writes this table to the target. If the table has already been
+   * materialized, an {@link InputTable} over the materialized source is
+   * written instead of this table.
+   */
+  @Override
+  public PTable<K, V> write(Target target) {
+    if (getMaterializedAt() == null) {
+      getPipeline().write(this, target);
+    } else {
+      getPipeline().write(materializedInput(), target);
+    }
+    return this;
+  }
+
+  /**
+   * Same as {@link #write(Target)}, additionally passing the write mode
+   * through to the pipeline.
+   */
+  @Override
+  public PTable<K, V> write(Target target, Target.WriteMode writeMode) {
+    if (getMaterializedAt() == null) {
+      getPipeline().write(this, target, writeMode);
+    } else {
+      getPipeline().write(materializedInput(), target, writeMode);
+    }
+    return this;
+  }
+
+  /** Wraps the source this table was materialized at in an {@link InputTable}. */
+  private InputTable<K, V> materializedInput() {
+    return new InputTable<K, V>(
+        (TableSource<K, V>) getMaterializedAt(), (MRPipeline) getPipeline());
+  }
+
+  @Override
+  public PTable<K, V> filter(FilterFn<Pair<K, V>> filterFn) {
+    return parallelDo(filterFn, getPTableType());
+  }
+
+  @Override
+  public PTable<K, V> filter(String name, FilterFn<Pair<K, V>> filterFn) {
+    return parallelDo(name, filterFn, getPTableType());
+  }
+
+  /** Delegates to {@code Aggregate.top} with the flag set to {@code true}. */
+  @Override
+  public PTable<K, V> top(int count) {
+    return Aggregate.top(this, count, true);
+  }
+
+  /** Delegates to {@code Aggregate.top} with the flag set to {@code false}. */
+  @Override
+  public PTable<K, V> bottom(int count) {
+    return Aggregate.top(this, count, false);
+  }
+
+  @Override
+  public PTable<K, Collection<V>> collectValues() {
+    return Aggregate.collectValues(this);
+  }
+
+  @Override
+  public <U> PTable<K, Pair<V, U>> join(PTable<K, U> other) {
+    return Join.join(this, other);
+  }
+
+  @Override
+  public <U> PTable<K, Pair<Collection<V>, Collection<U>>> cogroup(PTable<K, U> other) {
+    return Cogroup.cogroup(this, other);
+  }
+
+  @Override
+  public PCollection<K> keys() {
+    return PTables.keys(this);
+  }
+
+  @Override
+  public PCollection<V> values() {
+    return PTables.values(this);
+  }
+
+  /**
+   * Returns a Map<K, V> made up of the keys and values in this PTable.
+   */
+  @Override
+  public Map<K, V> materializeToMap() {
+    return new MaterializableMap<K, V>(this.materialize());
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  public PObject<Map<K, V>> asMap() {
+    return new MapPObject<K, V>(this);
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/UnionCollection.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/UnionCollection.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/UnionCollection.java
new file mode 100644
index 0000000..7b3dd7b
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/UnionCollection.java
@@ -0,0 +1,80 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.collect;
+
+import java.util.List;
+
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.impl.mr.plan.DoNode;
+import org.apache.crunch.types.PType;
+
+import com.google.common.collect.ImmutableList;
+
+/**
+ * A collection formed by the union of several parent collections that all
+ * belong to the same pipeline. Its size is the sum of the parents' sizes and
+ * its type is the type of the first parent.
+ */
+public class UnionCollection<S> extends PCollectionImpl<S> {
+
+  private List<PCollectionImpl<S>> parents;
+  private long size = 0;
+
+  /** Builds the name "union(a,b,...)" from the names of the given collections. */
+  private static <S> String flatName(List<PCollectionImpl<S>> collections) {
+    StringBuilder name = new StringBuilder("union(");
+    String separator = "";
+    for (PCollectionImpl<S> collection : collections) {
+      name.append(separator).append(collection.getName());
+      separator = ",";
+    }
+    name.append(')');
+    return name.toString();
+  }
+
+  /**
+   * @param collections the collections to union; the first one supplies the pipeline
+   * @throws IllegalStateException if any collection belongs to a different pipeline instance
+   */
+  UnionCollection(List<PCollectionImpl<S>> collections) {
+    super(flatName(collections));
+    this.parents = ImmutableList.copyOf(collections);
+    this.pipeline = (MRPipeline) parents.get(0).getPipeline();
+    long total = 0;
+    for (PCollectionImpl<S> member : parents) {
+      if (member.getPipeline() != this.pipeline) {
+        throw new IllegalStateException("Cannot union PCollections from different Pipeline instances");
+      }
+      total += member.getSize();
+    }
+    this.size = total;
+  }
+
+  /** Returns the summed size of all parents, computed at construction time. */
+  @Override
+  protected long getSizeInternal() {
+    return size;
+  }
+
+  @Override
+  protected void acceptInternal(PCollectionImpl.Visitor visitor) {
+    visitor.visitUnionCollection(this);
+  }
+
+  /** Returns the type of the first parent. */
+  @Override
+  public PType<S> getPType() {
+    PCollectionImpl<S> first = parents.get(0);
+    return first.getPType();
+  }
+
+  @Override
+  public List<PCollectionImpl<?>> getParents() {
+    return ImmutableList.<PCollectionImpl<?>> copyOf(parents);
+  }
+
+  /** Always throws: a union has no function node of its own. */
+  @Override
+  public DoNode createDoNode() {
+    throw new UnsupportedOperationException("Unioned collection does not support DoNodes");
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/UnionTable.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/UnionTable.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/UnionTable.java
new file mode 100644
index 0000000..a369432
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/UnionTable.java
@@ -0,0 +1,92 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.collect;
+
+import java.util.List;
+
+import org.apache.crunch.Pair;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.impl.mr.plan.DoNode;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+
+/**
+ * A table formed by the union of several parent tables that all belong to the
+ * same pipeline. The table type is taken from the first parent and the size is
+ * the sum of the parents' sizes.
+ */
+public class UnionTable<K, V> extends PTableBase<K, V> {
+
+  private PTableType<K, V> ptype;
+  private List<PCollectionImpl<Pair<K, V>>> parents;
+  private long size;
+
+  /** Builds the name "union(a,b,...)" from the names of the given tables. */
+  private static <K, V> String flatName(List<PTableBase<K, V>> tables) {
+    StringBuilder name = new StringBuilder("union(");
+    String separator = "";
+    for (PTableBase<K, V> table : tables) {
+      name.append(separator).append(table.getName());
+      separator = ",";
+    }
+    name.append(')');
+    return name.toString();
+  }
+
+  /**
+   * @param tables the tables to union; the first one supplies the table type and pipeline
+   * @throws IllegalStateException if any table belongs to a different pipeline instance
+   */
+  public UnionTable(List<PTableBase<K, V>> tables) {
+    super(flatName(tables));
+    PTableBase<K, V> first = tables.get(0);
+    this.ptype = first.getPTableType();
+    this.pipeline = (MRPipeline) first.getPipeline();
+    this.parents = Lists.newArrayList();
+    long total = 0;
+    for (PTableBase<K, V> table : tables) {
+      if (table.getPipeline() != pipeline) {
+        throw new IllegalStateException("Cannot union PTables from different Pipeline instances");
+      }
+      this.parents.add(table);
+      total += table.getSize();
+    }
+    this.size = total;
+  }
+
+  /** Returns the summed size of all parents, computed at construction time. */
+  @Override
+  protected long getSizeInternal() {
+    return size;
+  }
+
+  @Override
+  public PTableType<K, V> getPTableType() {
+    return ptype;
+  }
+
+  @Override
+  public PType<Pair<K, V>> getPType() {
+    return ptype;
+  }
+
+  @Override
+  public List<PCollectionImpl<?>> getParents() {
+    return ImmutableList.<PCollectionImpl<?>> copyOf(parents);
+  }
+
+  /**
+   * Presents this table to the visitor as a freshly built
+   * {@link UnionCollection} over the same parents.
+   */
+  @Override
+  protected void acceptInternal(PCollectionImpl.Visitor visitor) {
+    UnionCollection<Pair<K, V>> asCollection = new UnionCollection<Pair<K, V>>(parents);
+    visitor.visitUnionCollection(asCollection);
+  }
+
+  /** Always throws: a union has no function node of its own. */
+  @Override
+  public DoNode createDoNode() {
+    throw new UnsupportedOperationException("Unioned table does not support do nodes");
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/emit/IntermediateEmitter.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/emit/IntermediateEmitter.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/emit/IntermediateEmitter.java
new file mode 100644
index 0000000..b6df98b
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/emit/IntermediateEmitter.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.emit;
+
+import java.util.List;
+
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.impl.mr.run.RTNode;
+import org.apache.crunch.types.PType;
+import org.apache.hadoop.conf.Configuration;
+
+import com.google.common.collect.ImmutableList;
+
+/**
+ * An {@link Emitter} that feeds every value produced by one {@link DoFn}
+ * directly into the downstream nodes that consume it.
+ *
+ * When there is more than one downstream node, each node is handed a detached
+ * value obtained from the output type instead of the emitted object itself.
+ */
+public class IntermediateEmitter implements Emitter<Object> {
+
+  private final List<RTNode> children;
+  private final Configuration conf;
+  private final PType<Object> outputPType;
+  private final boolean needDetachedValues;
+
+  /**
+   * @param outputPType type of the emitted values; it is initialized with {@code conf} here
+   * @param children downstream nodes, copied into an immutable list
+   * @param conf configuration used to initialize {@code outputPType}
+   */
+  public IntermediateEmitter(PType<Object> outputPType, List<RTNode> children, Configuration conf) {
+    this.outputPType = outputPType;
+    this.children = ImmutableList.copyOf(children);
+    this.conf = conf;
+
+    outputPType.initialize(conf);
+    this.needDetachedValues = this.children.size() > 1;
+  }
+
+  /** Passes the value (or a detached value, see class comment) to every child in order. */
+  @Override
+  public void emit(Object emitted) {
+    for (RTNode child : children) {
+      Object forChild = needDetachedValues ? outputPType.getDetachedValue(emitted) : emitted;
+      child.process(forChild);
+    }
+  }
+
+  @Override
+  public void flush() {
+    // No-op
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/emit/MultipleOutputEmitter.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/emit/MultipleOutputEmitter.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/emit/MultipleOutputEmitter.java
new file mode 100644
index 0000000..2e58fed
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/emit/MultipleOutputEmitter.java
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.emit;
+
+import java.io.IOException;
+
+import org.apache.crunch.CrunchRuntimeException;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.io.CrunchOutputs;
+import org.apache.crunch.types.Converter;
+
+/**
+ * An {@link Emitter} that converts each emitted value into a key/value pair
+ * and writes it to one named output of a {@link CrunchOutputs} instance.
+ */
+public class MultipleOutputEmitter<T, K, V> implements Emitter<T> {
+
+  private final Converter converter;
+  private final CrunchOutputs<K, V> outputs;
+  private final String outputName;
+
+  /**
+   * @param converter turns an emitted value into the output key and value
+   * @param outputs the outputs to write to
+   * @param outputName the name of the output every record is written under
+   */
+  public MultipleOutputEmitter(Converter converter, CrunchOutputs<K, V> outputs,
+      String outputName) {
+    this.converter = converter;
+    this.outputs = outputs;
+    this.outputName = outputName;
+  }
+
+  /**
+   * Writes the converted key and value to the named output.
+   *
+   * @throws CrunchRuntimeException wrapping any exception raised while converting or writing
+   */
+  @Override
+  public void emit(T emitted) {
+    try {
+      this.outputs.write(outputName,
+          (K) converter.outputKey(emitted),
+          (V) converter.outputValue(emitted));
+    } catch (Exception e) {
+      if (e instanceof InterruptedException) {
+        // Wrapping would otherwise hide the interruption from code further up
+        // the stack, so put the thread's interrupt status back first.
+        Thread.currentThread().interrupt();
+      }
+      throw new CrunchRuntimeException(e);
+    }
+  }
+
+  @Override
+  public void flush() {
+    // No-op
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/emit/OutputEmitter.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/emit/OutputEmitter.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/emit/OutputEmitter.java
new file mode 100644
index 0000000..bc3ae0d
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/emit/OutputEmitter.java
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.emit;
+
+import java.io.IOException;
+
+import org.apache.crunch.CrunchRuntimeException;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.types.Converter;
+import org.apache.hadoop.mapreduce.TaskInputOutputContext;
+
+/**
+ * An {@link Emitter} that converts each emitted value into a key/value pair
+ * and writes it to the task's {@link TaskInputOutputContext}.
+ */
+public class OutputEmitter<T, K, V> implements Emitter<T> {
+
+  private final Converter<K, V, Object, Object> converter;
+  private final TaskInputOutputContext<?, ?, K, V> context;
+
+  /**
+   * @param converter turns an emitted value into the output key and value
+   * @param context the task context the key/value pairs are written to
+   */
+  public OutputEmitter(Converter<K, V, Object, Object> converter, TaskInputOutputContext<?, ?, K, V> context) {
+    this.converter = converter;
+    this.context = context;
+  }
+
+  /**
+   * Writes the converted key and value to the task context.
+   *
+   * @throws CrunchRuntimeException wrapping an {@link IOException} or
+   *         {@link InterruptedException} raised by the write
+   */
+  @Override
+  public void emit(T emitted) {
+    try {
+      K key = converter.outputKey(emitted);
+      V value = converter.outputValue(emitted);
+      this.context.write(key, value);
+    } catch (IOException e) {
+      throw new CrunchRuntimeException(e);
+    } catch (InterruptedException e) {
+      // Wrapping would otherwise hide the interruption from code further up
+      // the stack, so put the thread's interrupt status back first.
+      Thread.currentThread().interrupt();
+      throw new CrunchRuntimeException(e);
+    }
+  }
+
+  @Override
+  public void flush() {
+    // No-op
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/exec/CappedExponentialCounter.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/exec/CappedExponentialCounter.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/exec/CappedExponentialCounter.java
new file mode 100644
index 0000000..d90f2e8
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/exec/CappedExponentialCounter.java
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.exec;
+
+/**
+ * Generates a series of numbers that doubles on every call until it reaches a cap.
+ *
+ * It is used for creating retry intervals. It is NOT thread-safe.
+ */
+public class CappedExponentialCounter {
+
+  private long current;
+  private final long limit;
+
+  /**
+   * @param start the first value returned by {@link #get()}; it is returned as-is,
+   *        even when it is larger than {@code limit}
+   * @param limit the cap applied to every value after the first
+   */
+  public CappedExponentialCounter(long start, long limit) {
+    this.current = start;
+    this.limit = limit;
+  }
+
+  /**
+   * Returns the current value and advances the counter to
+   * {@code min(2 * current, limit)}.
+   *
+   * @return the value of the counter before it was advanced
+   */
+  public long get() {
+    long result = current;
+    // Doubling a value above Long.MAX_VALUE / 2 would wrap around to a negative
+    // number, which Math.min would then wrongly prefer over the limit.
+    long doubled = current > Long.MAX_VALUE / 2 ? Long.MAX_VALUE : current * 2;
+    current = Math.min(doubled, limit);
+    return result;
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/exec/CrunchJobHooks.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/exec/CrunchJobHooks.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/exec/CrunchJobHooks.java
new file mode 100644
index 0000000..74bc9ac
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/exec/CrunchJobHooks.java
@@ -0,0 +1,153 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.exec;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.crunch.hadoop.mapreduce.lib.jobcontrol.CrunchControlledJob;
+import org.apache.crunch.impl.mr.plan.PlanningParameters;
+import org.apache.crunch.impl.mr.run.RuntimeParameters;
+import org.apache.crunch.io.FileNamingScheme;
+import org.apache.crunch.io.PathTarget;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+
+/**
+ * Hooks that run around a MapReduce job: {@link PrepareHook} before it is
+ * submitted and {@link CompletionHook} after it has produced its output.
+ */
+public final class CrunchJobHooks {
+
+  /**
+   * Matches a raw reducer output file name and captures its five-digit
+   * partition number. Compiled once because it is applied to every output file.
+   */
+  private static final Pattern REDUCE_OUTPUT_PATTERN = Pattern.compile(".*-r-(\\d{5})");
+
+  private CrunchJobHooks() {}
+
+  /** Creates missing input directories before job is submitted. */
+  public static final class PrepareHook implements CrunchControlledJob.Hook {
+    private final Job job;
+
+    public PrepareHook(Job job) {
+      this.job = job;
+    }
+
+    /**
+     * When {@code RuntimeParameters.CREATE_DIR} is enabled in the job
+     * configuration, creates every input path of the job that does not exist.
+     */
+    @Override
+    public void run() throws IOException {
+      Configuration conf = job.getConfiguration();
+      if (conf.getBoolean(RuntimeParameters.CREATE_DIR, false)) {
+        Path[] inputPaths = FileInputFormat.getInputPaths(job);
+        for (Path inputPath : inputPaths) {
+          FileSystem fs = inputPath.getFileSystem(conf);
+          if (!fs.exists(inputPath)) {
+            try {
+              fs.mkdirs(inputPath);
+            } catch (IOException ignored) {
+              // Deliberately best-effort: a directory that cannot be created is
+              // skipped here instead of aborting the hook.
+            }
+          }
+        }
+      }
+    }
+  }
+
+  /** Moves output files produced by the MapReduce job to the specified directories. */
+  public static final class CompletionHook implements CrunchControlledJob.Hook {
+    private final Job job;
+    private final Path workingPath;
+    private final Map<Integer, PathTarget> multiPaths;
+    private final boolean mapOnlyJob;
+
+    /**
+     * @param job the job whose output is to be moved
+     * @param workingPath the directory the job wrote its raw output to
+     * @param multiPaths output index mapped to the target that output belongs in
+     * @param mapOnlyJob whether map-output or reduce-output file names are generated
+     */
+    public CompletionHook(Job job, Path workingPath, Map<Integer, PathTarget> multiPaths, boolean mapOnlyJob) {
+      this.job = job;
+      this.workingPath = workingPath;
+      this.multiPaths = multiPaths;
+      this.mapOnlyJob = mapOnlyJob;
+    }
+
+    @Override
+    public void run() throws IOException {
+      handleMultiPaths();
+    }
+
+    /**
+     * For each registered output, globs the matching files in the working
+     * directory and renames them into the destination (or copies them, deleting
+     * the source, when the destination is on a different file system).
+     */
+    private synchronized void handleMultiPaths() throws IOException {
+      if (multiPaths.isEmpty()) {
+        return;
+      }
+      // Need to handle moving the data from the output directory of the
+      // job to the output locations specified in the paths.
+      Configuration conf = job.getConfiguration();
+      FileSystem srcFs = workingPath.getFileSystem(conf);
+      for (Map.Entry<Integer, PathTarget> entry : multiPaths.entrySet()) {
+        final int i = entry.getKey();
+        final Path dst = entry.getValue().getPath();
+        FileNamingScheme fileNamingScheme = entry.getValue().getFileNamingScheme();
+
+        Path src = new Path(workingPath, PlanningParameters.MULTI_OUTPUT_PREFIX + i + "-*");
+        Path[] srcs = FileUtil.stat2Paths(srcFs.globStatus(src), src);
+        FileSystem dstFs = dst.getFileSystem(conf);
+        if (!dstFs.exists(dst)) {
+          dstFs.mkdirs(dst);
+        }
+        boolean sameFs = isCompatible(srcFs, dst);
+        for (Path s : srcs) {
+          Path d = getDestFile(conf, s, dst, fileNamingScheme);
+          if (sameFs) {
+            srcFs.rename(s, d);
+          } else {
+            FileUtil.copy(srcFs, s, dstFs, d, true, true, conf);
+          }
+        }
+      }
+    }
+
+    /**
+     * Reports whether {@code fs} accepts {@code path}, judged by whether
+     * qualifying the path against it throws {@link IllegalArgumentException}.
+     */
+    private boolean isCompatible(FileSystem fs, Path path) {
+      try {
+        fs.makeQualified(path);
+        return true;
+      } catch (IllegalArgumentException e) {
+        return false;
+      }
+    }
+
+    /**
+     * Builds the destination path for one raw output file: the name comes from
+     * the naming scheme, and the Avro extension is carried over when the
+     * source file has it.
+     */
+    private Path getDestFile(Configuration conf, Path src, Path dir, FileNamingScheme fileNamingScheme)
+        throws IOException {
+      String outputFilename;
+      if (mapOnlyJob) {
+        outputFilename = fileNamingScheme.getMapOutputName(conf, dir);
+      } else {
+        outputFilename = fileNamingScheme.getReduceOutputName(conf, dir, extractPartitionNumber(src.getName()));
+      }
+      if (src.getName().endsWith(org.apache.avro.mapred.AvroOutputFormat.EXT)) {
+        outputFilename += org.apache.avro.mapred.AvroOutputFormat.EXT;
+      }
+      return new Path(dir, outputFilename);
+    }
+  }
+
+  /**
+   * Extract the partition number from a raw reducer output filename.
+   *
+   * @param reduceOutputFileName The raw reducer output file name
+   * @return The partition number encoded in the filename
+   * @throws IllegalArgumentException if the name contains no {@code -r-NNNNN} part
+   */
+  static int extractPartitionNumber(String reduceOutputFileName) {
+    Matcher matcher = REDUCE_OUTPUT_PATTERN.matcher(reduceOutputFileName);
+    if (matcher.find()) {
+      return Integer.parseInt(matcher.group(1), 10);
+    } else {
+      throw new IllegalArgumentException("Reducer output name '" + reduceOutputFileName + "' cannot be parsed");
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/exec/MRExecutor.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/exec/MRExecutor.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/exec/MRExecutor.java
new file mode 100644
index 0000000..4c7b7ea
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/exec/MRExecutor.java
@@ -0,0 +1,198 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.exec;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicReference;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.crunch.PipelineExecution;
+import org.apache.crunch.PipelineResult;
+import org.apache.crunch.SourceTarget;
+import org.apache.crunch.Target;
+import org.apache.crunch.hadoop.mapreduce.lib.jobcontrol.CrunchControlledJob;
+import org.apache.crunch.hadoop.mapreduce.lib.jobcontrol.CrunchJobControl;
+import org.apache.crunch.impl.mr.collect.PCollectionImpl;
+import org.apache.crunch.materialize.MaterializableIterable;
+import org.apache.hadoop.conf.Configuration;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Provides APIs for job control at runtime to clients.
+ *
+ * This class has a thread that submits jobs when they become ready, monitors
+ * the states of the running jobs, and updates the states of jobs based on the
+ * state changes of the jobs they depend on.
+ *
+ * It is thread-safe.
+ */
+public class MRExecutor implements PipelineExecution {
+
+ private static final Log LOG = LogFactory.getLog(MRExecutor.class);
+
+ private final CrunchJobControl control;
+ private final Map<PCollectionImpl<?>, Set<Target>> outputTargets;
+ private final Map<PCollectionImpl<?>, MaterializableIterable> toMaterialize;
+ // Counted down exactly once, in the finally block of monitorLoop().
+ private final CountDownLatch doneSignal = new CountDownLatch(1);
+ // Counted down by kill(); the monitor loop checks it on every iteration.
+ private final CountDownLatch killSignal = new CountDownLatch(1);
+ // Supplies the wait, in milliseconds, between two polls of the job states.
+ private final CappedExponentialCounter pollInterval;
+ private AtomicReference<Status> status = new AtomicReference<Status>(Status.READY);
+ // Assigned only when monitorLoop() completes without an IOException; null until then.
+ private PipelineResult result;
+ private Thread monitorThread;
+
+ private String planDotFile;
+
+ /**
+ * Creates the executor and its (not yet started) monitor thread.
+ *
+ * @param jarClass its {@code toString()} is passed to the job control as its name
+ * @param outputTargets the targets each output collection is written to
+ * @param toMaterialize the collections that have a materializable iterable attached
+ */
+ public MRExecutor(Class<?> jarClass, Map<PCollectionImpl<?>, Set<Target>> outputTargets,
+ Map<PCollectionImpl<?>, MaterializableIterable> toMaterialize) {
+ this.control = new CrunchJobControl(jarClass.toString());
+ this.outputTargets = outputTargets;
+ this.toMaterialize = toMaterialize;
+ this.monitorThread = new Thread(new Runnable() {
+ @Override
+ public void run() {
+ monitorLoop();
+ }
+ });
+ // Poll faster in local mode (50 ms growing to 1 s) than otherwise (500 ms growing to 10 s).
+ this.pollInterval = isLocalMode()
+ ? new CappedExponentialCounter(50, 1000)
+ : new CappedExponentialCounter(500, 10000);
+ }
+
+ /** Registers a job with the underlying job control. */
+ public void addJob(CrunchControlledJob job) {
+ this.control.addJob(job);
+ }
+
+ public void setPlanDotFile(String planDotFile) {
+ this.planDotFile = planDotFile;
+ }
+
+ /** Starts the monitor thread and returns this executor without waiting for it. */
+ public PipelineExecution execute() {
+ monitorThread.start();
+ return this;
+ }
+
+ /** Monitors running status. It is called in {@code MonitorThread}. */
+ private void monitorLoop() {
+ try {
+ // Keep polling until every job has finished or kill() has been called.
+ while (killSignal.getCount() > 0 && !control.allFinished()) {
+ control.pollJobStatusAndStartNewOnes();
+ // Doubles as the sleep between polls; returns early once kill() fires.
+ killSignal.await(pollInterval.get(), TimeUnit.MILLISECONDS);
+ }
+ // Called on both exits from the loop, whether killed or all jobs finished.
+ control.killAllRunningJobs();
+
+ List<CrunchControlledJob> failures = control.getFailedJobList();
+ if (!failures.isEmpty()) {
+ System.err.println(failures.size() + " job failure(s) occurred:");
+ for (CrunchControlledJob job : failures) {
+ System.err.println(job.getJobName() + "(" + job.getJobID() + "): " + job.getMessage());
+ }
+ }
+ // One stage result per successful job, carrying that job's counters.
+ List<PipelineResult.StageResult> stages = Lists.newArrayList();
+ for (CrunchControlledJob job : control.getSuccessfulJobList()) {
+ stages.add(new PipelineResult.StageResult(job.getJobName(), job.getJob().getCounters()));
+ }
+
+ // Record, for each output collection, a source it can be read back from.
+ for (PCollectionImpl<?> c : outputTargets.keySet()) {
+ if (toMaterialize.containsKey(c)) {
+ MaterializableIterable iter = toMaterialize.get(c);
+ if (iter.isSourceTarget()) {
+ iter.materialize();
+ c.materializeAt((SourceTarget) iter.getSource());
+ }
+ } else {
+ // Use the first target that is, or can be converted to, a SourceTarget.
+ boolean materialized = false;
+ for (Target t : outputTargets.get(c)) {
+ if (!materialized) {
+ if (t instanceof SourceTarget) {
+ c.materializeAt((SourceTarget) t);
+ materialized = true;
+ } else {
+ SourceTarget st = t.asSourceTarget(c.getPType());
+ if (st != null) {
+ c.materializeAt(st);
+ materialized = true;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Publish the result and the final status together under the same lock the getters use.
+ synchronized (this) {
+ result = new PipelineResult(stages);
+ if (killSignal.getCount() == 0) {
+ status.set(Status.KILLED);
+ } else {
+ status.set(result.succeeded() ? Status.SUCCEEDED : Status.FAILED);
+ }
+ }
+ } catch (InterruptedException e) {
+ throw new AssertionError(e); // Nobody should interrupt us.
+ } catch (IOException e) {
+ // Note: on this path the status becomes FAILED but no result is assigned.
+ LOG.error("Pipeline failed due to exception", e);
+ status.set(Status.FAILED);
+ } finally {
+ // Release waitFor()/waitUntilDone() callers however the loop ended.
+ doneSignal.countDown();
+ }
+ }
+
+ @Override
+ public String getPlanDotFile() {
+ return planDotFile;
+ }
+
+ /**
+ * Waits up to the given time for the monitor loop to finish. The outcome of
+ * the wait (finished or timed out) is not reported to the caller.
+ */
+ @Override
+ public void waitFor(long timeout, TimeUnit timeUnit) throws InterruptedException {
+ doneSignal.await(timeout, timeUnit);
+ }
+
+ /** Blocks until the monitor loop has finished. */
+ @Override
+ public void waitUntilDone() throws InterruptedException {
+ doneSignal.await();
+ }
+
+ @Override
+ public synchronized Status getStatus() {
+ return status.get();
+ }
+
+ /**
+ * Returns the pipeline result, or {@code null} if the monitor loop has not
+ * yet assigned one.
+ */
+ @Override
+ public synchronized PipelineResult getResult() {
+ return result;
+ }
+
+ /**
+ * Signals the monitor loop to stop; the loop then kills the running jobs.
+ * This method itself only counts down the latch and does not wait.
+ */
+ @Override
+ public void kill() throws InterruptedException {
+ killSignal.countDown();
+ }
+
+ /**
+ * Reports whether the job tracker address in a newly created default
+ * {@code Configuration} is "local" (also the fallback when neither key is set).
+ */
+ private static boolean isLocalMode() {
+ Configuration conf = new Configuration();
+ // Try to handle MapReduce version 0.20 or 0.22
+ String jobTrackerAddress = conf.get("mapreduce.jobtracker.address",
+ conf.get("mapred.job.tracker", "local"));
+ return "local".equals(jobTrackerAddress);
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/package-info.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/package-info.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/package-info.java
new file mode 100644
index 0000000..7e403c3
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/package-info.java
@@ -0,0 +1,22 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A Pipeline implementation that runs on Hadoop MapReduce.
+ */
+package org.apache.crunch.impl.mr;
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/DoNode.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/DoNode.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/DoNode.java
new file mode 100644
index 0000000..865369c
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/DoNode.java
@@ -0,0 +1,163 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.plan;
+
+import java.util.List;
+
+import org.apache.commons.lang.builder.HashCodeBuilder;
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Source;
+import org.apache.crunch.impl.mr.run.NodeContext;
+import org.apache.crunch.impl.mr.run.RTNode;
+import org.apache.crunch.types.Converter;
+import org.apache.crunch.types.PGroupedTableType;
+import org.apache.crunch.types.PType;
+import org.apache.hadoop.conf.Configuration;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+
+ /**
+  * A node in a tree of {@link DoFn} operations that can be converted into an
+  * {@link RTNode} via {@link #toRTNode}.
+  * <p>
+  * Instances are created through the static {@code create*Node} factories.
+  * Grouping and output nodes carry an output converter and an immutable empty
+  * child list; function and input nodes get a mutable child list.
+  */
+ public class DoNode {
+
+   private static final List<DoNode> NO_CHILDREN = ImmutableList.of();
+
+   private final DoFn fn;
+   private final String name;
+   private final PType<?> ptype;
+   private final List<DoNode> children;
+   private final Converter outputConverter;
+   private final Source<?> source;
+   private String outputName;
+
+   private DoNode(DoFn fn, String name, PType<?> ptype, List<DoNode> children, Converter outputConverter,
+       Source<?> source) {
+     this.fn = fn;
+     this.name = name;
+     this.ptype = ptype;
+     this.children = children;
+     this.outputConverter = outputConverter;
+     this.source = source;
+   }
+
+   /** Creates a fresh, mutable child list for nodes that may receive children. */
+   private static List<DoNode> allowsChildren() {
+     return Lists.newArrayList();
+   }
+
+   /** Creates a childless node that uses the grouping converter of {@code ptype}. */
+   public static <K, V> DoNode createGroupingNode(String name, PGroupedTableType<K, V> ptype) {
+     DoFn<?, ?> outputFn = ptype.getOutputMapFn();
+     Converter groupingConverter = ptype.getGroupingConverter();
+     return new DoNode(outputFn, name, ptype, NO_CHILDREN, groupingConverter, null);
+   }
+
+   /** Creates a childless node that uses the regular converter of {@code ptype}. */
+   public static <S> DoNode createOutputNode(String name, PType<S> ptype) {
+     Converter converter = ptype.getConverter();
+     DoFn<?, ?> outputFn = ptype.getOutputMapFn();
+     return new DoNode(outputFn, name, ptype, NO_CHILDREN, converter, null);
+   }
+
+   /** Creates a node wrapping {@code function}, with no source and no output converter. */
+   public static DoNode createFnNode(String name, DoFn<?, ?> function, PType<?> ptype) {
+     List<DoNode> kids = allowsChildren();
+     return new DoNode(function, name, ptype, kids, null, null);
+   }
+
+   /** Creates a node for {@code source}, named after the source's string form. */
+   public static <S> DoNode createInputNode(Source<S> source) {
+     PType<?> sourceType = source.getType();
+     DoFn<?, ?> inputFn = sourceType.getInputMapFn();
+     return new DoNode(inputFn, source.toString(), sourceType, allowsChildren(), null, source);
+   }
+
+   /** @return {@code true} if this node was created with a source */
+   public boolean isInputNode() {
+     return source != null;
+   }
+
+   /** @return {@code true} if this node was created with an output converter */
+   public boolean isOutputNode() {
+     return outputConverter != null;
+   }
+
+   public String getName() {
+     return name;
+   }
+
+   /** @return the live child list of this node (not a copy) */
+   public List<DoNode> getChildren() {
+     return children;
+   }
+
+   public Source<?> getSource() {
+     return source;
+   }
+
+   public PType<?> getPType() {
+     return ptype;
+   }
+
+   /**
+    * Appends {@code node} to the children unless that very instance is already
+    * present. The check is by reference identity, not by {@link #equals}.
+    *
+    * @param node the child to add
+    * @return this node, to allow chaining
+    */
+   public DoNode addChild(DoNode node) {
+     // TODO: This is sort of terrible, refactor the code to make this make more sense.
+     for (DoNode existing : children) {
+       if (existing == node) {
+         return this;
+       }
+     }
+     children.add(node);
+     return this;
+   }
+
+   /**
+    * Sets the output name of this node.
+    *
+    * @param outputName the name to record
+    * @throws IllegalStateException if this node has no output converter
+    */
+   public void setOutputName(String outputName) {
+     if (!isOutputNode()) {
+       throw new IllegalStateException("Cannot set output name w/o output converter: " + outputName);
+     }
+     this.outputName = outputName;
+   }
+
+   /**
+    * Configures this node's function with {@code conf} and converts this node
+    * and, recursively, all of its children into {@link RTNode}s.
+    *
+    * @param inputNode whether an input converter should be attached to the result
+    * @param conf the configuration handed to the function
+    * @param nodeContext selects which converter is used when {@code inputNode} is set
+    * @return the runtime node for this subtree
+    */
+   public RTNode toRTNode(boolean inputNode, Configuration conf, NodeContext nodeContext) {
+     List<RTNode> runtimeChildren = Lists.newArrayList();
+     fn.configure(conf);
+     for (DoNode child : children) {
+       // Children are never treated as input nodes.
+       RTNode runtimeChild = child.toRTNode(false, conf, nodeContext);
+       runtimeChildren.add(runtimeChild);
+     }
+
+     Converter inputConverter = resolveInputConverter(inputNode, nodeContext);
+     return new RTNode(fn, (PType<Object>) getPType(), name, runtimeChildren, inputConverter, outputConverter,
+         outputName);
+   }
+
+   /**
+    * Picks the input converter: none for non-input nodes, the plain converter
+    * in the MAP context, and the grouping converter otherwise.
+    */
+   private Converter resolveInputConverter(boolean inputNode, NodeContext nodeContext) {
+     if (!inputNode) {
+       return null;
+     }
+     if (nodeContext == NodeContext.MAP) {
+       return ptype.getConverter();
+     }
+     return ((PGroupedTableType<?, ?>) ptype).getGroupingConverter();
+   }
+
+   /**
+    * Two nodes are equal when their names and functions are equal and they
+    * share the very same source and output converter instances.
+    */
+   @Override
+   public boolean equals(Object other) {
+     if (this == other) {
+       return true;
+     }
+     if (!(other instanceof DoNode)) {
+       return false;
+     }
+     DoNode that = (DoNode) other;
+     if (!name.equals(that.name)) {
+       return false;
+     }
+     if (!fn.equals(that.fn)) {
+       return false;
+     }
+     return source == that.source && outputConverter == that.outputConverter;
+   }
+
+   @Override
+   public int hashCode() {
+     return new HashCodeBuilder()
+         .append(name)
+         .append(fn)
+         .append(source)
+         .append(outputConverter)
+         .toHashCode();
+   }
+ }
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/DotfileWriter.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/DotfileWriter.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/DotfileWriter.java
new file mode 100644
index 0000000..46d8c53
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/DotfileWriter.java
@@ -0,0 +1,238 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.plan;
+
+import java.util.List;
+import java.util.Set;
+
+import org.apache.crunch.Pair;
+import org.apache.crunch.Target;
+import org.apache.crunch.impl.mr.collect.InputCollection;
+import org.apache.crunch.impl.mr.collect.PCollectionImpl;
+import org.apache.crunch.impl.mr.collect.PGroupedTableImpl;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+
+/**
+ * Writes <a href="http://www.graphviz.org">Graphviz</a> dot files to illustrate
+ * the topology of Crunch pipelines.
+ */
+public class DotfileWriter {
+
+ /** The types of tasks within a MapReduce job. */
+ enum MRTaskType { MAP, REDUCE };
+
+ private Set<JobPrototype> jobPrototypes = Sets.newHashSet();
+ private HashMultimap<Pair<JobPrototype, MRTaskType>, String> jobNodeDeclarations = HashMultimap.create();
+ private Set<String> globalNodeDeclarations = Sets.newHashSet();
+ private Set<String> nodePathChains = Sets.newHashSet();
+
+ /**
+ * Format the declaration of a node based on a PCollection.
+ *
+ * @param pcollectionImpl PCollection for which a node will be declared
+ * @param jobPrototype The job containing the PCollection
+ * @return The node declaration
+ */
+ String formatPCollectionNodeDeclaration(PCollectionImpl<?> pcollectionImpl, JobPrototype jobPrototype) {
+ String shape = "box";
+ if (pcollectionImpl instanceof InputCollection) {
+ shape = "folder";
+ }
+ return String.format("%s [label=\"%s\" shape=%s];", formatPCollection(pcollectionImpl, jobPrototype), pcollectionImpl.getName(),
+ shape);
+ }
+
+ /**
+ * Format a Target as a node declaration.
+ *
+ * @param target A Target used within a MapReduce pipeline
+ * @return The global node declaration for the Target
+ */
+ String formatTargetNodeDeclaration(Target target) {
+ return String.format("\"%s\" [label=\"%s\" shape=folder];", target.toString(), target.toString());
+ }
+
+ /**
+ * Format a PCollectionImpl into a format to be used for dot files.
+ *
+ * @param pcollectionImpl The PCollectionImpl to be formatted
+ * @param jobPrototype The job containing the PCollection
+ * @return The dot file formatted representation of the PCollectionImpl
+ */
+ String formatPCollection(PCollectionImpl<?> pcollectionImpl, JobPrototype jobPrototype) {
+ if (pcollectionImpl instanceof InputCollection) {
+ InputCollection<?> inputCollection = (InputCollection<?>) pcollectionImpl;
+ return String.format("\"%s\"", inputCollection.getSource());
+ }
+ return String.format("\"%s@%d@%d\"", pcollectionImpl.getName(), pcollectionImpl.hashCode(), jobPrototype.hashCode());
+ }
+
+ /**
+ * Format a collection of node strings into dot file syntax.
+ *
+ * @param nodeCollection Collection of chained node strings
+ * @return The dot-formatted chain of nodes
+ */
+ String formatNodeCollection(List<String> nodeCollection) {
+ return String.format("%s;", Joiner.on(" -> ").join(nodeCollection));
+ }
+
+ /**
+ * Format a NodePath in dot file syntax.
+ *
+ * @param nodePath The node path to be formatted
+ * @param jobPrototype The job containing the NodePath
+ * @return The dot file representation of the node path
+ */
+ List<String> formatNodePath(NodePath nodePath, JobPrototype jobPrototype) {
+ List<String> formattedNodePaths = Lists.newArrayList();
+
+ List<PCollectionImpl<?>> pcollections = Lists.newArrayList(nodePath);
+ for (int collectionIndex = 1; collectionIndex < pcollections.size(); collectionIndex++){
+ String fromNode = formatPCollection(pcollections.get(collectionIndex - 1), jobPrototype);
+ String toNode = formatPCollection(pcollections.get(collectionIndex), jobPrototype);
+ formattedNodePaths.add(formatNodeCollection(Lists.newArrayList(fromNode, toNode)));
+ }
+ return formattedNodePaths;
+ }
+
+ /**
+ * Add a NodePath to be formatted as a list of node declarations within a
+ * single job.
+ *
+ * @param jobPrototype The job containing the node path
+ * @param nodePath The node path to be formatted
+ */
+ void addNodePathDeclarations(JobPrototype jobPrototype, NodePath nodePath) {
+ boolean groupingEncountered = false;
+ for (PCollectionImpl<?> pcollectionImpl : nodePath) {
+ if (pcollectionImpl instanceof InputCollection) {
+ globalNodeDeclarations.add(formatPCollectionNodeDeclaration(pcollectionImpl, jobPrototype));
+ } else {
+ if (!groupingEncountered){
+ groupingEncountered = (pcollectionImpl instanceof PGroupedTableImpl);
+ }
+
+ MRTaskType taskType = groupingEncountered ? MRTaskType.REDUCE : MRTaskType.MAP;
+ jobNodeDeclarations.put(Pair.of(jobPrototype, taskType), formatPCollectionNodeDeclaration(pcollectionImpl, jobPrototype));
+ }
+ }
+ }
+
+ /**
+ * Add the chaining of a NodePath to the graph.
+ *
+ * @param nodePath The path to be formatted as a node chain in the dot file
+ * @param jobPrototype The job containing the NodePath
+ */
+ void addNodePathChain(NodePath nodePath, JobPrototype jobPrototype) {
+ for (String nodePathChain : formatNodePath(nodePath, jobPrototype)){
+ this.nodePathChains.add(nodePathChain);
+ }
+ }
+
+ /**
+ * Get the graph attributes for a task-specific subgraph.
+ *
+ * @param taskType The type of task in the subgraph
+ * @return Graph attributes
+ */
+ String getTaskGraphAttributes(MRTaskType taskType) {
+ if (taskType == MRTaskType.MAP) {
+ return "label = Map; color = blue;";
+ } else {
+ return "label = Reduce; color = red;";
+ }
+ }
+
+ /**
+ * Add the contents of a {@link JobPrototype} to the graph describing a
+ * pipeline.
+ *
+ * @param jobPrototype A JobPrototype representing a portion of a MapReduce
+ * pipeline
+ */
+ public void addJobPrototype(JobPrototype jobPrototype) {
+ jobPrototypes.add(jobPrototype);
+ if (!jobPrototype.isMapOnly()) {
+ for (NodePath nodePath : jobPrototype.getMapNodePaths()) {
+ addNodePathDeclarations(jobPrototype, nodePath);
+ addNodePathChain(nodePath, jobPrototype);
+ }
+ }
+
+ HashMultimap<Target, NodePath> targetsToNodePaths = jobPrototype.getTargetsToNodePaths();
+ for (Target target : targetsToNodePaths.keySet()) {
+ globalNodeDeclarations.add(formatTargetNodeDeclaration(target));
+ for (NodePath nodePath : targetsToNodePaths.get(target)) {
+ addNodePathDeclarations(jobPrototype, nodePath);
+ addNodePathChain(nodePath, jobPrototype);
+ nodePathChains.add(formatNodeCollection(Lists.newArrayList(formatPCollection(nodePath.descendingIterator()
+ .next(), jobPrototype), String.format("\"%s\"", target.toString()))));
+ }
+ }
+ }
+
+ /**
+ * Build up the full dot file containing the description of a MapReduce
+ * pipeline.
+ *
+ * @return Graphviz dot file contents
+ */
+ public String buildDotfile() {
+ StringBuilder stringBuilder = new StringBuilder();
+ stringBuilder.append("digraph G {\n");
+ int clusterIndex = 0;
+
+ for (String globalDeclaration : globalNodeDeclarations) {
+ stringBuilder.append(String.format(" %s\n", globalDeclaration));
+ }
+
+ for (JobPrototype jobPrototype : jobPrototypes){
+ StringBuilder jobProtoStringBuilder = new StringBuilder();
+ jobProtoStringBuilder.append(String.format(" subgraph cluster%d {\n", clusterIndex++));
+ for (MRTaskType taskType : MRTaskType.values()){
+ Pair<JobPrototype,MRTaskType> jobTaskKey = Pair.of(jobPrototype, taskType);
+ if (jobNodeDeclarations.containsKey(jobTaskKey)){
+ jobProtoStringBuilder.append(String.format(" subgraph cluster%d {\n", clusterIndex++));
+ jobProtoStringBuilder.append(String.format(" %s\n", getTaskGraphAttributes(taskType)));
+ for (String declarationEntry : jobNodeDeclarations.get(jobTaskKey)){
+ jobProtoStringBuilder.append(String.format(" %s\n", declarationEntry));
+ }
+ jobProtoStringBuilder.append(" }\n");
+ }
+ }
+ jobProtoStringBuilder.append(" }\n");
+ stringBuilder.append(jobProtoStringBuilder.toString());
+ }
+
+ for (String nodePathChain : nodePathChains) {
+ stringBuilder.append(String.format(" %s\n", nodePathChain));
+ }
+
+ stringBuilder.append("}\n");
+ return stringBuilder.toString();
+ }
+
+
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/Edge.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/Edge.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/Edge.java
new file mode 100644
index 0000000..1e59df0
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/Edge.java
@@ -0,0 +1,125 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.plan;
+
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.commons.lang.builder.HashCodeBuilder;
+import org.apache.commons.lang.builder.ReflectionToStringBuilder;
+import org.apache.commons.lang.builder.ToStringStyle;
+import org.apache.crunch.impl.mr.collect.PCollectionImpl;
+import org.apache.crunch.impl.mr.collect.PGroupedTableImpl;
+
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+
+/**
+ *
+ */
+class Edge {
+ private final Vertex head;
+ private final Vertex tail;
+ private final Set<NodePath> paths;
+
+ public Edge(Vertex head, Vertex tail) {
+ this.head = head;
+ this.tail = tail;
+ this.paths = Sets.newHashSet();
+ }
+
+ public Vertex getHead() {
+ return head;
+ }
+
+ public Vertex getTail() {
+ return tail;
+ }
+
+ public void addNodePath(NodePath path) {
+ this.paths.add(path);
+ }
+
+ public void addAllNodePaths(Collection<NodePath> paths) {
+ this.paths.addAll(paths);
+ }
+
+ public Set<NodePath> getNodePaths() {
+ return paths;
+ }
+
+ public PCollectionImpl getSplit() {
+ List<Iterator<PCollectionImpl<?>>> iters = Lists.newArrayList();
+ for (NodePath nodePath : paths) {
+ Iterator<PCollectionImpl<?>> iter = nodePath.iterator();
+ iter.next(); // prime this past the initial NGroupedTableImpl
+ iters.add(iter);
+ }
+
+ // Find the lowest point w/the lowest cost to be the split point for
+ // all of the dependent paths.
+ boolean end = false;
+ int splitIndex = -1;
+ while (!end) {
+ splitIndex++;
+ PCollectionImpl<?> current = null;
+ for (Iterator<PCollectionImpl<?>> iter : iters) {
+ if (iter.hasNext()) {
+ PCollectionImpl<?> next = iter.next();
+ if (next instanceof PGroupedTableImpl) {
+ end = true;
+ break;
+ } else if (current == null) {
+ current = next;
+ } else if (current != next) {
+ end = true;
+ break;
+ }
+ } else {
+ end = true;
+ break;
+ }
+ }
+ }
+ // TODO: Add costing calcs here.
+
+ return Iterables.getFirst(paths, null).get(splitIndex);
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (other == null || !(other instanceof Edge)) {
+ return false;
+ }
+ Edge e = (Edge) other;
+ return head.equals(e.head) && tail.equals(e.tail) && paths.equals(e.paths);
+ }
+
+ @Override
+ public int hashCode() {
+ return new HashCodeBuilder().append(head).append(tail).toHashCode();
+ }
+
+ @Override
+ public String toString() {
+ return ReflectionToStringBuilder.toString(this, ToStringStyle.SHORT_PREFIX_STYLE);
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/Graph.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/Graph.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/Graph.java
new file mode 100644
index 0000000..ce0a847
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/Graph.java
@@ -0,0 +1,133 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.plan;
+
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.crunch.Pair;
+import org.apache.crunch.impl.mr.collect.PCollectionImpl;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+
+/**
+ *
+ */
+class Graph implements Iterable<Vertex> {
+
+ private final Map<PCollectionImpl, Vertex> vertices;
+ private final Map<Pair<Vertex, Vertex>, Edge> edges;
+ private final Map<Vertex, List<Vertex>> dependencies;
+
+ public Graph() {
+ this.vertices = Maps.newHashMap();
+ this.edges = Maps.newHashMap();
+ this.dependencies = Maps.newHashMap();
+ }
+
+ public Vertex getVertexAt(PCollectionImpl impl) {
+ return vertices.get(impl);
+ }
+
+ public Vertex addVertex(PCollectionImpl impl, boolean output) {
+ if (vertices.containsKey(impl)) {
+ Vertex v = vertices.get(impl);
+ if (output) {
+ v.setOutput();
+ }
+ return v;
+ }
+ Vertex v = new Vertex(impl);
+ vertices.put(impl, v);
+ if (output) {
+ v.setOutput();
+ }
+ return v;
+ }
+
+ public Edge getEdge(Vertex head, Vertex tail) {
+ Pair<Vertex, Vertex> p = Pair.of(head, tail);
+ if (edges.containsKey(p)) {
+ return edges.get(p);
+ }
+
+ Edge e = new Edge(head, tail);
+ edges.put(p, e);
+ tail.addIncoming(e);
+ head.addOutgoing(e);
+ return e;
+ }
+
+ @Override
+ public Iterator<Vertex> iterator() {
+ return Sets.newHashSet(vertices.values()).iterator();
+ }
+
+ public Set<Edge> getAllEdges() {
+ return Sets.newHashSet(edges.values());
+ }
+
+ public void markDependency(Vertex child, Vertex parent) {
+ List<Vertex> parents = dependencies.get(child);
+ if (parents == null) {
+ parents = Lists.newArrayList();
+ dependencies.put(child, parents);
+ }
+ parents.add(parent);
+ }
+
+ public List<Vertex> getParents(Vertex child) {
+ if (dependencies.containsKey(child)) {
+ return dependencies.get(child);
+ }
+ return ImmutableList.of();
+ }
+
+ public List<List<Vertex>> connectedComponents() {
+ List<List<Vertex>> components = Lists.newArrayList();
+ Set<Vertex> unassigned = Sets.newHashSet(vertices.values());
+ while (!unassigned.isEmpty()) {
+ Vertex base = unassigned.iterator().next();
+ List<Vertex> component = Lists.newArrayList();
+ component.add(base);
+ unassigned.remove(base);
+ Set<Vertex> working = Sets.newHashSet(base.getAllNeighbors());
+ while (!working.isEmpty()) {
+ Vertex n = working.iterator().next();
+ working.remove(n);
+ if (unassigned.contains(n)) {
+ component.add(n);
+ unassigned.remove(n);
+ for (Vertex n2 : n.getAllNeighbors()) {
+ if (unassigned.contains(n2)) {
+ working.add(n2);
+ }
+ }
+ }
+ }
+ components.add(component);
+ }
+
+ return components;
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/GraphBuilder.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/GraphBuilder.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/GraphBuilder.java
new file mode 100644
index 0000000..925c39a
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/GraphBuilder.java
@@ -0,0 +1,92 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.plan;
+
+import org.apache.crunch.impl.mr.collect.DoCollectionImpl;
+import org.apache.crunch.impl.mr.collect.DoTableImpl;
+import org.apache.crunch.impl.mr.collect.InputCollection;
+import org.apache.crunch.impl.mr.collect.PCollectionImpl;
+import org.apache.crunch.impl.mr.collect.PGroupedTableImpl;
+import org.apache.crunch.impl.mr.collect.UnionCollection;
+
+/**
+ *
+ */
+class GraphBuilder implements PCollectionImpl.Visitor {
+
+ private Graph graph = new Graph();
+ private Vertex workingVertex;
+ private NodePath workingPath;
+
+ public Graph getGraph() {
+ return graph;
+ }
+
+ public void visitOutput(PCollectionImpl<?> output) {
+ workingVertex = graph.addVertex(output, true);
+ workingPath = new NodePath();
+ output.accept(this);
+ }
+
+ @Override
+ public void visitInputCollection(InputCollection<?> collection) {
+ Vertex v = graph.addVertex(collection, false);
+ graph.getEdge(v, workingVertex).addNodePath(workingPath.close(collection));
+ }
+
+ @Override
+ public void visitUnionCollection(UnionCollection<?> collection) {
+ Vertex baseVertex = workingVertex;
+ NodePath basePath = workingPath;
+ for (PCollectionImpl<?> parent : collection.getParents()) {
+ workingPath = new NodePath(basePath);
+ workingVertex = baseVertex;
+ processParent(parent);
+ }
+ }
+
+ @Override
+ public void visitDoFnCollection(DoCollectionImpl<?> collection) {
+ workingPath.push(collection);
+ processParent(collection.getOnlyParent());
+ }
+
+ @Override
+ public void visitDoTable(DoTableImpl<?, ?> collection) {
+ workingPath.push(collection);
+ processParent(collection.getOnlyParent());
+ }
+
+ @Override
+ public void visitGroupedTable(PGroupedTableImpl<?, ?> collection) {
+ Vertex v = graph.addVertex(collection, false);
+ graph.getEdge(v, workingVertex).addNodePath(workingPath.close(collection));
+ workingVertex = v;
+ workingPath = new NodePath(collection);
+ processParent(collection.getOnlyParent());
+ }
+
+ private void processParent(PCollectionImpl<?> parent) {
+ Vertex v = graph.getVertexAt(parent);
+ if (v == null) {
+ parent.accept(this);
+ } else {
+ graph.getEdge(v, workingVertex).addNodePath(workingPath.close(parent));
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/JobNameBuilder.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/JobNameBuilder.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/JobNameBuilder.java
new file mode 100644
index 0000000..9ad7300
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/JobNameBuilder.java
@@ -0,0 +1,79 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.plan;
+
+import java.util.List;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.Lists;
+
+/**
+ * Visitor that traverses the {@code DoNode} instances in a job and builds a
+ * String that identifies the stages of the pipeline that belong to this job.
+ */
+class JobNameBuilder {
+
+ private static final Joiner JOINER = Joiner.on("+");
+ private static final Joiner CHILD_JOINER = Joiner.on("/");
+
+ private String pipelineName;
+ List<String> rootStack = Lists.newArrayList();
+
+ public JobNameBuilder(final String pipelineName) {
+ this.pipelineName = pipelineName;
+ }
+
+ public void visit(DoNode node) {
+ visit(node, rootStack);
+ }
+
+ public void visit(List<DoNode> nodes) {
+ visit(nodes, rootStack);
+ }
+
+ private void visit(List<DoNode> nodes, List<String> stack) {
+ if (nodes.size() == 1) {
+ visit(nodes.get(0), stack);
+ } else {
+ List<String> childStack = Lists.newArrayList();
+ for (int i = 0; i < nodes.size(); i++) {
+ DoNode node = nodes.get(i);
+ List<String> subStack = Lists.newArrayList();
+ visit(node, subStack);
+ if (!subStack.isEmpty()) {
+ childStack.add("[" + JOINER.join(subStack) + "]");
+ }
+ }
+ if (!childStack.isEmpty()) {
+ stack.add("[" + CHILD_JOINER.join(childStack) + "]");
+ }
+ }
+ }
+
+ private void visit(DoNode node, List<String> stack) {
+ String name = node.getName();
+ if (!name.isEmpty()) {
+ stack.add(node.getName());
+ }
+ visit(node.getChildren(), stack);
+ }
+
+ public String build() {
+ return String.format("%s: %s", pipelineName, JOINER.join(rootStack));
+ }
+}
[41/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/fn/AggregatorsIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/fn/AggregatorsIT.java b/crunch-core/src/it/java/org/apache/crunch/fn/AggregatorsIT.java
new file mode 100644
index 0000000..c9584a1
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/fn/AggregatorsIT.java
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.fn;
+
+import static org.apache.crunch.fn.Aggregators.SUM_INTS;
+import static org.apache.crunch.fn.Aggregators.pairAggregator;
+import static org.apache.crunch.types.writable.Writables.ints;
+import static org.apache.crunch.types.writable.Writables.pairs;
+import static org.apache.crunch.types.writable.Writables.strings;
+import static org.apache.crunch.types.writable.Writables.tableOf;
+import static org.hamcrest.Matchers.is;
+import static org.junit.Assert.assertThat;
+
+import java.util.Collection;
+import java.util.Map;
+
+import org.apache.crunch.MapFn;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.test.Tests;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+
+@RunWith(Parameterized.class)
+public class AggregatorsIT {
+ private Pipeline pipeline; // pipeline under test; set from the constructor argument
+
+ @Parameters
+ public static Collection<Object[]> params() {
+ return Tests.pipelinesParams(AggregatorsIT.class); // parameter sets are built by the Tests helper
+ }
+
+ public AggregatorsIT(Pipeline pipeline) {
+ this.pipeline = pipeline;
+ }
+
+ @Test
+ public void testPairAggregator() {
+ PCollection<String> lines = pipeline.readTextFile(Tests.pathTo(this, "ints.txt"));
+ // Parse each line into (key, (int, int)) with SplitLine.
+ PTable<String, Pair<Integer, Integer>> table = lines.parallelDo(new SplitLine(),
+ tableOf(strings(), pairs(ints(), ints())));
+ // Group by key and combine the value pairs with one SUM_INTS aggregator per component.
+ PTable<String, Pair<Integer, Integer>> combinedTable = table.groupByKey().combineValues(
+ pairAggregator(SUM_INTS(), SUM_INTS()));
+ // Read the combined table back as a map so it can be asserted on directly.
+ Map<String, Pair<Integer, Integer>> result = combinedTable.asMap().getValue();
+ // Exactly two keys are expected, "a" and "b", each with its combined pair.
+ assertThat(result.size(), is(2));
+ assertThat(result.get("a"), is(Pair.of(9, 12)));
+ assertThat(result.get("b"), is(Pair.of(11, 13)));
+ }
+
+ private static final class SplitLine extends MapFn<String, Pair<String, Pair<Integer, Integer>>> { // line parser
+ @Override
+ public Pair<String, Pair<Integer, Integer>> map(String input) {
+ String[] split = input.split("\t"); // fields are tab-separated: key, first int, second int
+ return Pair.of(split[0],
+ Pair.of(Integer.parseInt(split[1]), Integer.parseInt(split[2])));
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/impl/mem/MemPipelineFileWritingIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/impl/mem/MemPipelineFileWritingIT.java b/crunch-core/src/it/java/org/apache/crunch/impl/mem/MemPipelineFileWritingIT.java
new file mode 100644
index 0000000..976a43e
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/impl/mem/MemPipelineFileWritingIT.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mem;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.util.List;
+
+import org.apache.crunch.PCollection;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.base.Charsets;
+import com.google.common.collect.ImmutableList;
+import com.google.common.io.Files;
+
+public class MemPipelineFileWritingIT {
+ @Rule
+ public TemporaryPath baseTmpDir = TemporaryPaths.create(); // JUnit rule that supplies the output location
+
+ @Test
+ public void testMemPipelineFileWriter() throws Exception {
+ File tmpDir = baseTmpDir.getFile("mempipe");
+ Pipeline p = MemPipeline.getInstance();
+ PCollection<String> lines = MemPipeline.collectionOf("hello", "world");
+ p.writeTextFile(lines, tmpDir.toString()); // the target path is listed as a directory below
+ p.done();
+ assertTrue(tmpDir.exists());
+ File[] files = tmpDir.listFiles();
+ assertTrue(files != null && files.length > 0);
+ for (File f : files) { // NOTE(review): if every entry is a dot-file, no content is checked
+ if (!f.getName().startsWith(".")) { // dot-files are skipped; only the other files are compared
+ List<String> txt = Files.readLines(f, Charsets.UTF_8);
+ assertEquals(ImmutableList.of("hello", "world"), txt); // same elements, same order as the input
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/impl/mr/collect/UnionCollectionIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/impl/mr/collect/UnionCollectionIT.java b/crunch-core/src/it/java/org/apache/crunch/impl/mr/collect/UnionCollectionIT.java
new file mode 100644
index 0000000..f9f73b2
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/impl/mr/collect/UnionCollectionIT.java
@@ -0,0 +1,154 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.collect;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PTableKeyValueIT;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.impl.mem.MemPipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.io.At;
+import org.apache.crunch.io.To;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.crunch.types.avro.AvroTypeFamily;
+import org.apache.crunch.types.avro.Avros;
+import org.apache.crunch.types.writable.WritableTypeFamily;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+import com.google.common.collect.Lists;
+
+@RunWith(value = Parameterized.class)
+public class UnionCollectionIT {
+ @Rule
+ public TemporaryPath tmpDir = TemporaryPaths.create();
+
+ private static final Log LOG = LogFactory.getLog(UnionCollectionIT.class);
+
+ private final PTypeFamily typeFamily;
+ private Pipeline pipeline;
+ private PCollection<String> union;
+ // Sorted values expected from the union; compared against sorted results and never modified.
+ private static final ArrayList<String> EXPECTED = Lists.newArrayList("a", "a", "b", "c", "c", "d", "e");
+
+ private final Class<?> pipelineClass; // jar class handed to MRPipeline; null selects MemPipeline
+
+ @Before
+ @SuppressWarnings("unchecked")
+ public void setUp() throws IOException {
+ String inputFile1 = tmpDir.copyResourceFileName("set1.txt");
+ String inputFile2 = tmpDir.copyResourceFileName("set2.txt");
+ if (pipelineClass == null) { // a null class parameter means "run in memory"
+ pipeline = MemPipeline.getInstance();
+ } else {
+ pipeline = new MRPipeline(pipelineClass, tmpDir.getDefaultConfiguration());
+ }
+ PCollection<String> firstCollection = pipeline.read(At.textFile(inputFile1, typeFamily.strings()));
+ PCollection<String> secondCollection = pipeline.read(At.textFile(inputFile2, typeFamily.strings()));
+
+ LOG.info("Test fixture: [" + pipeline.getClass().getSimpleName() + " : " + typeFamily.getClass().getSimpleName()
+ + "] First: " + Lists.newArrayList(firstCollection.materialize().iterator()) + ", Second: "
+ + Lists.newArrayList(secondCollection.materialize().iterator()));
+
+ union = secondCollection.union(firstCollection);
+ }
+
+ @Parameters
+ public static Collection<Object[]> data() throws IOException {
+ Object[][] data = new Object[][] { { WritableTypeFamily.getInstance(), PTableKeyValueIT.class },
+ { WritableTypeFamily.getInstance(), null }, { AvroTypeFamily.getInstance(), PTableKeyValueIT.class },
+ { AvroTypeFamily.getInstance(), null } };
+ return Arrays.asList(data); // each row is (type family, pipeline jar class or null)
+ }
+
+ public UnionCollectionIT(PTypeFamily typeFamily, Class<?> pipelineClass) {
+ this.typeFamily = typeFamily;
+ this.pipelineClass = pipelineClass;
+ }
+
+ @Test
+ public void unionMaterializeShouldNotThrowNPE() throws Exception {
+ checkMaterialized(union.materialize());
+ checkMaterialized(pipeline.materialize(union));
+ }
+
+ private void checkMaterialized(Iterable<String> materialized) {
+ List<String> materializedValues = Lists.newArrayList(materialized.iterator());
+ Collections.sort(materializedValues); // sort first so the comparison ignores output order
+ LOG.info("Materialized union: " + materializedValues);
+ assertEquals(EXPECTED, materializedValues);
+ }
+
+ @Test
+ public void unionWriteShouldNotThrowNPE() throws IOException {
+ String outputPath1 = tmpDir.getFileName("output1");
+ String outputPath2 = tmpDir.getFileName("output2");
+ String outputPath3 = tmpDir.getFileName("output3");
+
+ if (typeFamily == AvroTypeFamily.getInstance()) {
+ union.write(To.avroFile(outputPath1));
+ pipeline.write(union, To.avroFile(outputPath2));
+
+ pipeline.run();
+
+ checkFileContents(outputPath1);
+ checkFileContents(outputPath2);
+
+ } else {
+
+ union.write(To.textFile(outputPath1));
+ pipeline.write(union, To.textFile(outputPath2));
+ pipeline.writeTextFile(union, outputPath3);
+
+ pipeline.run();
+
+ checkFileContents(outputPath1);
+ checkFileContents(outputPath2);
+ checkFileContents(outputPath3);
+ }
+ }
+
+ private void checkFileContents(String filePath) throws IOException {
+ // Avro is read back only for the Avro type family on an MRPipeline; every other case reads text.
+ List<String> fileContentValues = (typeFamily != AvroTypeFamily.getInstance() || !(pipeline instanceof MRPipeline)) ? Lists
+ .newArrayList(pipeline.read(At.textFile(filePath, typeFamily.strings())).materialize().iterator()) : Lists
+ .newArrayList(pipeline.read(At.avroFile(filePath, Avros.strings())).materialize().iterator());
+
+ Collections.sort(fileContentValues);
+
+ LOG.info("Saved Union: " + fileContentValues);
+ assertEquals(EXPECTED, fileContentValues);
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/io/CompositePathIterableIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/io/CompositePathIterableIT.java b/crunch-core/src/it/java/org/apache/crunch/io/CompositePathIterableIT.java
new file mode 100644
index 0000000..08d226d
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/io/CompositePathIterableIT.java
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.crunch.io.text.TextFileReaderFactory;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.writable.Writables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocalFileSystem;
+import org.apache.hadoop.fs.Path;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+
+public class CompositePathIterableIT {
+ @Rule
+ public TemporaryPath tmpDir = TemporaryPaths.create(); // JUnit rule that supplies the test paths
+
+ @Test
+ public void testCreate_FilePresent() throws IOException {
+ String inputFilePath = tmpDir.copyResourceFileName("set1.txt");
+ Configuration conf = new Configuration();
+ LocalFileSystem local = FileSystem.getLocal(conf);
+
+ Iterable<String> iterable = CompositePathIterable.create(local, new Path(inputFilePath),
+ new TextFileReaderFactory<String>(Writables.strings()));
+
+ assertEquals(Lists.newArrayList("b", "c", "a", "e"), Lists.newArrayList(iterable)); // order-sensitive
+
+ }
+
+ @Test
+ public void testCreate_DirectoryPresentButNoFiles() throws IOException {
+ Path emptyInputDir = tmpDir.getRootPath(); // nothing has been copied into it in this test
+
+ Configuration conf = new Configuration();
+ LocalFileSystem local = FileSystem.getLocal(conf);
+
+ Iterable<String> iterable = CompositePathIterable.create(local, emptyInputDir,
+ new TextFileReaderFactory<String>(Writables.strings()));
+
+ assertTrue(Lists.newArrayList(iterable).isEmpty()); // iterating must yield no elements
+ }
+
+ @Test(expected = IOException.class)
+ public void testCreate_DirectoryNotPresent() throws IOException {
+ File nonExistentDir = tmpDir.getFile("not-there");
+
+ // Sanity check: the path must not exist before create() is called
+ assertFalse(nonExistentDir.exists());
+
+ Configuration conf = new Configuration();
+ LocalFileSystem local = FileSystem.getLocal(conf);
+
+ CompositePathIterable.create(local, new Path(nonExistentDir.getAbsolutePath()), new TextFileReaderFactory<String>(
+ Writables.strings())); // the test passes only if an IOException propagates out of the method
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/io/NLineInputIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/io/NLineInputIT.java b/crunch-core/src/it/java/org/apache/crunch/io/NLineInputIT.java
new file mode 100644
index 0000000..52b8ff5
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/io/NLineInputIT.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.io.text.NLineFileSource;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.writable.Writables;
+import org.apache.crunch.types.avro.Avros;
+import org.apache.hadoop.conf.Configuration;
+import org.junit.Rule;
+import org.junit.Test;
+
+public class NLineInputIT {
+
+ @Rule
+ public TemporaryPath tmpDir = TemporaryPaths.create();
+
+ @Test
+ public void testNLine() throws Exception {
+ String urlsInputPath = tmpDir.copyResourceFileName("urls.txt");
+ Configuration conf = new Configuration(tmpDir.getDefaultConfiguration());
+ conf.setInt("io.sort.mb", 10);
+ Pipeline pipeline = new MRPipeline(NLineInputIT.class, conf);
+ PCollection<String> urls = pipeline.read(new NLineFileSource<String>(urlsInputPath,
+ Writables.strings(), 2)); // presumably 2 = lines per split; confirm against NLineFileSource
+ assertEquals(Integer.valueOf(2), // boxed explicitly so the (Object, Object) overload is chosen
+ urls.parallelDo(new LineCountFn(), Avros.ints()).max().getValue());
+ }
+
+ private static class LineCountFn extends DoFn<String, Integer> { // emits one count per DoFn instance
+
+ private int lineCount = 0;
+
+ @Override
+ public void initialize() {
+ this.lineCount = 0; // start every run from zero
+ }
+
+ @Override
+ public void process(String input, Emitter<Integer> emitter) {
+ lineCount++; // only counts; nothing is emitted per input
+ }
+
+ @Override
+ public void cleanup(Emitter<Integer> emitter) {
+ emitter.emit(lineCount); // the single output is the number of inputs seen
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/io/TextFileTableIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/io/TextFileTableIT.java b/crunch-core/src/it/java/org/apache/crunch/io/TextFileTableIT.java
new file mode 100644
index 0000000..bddc0b5
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/io/TextFileTableIT.java
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io;
+
+import static org.apache.crunch.types.writable.Writables.*;
+import static org.junit.Assert.assertEquals;
+
+import java.util.Set;
+
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.io.text.TextFileTableSource;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.ImmutableSet;
+
+/**
+ * Integration test that reads a text file as a {@code PTable} through {@code TextFileTableSource}.
+ */
+public class TextFileTableIT {
+
+ @Rule
+ public TemporaryPath tmpDir = TemporaryPaths.create();
+
+ @Test
+ public void testTextFileTable() throws Exception {
+ String urlsFile = tmpDir.copyResourceFileName("urls.txt");
+ Pipeline pipeline = new MRPipeline(TextFileTableIT.class, tmpDir.getDefaultConfiguration());
+ PTable<String, String> urls = pipeline.read(
+ new TextFileTableSource<String, String>(urlsFile, tableOf(strings(), strings())));
+ Set<Pair<String, Long>> cnts = ImmutableSet.copyOf(urls.keys().count().materialize()); // (key, count) pairs
+ assertEquals(ImmutableSet.of(Pair.of("www.A.com", 4L), Pair.of("www.B.com", 2L),
+ Pair.of("www.C.com", 1L), Pair.of("www.D.com", 1L), Pair.of("www.E.com", 1L),
+ Pair.of("www.F.com", 2L)), cnts); // compared as sets, so result order does not matter
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/io/avro/AvroFileSourceTargetIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/io/avro/AvroFileSourceTargetIT.java b/crunch-core/src/it/java/org/apache/crunch/io/avro/AvroFileSourceTargetIT.java
new file mode 100644
index 0000000..671b920
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/io/avro/AvroFileSourceTargetIT.java
@@ -0,0 +1,140 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.avro;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.List;
+
+import org.apache.avro.Schema;
+import org.apache.avro.file.DataFileWriter;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericData.Record;
+import org.apache.avro.generic.GenericDatumWriter;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.avro.reflect.ReflectData;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.io.At;
+import org.apache.crunch.test.Person;
+import org.apache.crunch.test.StringWrapper;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.avro.Avros;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+
+@SuppressWarnings("serial")
+public class AvroFileSourceTargetIT implements Serializable {
+
+ private transient File avroFile; // location of the Avro input file; assigned in setUp()
+ @Rule
+ public transient TemporaryPath tmpDir = TemporaryPaths.create();
+
+ @Before
+ public void setUp() throws IOException {
+ avroFile = tmpDir.getFile("test.avro");
+ }
+
+ private void populateGenericFile(List<GenericRecord> genericRecords, Schema schema) throws IOException {
+ FileOutputStream outputStream = new FileOutputStream(this.avroFile);
+ try {
+ GenericDatumWriter<GenericRecord> genericDatumWriter = new GenericDatumWriter<GenericRecord>(schema);
+ DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(genericDatumWriter);
+ dataFileWriter.create(schema, outputStream);
+ for (GenericRecord record : genericRecords) {
+ dataFileWriter.append(record);
+ }
+ dataFileWriter.close();
+ } finally {
+ // Release the file handle even when creating the writer or appending a record fails.
+ outputStream.close();
+ }
+ }
+
+ @Test
+ public void testSpecific() throws IOException {
+ GenericRecord savedRecord = new GenericData.Record(Person.SCHEMA$);
+ savedRecord.put("name", "John Doe");
+ savedRecord.put("age", 42);
+ savedRecord.put("siblingnames", Lists.newArrayList("Jimmy", "Jane"));
+ populateGenericFile(Lists.newArrayList(savedRecord), Person.SCHEMA$);
+
+ Pipeline pipeline = new MRPipeline(AvroFileSourceTargetIT.class, tmpDir.getDefaultConfiguration());
+ PCollection<Person> genericCollection = pipeline.read(At.avroFile(avroFile.getAbsolutePath(),
+ Avros.records(Person.class)));
+
+ List<Person> personList = Lists.newArrayList(genericCollection.materialize());
+
+ Person expectedPerson = new Person(); // same field values as the generic record written above
+ expectedPerson.name = "John Doe";
+ expectedPerson.age = 42;
+
+ List<CharSequence> siblingNames = Lists.newArrayList();
+ siblingNames.add("Jimmy");
+ siblingNames.add("Jane");
+ expectedPerson.siblingnames = siblingNames;
+
+ assertEquals(Lists.newArrayList(expectedPerson), Lists.newArrayList(personList));
+ }
+
+ @Test
+ public void testGeneric() throws IOException {
+ String genericSchemaJson = Person.SCHEMA$.toString().replace("Person", "GenericPerson"); // renamed copy
+ Schema genericPersonSchema = new Schema.Parser().parse(genericSchemaJson);
+ GenericRecord savedRecord = new GenericData.Record(genericPersonSchema);
+ savedRecord.put("name", "John Doe");
+ savedRecord.put("age", 42);
+ savedRecord.put("siblingnames", Lists.newArrayList("Jimmy", "Jane"));
+ populateGenericFile(Lists.newArrayList(savedRecord), genericPersonSchema);
+
+ Pipeline pipeline = new MRPipeline(AvroFileSourceTargetIT.class, tmpDir.getDefaultConfiguration());
+ PCollection<Record> genericCollection = pipeline.read(At.avroFile(avroFile.getAbsolutePath(),
+ Avros.generics(genericPersonSchema)));
+
+ List<Record> recordList = Lists.newArrayList(genericCollection.materialize());
+
+ assertEquals(Lists.newArrayList(savedRecord), Lists.newArrayList(recordList)); // round trip must be lossless
+ }
+
+ @Test
+ public void testReflect() throws IOException {
+ Schema pojoPersonSchema = ReflectData.get().getSchema(StringWrapper.class);
+ GenericRecord savedRecord = new GenericData.Record(pojoPersonSchema);
+ savedRecord.put("value", "stringvalue");
+ populateGenericFile(Lists.newArrayList(savedRecord), pojoPersonSchema);
+
+ Pipeline pipeline = new MRPipeline(AvroFileSourceTargetIT.class, tmpDir.getDefaultConfiguration());
+ PCollection<StringWrapper> stringValueCollection = pipeline.read(At.avroFile(avroFile.getAbsolutePath(),
+ Avros.reflects(StringWrapper.class)));
+
+ List<StringWrapper> recordList = Lists.newArrayList(stringValueCollection.materialize());
+
+ assertEquals(1, recordList.size());
+ StringWrapper stringWrapper = recordList.get(0);
+ assertEquals("stringvalue", stringWrapper.getValue());
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/io/avro/AvroPipelineIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/io/avro/AvroPipelineIT.java b/crunch-core/src/it/java/org/apache/crunch/io/avro/AvroPipelineIT.java
new file mode 100644
index 0000000..29bf4f5
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/io/avro/AvroPipelineIT.java
@@ -0,0 +1,95 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package org.apache.crunch.io.avro;
+
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.List;
+
+import org.apache.avro.Schema;
+import org.apache.avro.file.DataFileWriter;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericDatumWriter;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.commons.io.FileUtils;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.Target;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.io.At;
+import org.apache.crunch.io.To;
+import org.apache.crunch.test.Person;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.avro.Avros;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+
+public class AvroPipelineIT implements Serializable {
+
+ private transient File avroFile; // location of the Avro input file; assigned in setUp()
+ @Rule
+ public transient TemporaryPath tmpDir = TemporaryPaths.create();
+
+ @Before
+ public void setUp() throws IOException {
+ avroFile = tmpDir.getFile("test.avro");
+ }
+
+ private void populateGenericFile(List<GenericRecord> genericRecords, Schema schema) throws IOException {
+ FileOutputStream outputStream = new FileOutputStream(this.avroFile);
+ try {
+ GenericDatumWriter<GenericRecord> genericDatumWriter = new GenericDatumWriter<GenericRecord>(schema);
+ DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(genericDatumWriter);
+ dataFileWriter.create(schema, outputStream);
+ for (GenericRecord record : genericRecords) {
+ dataFileWriter.append(record);
+ }
+ dataFileWriter.close();
+ } finally {
+ // Release the file handle even when creating the writer or appending a record fails.
+ outputStream.close();
+ }
+ }
+
+ @Test
+ public void toTextShouldWriteAvroDataAsDatumText() throws Exception {
+ GenericRecord savedRecord = new GenericData.Record(Person.SCHEMA$);
+ savedRecord.put("name", "John Doe");
+ savedRecord.put("age", 42);
+ savedRecord.put("siblingnames", Lists.newArrayList("Jimmy", "Jane"));
+ populateGenericFile(Lists.newArrayList(savedRecord), Person.SCHEMA$);
+
+ Pipeline pipeline = new MRPipeline(AvroPipelineIT.class, tmpDir.getDefaultConfiguration());
+ PCollection<Person> genericCollection = pipeline.read(At.avroFile(avroFile.getAbsolutePath(),
+ Avros.records(Person.class)));
+ File outputFile = tmpDir.getFile("output");
+ Target textFile = To.textFile(outputFile.getAbsolutePath());
+ pipeline.write(genericCollection, textFile);
+ pipeline.run();
+ Person person = genericCollection.materialize().iterator().next();
+ String outputString = FileUtils.readFileToString(new File(outputFile, "part-m-00000"));
+ assertTrue(outputString.contains(person.toString())); // the text output must contain the record's toString()
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/io/avro/AvroReflectIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/io/avro/AvroReflectIT.java b/crunch-core/src/it/java/org/apache/crunch/io/avro/AvroReflectIT.java
new file mode 100644
index 0000000..7a90517
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/io/avro/AvroReflectIT.java
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.avro;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.crunch.MapFn;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.lib.Aggregate;
+import org.apache.crunch.test.Person;
+import org.apache.crunch.test.StringWrapper;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.avro.Avros;
+import org.junit.Assume;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Integration tests that run MapReduce pipelines over Avro reflect-based types,
+ * alone and combined with an Avro specific record type.
+ */
+public class AvroReflectIT implements Serializable {
+
+  @Rule
+  public transient TemporaryPath tmpDir = TemporaryPaths.create();
+
+  /**
+   * Wraps every line of set1.txt in a {@link StringWrapper} via a reflect-based PType and
+   * checks the materialized result.
+   */
+  @Test
+  public void testReflection() throws IOException {
+    Pipeline pipeline = new MRPipeline(AvroReflectIT.class, tmpDir.getDefaultConfiguration());
+
+    MapFn<String, StringWrapper> wrapLine = new MapFn<String, StringWrapper>() {
+
+      @Override
+      public StringWrapper map(String input) {
+        StringWrapper wrapped = new StringWrapper();
+        wrapped.setValue(input);
+        return wrapped;
+      }
+    };
+
+    PCollection<StringWrapper> wrappedLines = pipeline
+        .readTextFile(tmpDir.copyResourceFileName("set1.txt"))
+        .parallelDo(wrapLine, Avros.reflects(StringWrapper.class));
+
+    // Copy the results into a list before finishing the pipeline.
+    List<StringWrapper> actual = Lists.newArrayList(wrappedLines.materialize());
+
+    pipeline.done();
+
+    List<StringWrapper> expected = Lists.newArrayList(new StringWrapper("b"), new StringWrapper("c"),
+        new StringWrapper("a"), new StringWrapper("e"));
+    assertEquals(expected, actual);
+  }
+
+  // Verify that running with a combination of reflect and specific schema
+  // doesn't crash
+  @Test
+  public void testCombinationOfReflectionAndSpecific() throws IOException {
+    Assume.assumeTrue(Avros.CAN_COMBINE_SPECIFIC_AND_REFLECT_SCHEMAS);
+    Pipeline pipeline = new MRPipeline(AvroReflectIT.class, tmpDir.getDefaultConfiguration());
+
+    // Pair a reflect-based StringWrapper with a specific Person record for each input line.
+    MapFn<String, Pair<StringWrapper, Person>> toHybridPair = new MapFn<String, Pair<StringWrapper, Person>>() {
+
+      @Override
+      public Pair<StringWrapper, Person> map(String input) {
+        Person person = new Person();
+        person.name = input;
+        person.age = 42;
+        person.siblingnames = Lists.<CharSequence> newArrayList(input);
+
+        return Pair.of(new StringWrapper(input), person);
+      }
+    };
+    PCollection<Pair<StringWrapper, Person>> hybridPairs = pipeline
+        .readTextFile(tmpDir.copyResourceFileName("set1.txt"))
+        .parallelDo(toHybridPair, Avros.pairs(Avros.reflects(StringWrapper.class), Avros.records(Person.class)));
+
+    // Reduce each counted pair to (wrapped string value, count).
+    MapFn<Pair<Pair<StringWrapper, Person>, Long>, Pair<String, Long>> toValueAndCount =
+        new MapFn<Pair<Pair<StringWrapper, Person>, Long>, Pair<String, Long>>() {
+
+          @Override
+          public Pair<String, Long> map(Pair<Pair<StringWrapper, Person>, Long> input) {
+            return Pair.of(input.first().first().getValue(), input.second());
+          }
+        };
+    PCollection<Pair<String, Long>> counts = Aggregate.count(hybridPairs)
+        .parallelDo(toValueAndCount, Avros.pairs(Avros.strings(), Avros.longs()));
+
+    List<Pair<String, Long>> actual = Lists.newArrayList(counts.materialize());
+    // Sort the actual pairs so the comparison does not depend on output order.
+    Collections.sort(actual);
+
+    List<Pair<String, Long>> expected = Lists.newArrayList(Pair.of("a", 1L), Pair.of("b", 1L), Pair.of("c", 1L),
+        Pair.of("e", 1L));
+
+    assertEquals(expected, actual);
+    pipeline.done();
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/io/avro/AvroWritableIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/io/avro/AvroWritableIT.java b/crunch-core/src/it/java/org/apache/crunch/io/avro/AvroWritableIT.java
new file mode 100644
index 0000000..cbb7fde
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/io/avro/AvroWritableIT.java
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.avro;
+
+import static org.apache.crunch.types.avro.Avros.ints;
+import static org.apache.crunch.types.avro.Avros.tableOf;
+import static org.apache.crunch.types.avro.Avros.writables;
+import static org.junit.Assert.assertEquals;
+
+import java.io.Serializable;
+import java.util.Map;
+
+import org.apache.crunch.CombineFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.hadoop.io.DoubleWritable;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.Maps;
+
+/**
+ * Verify handling of both a ByteBuffer and byte array as input from an Avro job (depending
+ * on the version of Avro being used).
+ */
+public class AvroWritableIT implements Serializable {
+
+  @Rule
+  public transient TemporaryPath tmpDir = TemporaryPaths.create();
+
+  /**
+   * Keys every customer line by its length, carries the length as a {@link DoubleWritable}
+   * value through an Avro table type, sums the values per key, and checks the resulting map.
+   */
+  @Test
+  public void testAvroBasedWritablePipeline() throws Exception {
+    String customersInputPath = tmpDir.copyResourceFileName("customers.txt");
+    Pipeline pipeline = new MRPipeline(AvroWritableIT.class, tmpDir.getDefaultConfiguration());
+    pipeline.enableDebug();
+    PCollection<String> customerLines = pipeline.readTextFile(customersInputPath);
+
+    // Emits (line length, line length as a DoubleWritable).
+    MapFn<String, Pair<Integer, DoubleWritable>> lengthFn = new MapFn<String, Pair<Integer, DoubleWritable>>() {
+      @Override
+      public Pair<Integer, DoubleWritable> map(String input) {
+        int lineLength = input.length();
+        return Pair.of(lineLength, new DoubleWritable(lineLength));
+      }
+    };
+
+    // Adds up all values that share a key.
+    CombineFn<Integer, DoubleWritable> sumFn = new CombineFn<Integer, DoubleWritable>() {
+      @Override
+      public void process(Pair<Integer, Iterable<DoubleWritable>> input,
+          Emitter<Pair<Integer, DoubleWritable>> emitter) {
+        double total = 0.0;
+        for (DoubleWritable value : input.second()) {
+          total += value.get();
+        }
+        emitter.emit(Pair.of(input.first(), new DoubleWritable(total)));
+      }
+    };
+
+    Map<Integer, DoubleWritable> outputMap = customerLines
+        .parallelDo(lengthFn, tableOf(ints(), writables(DoubleWritable.class)))
+        .groupByKey()
+        .combineValues(sumFn)
+        .materializeToMap();
+
+    Map<Integer, DoubleWritable> expectedMap = Maps.newHashMap();
+    expectedMap.put(12, new DoubleWritable(24.0));
+    expectedMap.put(16, new DoubleWritable(16.0));
+    expectedMap.put(17, new DoubleWritable(17.0));
+
+    assertEquals(expectedMap, outputMap);
+
+    pipeline.done();
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/lib/AggregateIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/AggregateIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/AggregateIT.java
new file mode 100644
index 0000000..56ee3ac
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/AggregateIT.java
@@ -0,0 +1,231 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib;
+
+import static org.apache.crunch.types.writable.Writables.strings;
+import static org.apache.crunch.types.writable.Writables.tableOf;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Map;
+
+import org.apache.crunch.MapFn;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.impl.mem.MemPipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.test.Employee;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.crunch.types.avro.AvroTypeFamily;
+import org.apache.crunch.types.avro.Avros;
+import org.apache.crunch.types.writable.WritableTypeFamily;
+import org.apache.crunch.types.writable.Writables;
+import org.apache.hadoop.io.Text;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+
+/**
+ * Integration tests for the {@link Aggregate} helpers (min/max, collectValues, top),
+ * run against MapReduce and in-memory pipelines with both Writable and Avro types.
+ */
+public class AggregateIT {
+  @Rule
+  public TemporaryPath tmpDir = TemporaryPaths.create();
+
+  @Test
+  public void testWritables() throws Exception {
+    Pipeline pipeline = new MRPipeline(AggregateIT.class, tmpDir.getDefaultConfiguration());
+    String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
+    PCollection<String> shakes = pipeline.readTextFile(shakesInputPath);
+    runMinMax(shakes, WritableTypeFamily.getInstance());
+    pipeline.done();
+  }
+
+  @Test
+  public void testAvro() throws Exception {
+    Pipeline pipeline = new MRPipeline(AggregateIT.class, tmpDir.getDefaultConfiguration());
+    String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
+    PCollection<String> shakes = pipeline.readTextFile(shakesInputPath);
+    runMinMax(shakes, AvroTypeFamily.getInstance());
+    pipeline.done();
+  }
+
+  @Test
+  public void testInMemoryAvro() throws Exception {
+    PCollection<String> someText = MemPipeline.collectionOf("first line", "second line", "third line");
+    runMinMax(someText, AvroTypeFamily.getInstance());
+  }
+
+  /**
+   * Checks that the maximum line length equals the negated minimum of the negated line lengths.
+   *
+   * @param shakes input lines
+   * @param family type family used to obtain the int PType for the intermediate collections
+   */
+  public static void runMinMax(PCollection<String> shakes, PTypeFamily family) throws Exception {
+    PCollection<Integer> lengths = shakes.parallelDo(new MapFn<String, Integer>() {
+      @Override
+      public Integer map(String input) {
+        return input.length();
+      }
+    }, family.ints());
+    PCollection<Integer> negLengths = lengths.parallelDo(new MapFn<Integer, Integer>() {
+      @Override
+      public Integer map(Integer input) {
+        return -input;
+      }
+    }, family.ints());
+    Integer maxLengths = Aggregate.max(lengths).getValue();
+    Integer minLengths = Aggregate.min(negLengths).getValue();
+    assertTrue(maxLengths != null);
+    assertTrue(minLengths != null);
+    assertEquals(maxLengths.intValue(), -minLengths.intValue());
+  }
+
+  /** Splits a line on whitespace and emits its first two tokens as a (key, value) pair. */
+  private static class SplitFn extends MapFn<String, Pair<String, String>> {
+    @Override
+    public Pair<String, String> map(String input) {
+      String[] p = input.split("\\s+");
+      return Pair.of(p[0], p[1]);
+    }
+  }
+
+  @Test
+  public void testCollectUrls() throws Exception {
+    Pipeline p = new MRPipeline(AggregateIT.class, tmpDir.getDefaultConfiguration());
+    String urlsInputPath = tmpDir.copyResourceFileName("urls.txt");
+    PTable<String, Collection<String>> urls = Aggregate.collectValues(p.readTextFile(urlsInputPath).parallelDo(
+        new SplitFn(), tableOf(strings(), strings())));
+    for (Pair<String, Collection<String>> e : urls.materialize()) {
+      String key = e.first();
+      // Keys not listed below are expected to have no collected values.
+      int expectedSize = 0;
+      if ("www.A.com".equals(key)) {
+        expectedSize = 4;
+      } else if ("www.B.com".equals(key) || "www.F.com".equals(key)) {
+        expectedSize = 2;
+      } else if ("www.C.com".equals(key) || "www.D.com".equals(key) || "www.E.com".equals(key)) {
+        expectedSize = 1;
+      }
+      assertEquals("Checking key = " + key, expectedSize, e.second().size());
+    }
+    // Finish the pipeline exactly once, after every materialized entry has been checked
+    // (previously this ran inside the loop: once per entry and never for an empty result).
+    p.done();
+  }
+
+  @Test
+  public void testTopN() throws Exception {
+    PTableType<String, Integer> ptype = Avros.tableOf(Avros.strings(), Avros.ints());
+    PTable<String, Integer> counts = MemPipeline.typedTableOf(ptype, "foo", 12, "bar", 17, "baz", 29);
+
+    PTable<String, Integer> top2 = Aggregate.top(counts, 2, true);
+    assertEquals(ImmutableList.of(Pair.of("baz", 29), Pair.of("bar", 17)), top2.materialize());
+
+    PTable<String, Integer> bottom2 = Aggregate.top(counts, 2, false);
+    assertEquals(ImmutableList.of(Pair.of("foo", 12), Pair.of("bar", 17)), bottom2.materialize());
+  }
+
+  @Test
+  public void testCollectValues_Writables() throws IOException {
+    Pipeline pipeline = new MRPipeline(AggregateIT.class, tmpDir.getDefaultConfiguration());
+    Map<Integer, Collection<Text>> collectionMap = pipeline.readTextFile(tmpDir.copyResourceFileName("set2.txt"))
+        .parallelDo(new MapStringToTextPair(), Writables.tableOf(Writables.ints(), Writables.writables(Text.class)))
+        .collectValues().materializeToMap();
+
+    assertEquals(1, collectionMap.size());
+
+    assertTrue(collectionMap.get(1).containsAll(Lists.newArrayList(new Text("c"), new Text("d"), new Text("a"))));
+
+    // Finish the pipeline like the other MapReduce-backed tests in this class.
+    pipeline.done();
+  }
+
+  @Test
+  public void testCollectValues_Avro() throws IOException {
+
+    MapStringToEmployeePair mapFn = new MapStringToEmployeePair();
+    Pipeline pipeline = new MRPipeline(AggregateIT.class, tmpDir.getDefaultConfiguration());
+    Map<Integer, Collection<Employee>> collectionMap = pipeline.readTextFile(tmpDir.copyResourceFileName("set2.txt"))
+        .parallelDo(mapFn, Avros.tableOf(Avros.ints(), Avros.records(Employee.class))).collectValues()
+        .materializeToMap();
+
+    assertEquals(1, collectionMap.size());
+
+    // Build the expected employees with the same function the pipeline used.
+    Employee empC = mapFn.map("c").second();
+    Employee empD = mapFn.map("d").second();
+    Employee empA = mapFn.map("a").second();
+
+    assertTrue(collectionMap.get(1).containsAll(Lists.newArrayList(empC, empD, empA)));
+
+    // Finish the pipeline like the other MapReduce-backed tests in this class.
+    pipeline.done();
+  }
+
+  /** Maps every input string to the pair (1, Text(input)), so all values share one key. */
+  private static class MapStringToTextPair extends MapFn<String, Pair<Integer, Text>> {
+    @Override
+    public Pair<Integer, Text> map(String input) {
+      return Pair.of(1, new Text(input));
+    }
+  }
+
+  /** Maps every input string to the pair (1, Employee named after the input). */
+  private static class MapStringToEmployeePair extends MapFn<String, Pair<Integer, Employee>> {
+    @Override
+    public Pair<Integer, Employee> map(String input) {
+      Employee emp = new Employee();
+      emp.name = input;
+      emp.salary = 0;
+      emp.department = "";
+      return Pair.of(1, emp);
+    }
+  }
+
+  /** Simple mutable bean holding a single string value; equality is based on that value. */
+  public static class PojoText {
+    private String value;
+
+    public PojoText() {
+      this("");
+    }
+
+    public PojoText(String value) {
+      this.value = value;
+    }
+
+    public String getValue() {
+      return value;
+    }
+
+    public void setValue(String value) {
+      this.value = value;
+    }
+
+    @Override
+    public String toString() {
+      return String.format("PojoText<%s>", this.value);
+    }
+
+    /** Hash consistent with {@link #equals(Object)}: derived from the wrapped value only. */
+    @Override
+    public int hashCode() {
+      return value == null ? 0 : value.hashCode();
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+      if (this == obj)
+        return true;
+      if (obj == null)
+        return false;
+      if (getClass() != obj.getClass())
+        return false;
+      PojoText other = (PojoText) obj;
+      if (value == null) {
+        if (other.value != null)
+          return false;
+      } else if (!value.equals(other.value))
+        return false;
+      return true;
+    }
+
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/lib/AvroTypeSortIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/AvroTypeSortIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/AvroTypeSortIT.java
new file mode 100644
index 0000000..a832a5d
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/AvroTypeSortIT.java
@@ -0,0 +1,145 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib;
+
+import static junit.framework.Assert.assertEquals;
+import static org.apache.crunch.types.avro.Avros.ints;
+import static org.apache.crunch.types.avro.Avros.records;
+import static org.apache.crunch.types.avro.Avros.strings;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.List;
+
+import org.apache.avro.file.DataFileWriter;
+import org.apache.avro.specific.SpecificDatumWriter;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.io.At;
+import org.apache.crunch.test.Person;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Test sorting Avro types by selected inner field
+ */
+public class AvroTypeSortIT implements Serializable {
+
+  private static final long serialVersionUID = 1344118240353796561L;
+
+  private transient File avroFile;
+  @Rule
+  public transient TemporaryPath tmpDir = TemporaryPaths.create();
+
+  @Before
+  public void setUp() throws IOException {
+    avroFile = File.createTempFile("avrotest", ".avro");
+  }
+
+  @After
+  public void tearDown() {
+    avroFile.delete();
+  }
+
+  /**
+   * Writes three people to an Avro file, then keys them by name and by age in turn and
+   * checks that the values come back ordered by the extracted key.
+   */
+  @Test
+  public void testSortAvroTypesBySelectedFields() throws Exception {
+
+    MRPipeline pipeline = new MRPipeline(AvroTypeSortIT.class, tmpDir.getDefaultConfiguration());
+
+    // Names and ages run in opposite directions so the two orderings are distinguishable.
+    Person ccc10 = createPerson("CCC", 10);
+    Person bbb20 = createPerson("BBB", 20);
+    Person aaa30 = createPerson("AAA", 30);
+
+    writeAvroFile(Lists.newArrayList(ccc10, bbb20, aaa30), avroFile);
+
+    PCollection<Person> unsorted = pipeline.read(At.avroFile(avroFile.getAbsolutePath(), records(Person.class)));
+
+    // Sort by Name
+    MapFn<Person, String> nameExtractor = new MapFn<Person, String>() {
+
+      @Override
+      public String map(Person input) {
+        return input.name.toString();
+      }
+    };
+
+    PCollection<Person> sortedByName = unsorted.by(nameExtractor, strings()).groupByKey().ungroup().values();
+
+    List<Person> sortedByNameList = Lists.newArrayList(sortedByName.materialize());
+
+    assertEquals(3, sortedByNameList.size());
+    assertEquals(aaa30, sortedByNameList.get(0));
+    assertEquals(bbb20, sortedByNameList.get(1));
+    assertEquals(ccc10, sortedByNameList.get(2));
+
+    // Sort by Age
+
+    MapFn<Person, Integer> ageExtractor = new MapFn<Person, Integer>() {
+
+      @Override
+      public Integer map(Person input) {
+        return input.age;
+      }
+    };
+
+    PCollection<Person> sortedByAge = unsorted.by(ageExtractor, ints()).groupByKey().ungroup().values();
+
+    List<Person> sortedByAgeList = Lists.newArrayList(sortedByAge.materialize());
+
+    assertEquals(3, sortedByAgeList.size());
+    assertEquals(ccc10, sortedByAgeList.get(0));
+    assertEquals(bbb20, sortedByAgeList.get(1));
+    assertEquals(aaa30, sortedByAgeList.get(2));
+
+    pipeline.done();
+  }
+
+  /**
+   * Writes the given people to {@code avroFile} as an Avro data file using the Person schema.
+   *
+   * @param people records to append, in list order
+   * @param avroFile destination file
+   * @throws IOException if the file cannot be opened or written
+   */
+  private void writeAvroFile(List<Person> people, File avroFile) throws IOException {
+
+    FileOutputStream outputStream = new FileOutputStream(avroFile);
+    try {
+      SpecificDatumWriter<Person> writer = new SpecificDatumWriter<Person>(Person.class);
+
+      DataFileWriter<Person> dataFileWriter = new DataFileWriter<Person>(writer);
+      dataFileWriter.create(Person.SCHEMA$, outputStream);
+      try {
+        for (Person person : people) {
+          dataFileWriter.append(person);
+        }
+      } finally {
+        // Close the writer even when an append fails.
+        dataFileWriter.close();
+      }
+    } finally {
+      // Always release the file handle, including when create() itself fails.
+      outputStream.close();
+    }
+  }
+
+  /** Builds a Person with the given name and age and an empty sibling list. */
+  private Person createPerson(String name, int age) {
+
+    Person person = new Person();
+    person.age = age;
+    person.name = name;
+    List<CharSequence> siblingNames = Lists.newArrayList();
+    person.siblingnames = siblingNames;
+
+    return person;
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/lib/CogroupIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/CogroupIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/CogroupIT.java
new file mode 100644
index 0000000..4b28da7
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/CogroupIT.java
@@ -0,0 +1,112 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib;
+
+import static org.hamcrest.Matchers.is;
+import static org.junit.Assert.assertThat;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Map;
+
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.test.Tests;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.crunch.types.avro.AvroTypeFamily;
+import org.apache.crunch.types.writable.WritableTypeFamily;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+
+
+/**
+ * Integration tests for {@link Cogroup#cogroup} over two comma-separated text inputs,
+ * run once with Writable types and once with Avro types.
+ */
+public class CogroupIT {
+  @Rule
+  public TemporaryPath tmpDir = TemporaryPaths.create();
+  private MRPipeline pipeline;
+  private PCollection<String> lines1;
+  private PCollection<String> lines2;
+
+  @Before
+  public void setUp() throws IOException {
+    pipeline = new MRPipeline(CogroupIT.class, tmpDir.getDefaultConfiguration());
+    String firstSource = tmpDir.copyResourceFileName(Tests.resource(this, "src1.txt"));
+    lines1 = pipeline.readTextFile(firstSource);
+    String secondSource = tmpDir.copyResourceFileName(Tests.resource(this, "src2.txt"));
+    lines2 = pipeline.readTextFile(secondSource);
+  }
+
+  @After
+  public void tearDown() {
+    pipeline.done();
+  }
+
+  @Test
+  public void testCogroupWritables() {
+    runCogroup(WritableTypeFamily.getInstance());
+  }
+
+  @Test
+  public void testCogroupAvro() {
+    runCogroup(AvroTypeFamily.getInstance());
+  }
+
+  /**
+   * Splits both inputs into key/value tables of the given type family, cogroups them, and
+   * compares the materialized map with the expected per-key value collections.
+   *
+   * @param ptf type family that supplies the string table type
+   */
+  public void runCogroup(PTypeFamily ptf) {
+    PTableType<String, String> tableType = ptf.tableOf(ptf.strings(), ptf.strings());
+
+    PTable<String, String> left = lines1.parallelDo("kv1", new KeyValueSplit(), tableType);
+    PTable<String, String> right = lines2.parallelDo("kv2", new KeyValueSplit(), tableType);
+
+    // For each key: (values from the first input, values from the second input).
+    Map<String, Pair<Collection<String>, Collection<String>>> expected = ImmutableMap.of(
+        "a", Pair.of(coll("1-1", "1-4"), coll()),
+        "b", Pair.of(coll("1-2"), coll("2-1")),
+        "c", Pair.of(coll("1-3"), coll("2-2", "2-3")),
+        "d", Pair.of(coll(), coll("2-4"))
+    );
+
+    Map<String, Pair<Collection<String>, Collection<String>>> actual =
+        Cogroup.cogroup(left, right).materializeToMap();
+
+    assertThat(actual, is(expected));
+  }
+
+  /** Splits a line on commas and emits its first two fields as a (key, value) pair. */
+  private static class KeyValueSplit extends DoFn<String, Pair<String, String>> {
+    @Override
+    public void process(String input, Emitter<Pair<String, String>> emitter) {
+      String[] parts = input.split(",");
+      String key = parts[0];
+      String value = parts[1];
+      emitter.emit(Pair.of(key, value));
+    }
+  }
+
+  /** Returns an immutable collection holding the given values in order. */
+  private static Collection<String> coll(String... values) {
+    return ImmutableList.copyOf(values);
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/lib/SecondarySortIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/SecondarySortIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/SecondarySortIT.java
new file mode 100644
index 0000000..242f621
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/SecondarySortIT.java
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib;
+
+import static org.apache.crunch.types.avro.Avros.*;
+import static org.junit.Assert.assertEquals;
+
+import java.io.Serializable;
+
+import org.apache.crunch.MapFn;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.io.From;
+import org.apache.crunch.test.CrunchTestSupport;
+import org.junit.Test;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.ImmutableList;
+
+
+/**
+ * Integration test for {@link SecondarySort#sortAndApply} on a table whose values are
+ * integer pairs parsed from a comma-separated text file.
+ */
+public class SecondarySortIT extends CrunchTestSupport implements Serializable {
+
+  @Test
+  public void testSecondarySort() throws Exception {
+    Pipeline p = new MRPipeline(SecondarySortIT.class, tempDir.getDefaultConfiguration());
+    String inputFile = tempDir.copyResourceFileName("secondary_sort_input.txt");
+
+    // Parses "key,int,int" into (key, (int, int)).
+    MapFn<String, Pair<String, Pair<Integer, Integer>>> parseLine =
+        new MapFn<String, Pair<String, Pair<Integer, Integer>>>() {
+          @Override
+          public Pair<String, Pair<Integer, Integer>> map(String input) {
+            String[] fields = input.split(",");
+            Integer left = Integer.valueOf(fields[1].trim());
+            Integer right = Integer.valueOf(fields[2].trim());
+            return Pair.of(fields[0], Pair.of(left, right));
+          }
+        };
+
+    // Joins a key and all of its values into one comma-separated line.
+    MapFn<Pair<String, Iterable<Pair<Integer, Integer>>>, String> joinGroup =
+        new MapFn<Pair<String, Iterable<Pair<Integer, Integer>>>, String>() {
+          @Override
+          public String map(Pair<String, Iterable<Pair<Integer, Integer>>> input) {
+            Joiner commas = Joiner.on(',');
+            String joinedValues = commas.join(input.second());
+            return commas.join(input.first(), joinedValues);
+          }
+        };
+
+    PTable<String, Pair<Integer, Integer>> in = p.read(From.textFile(inputFile))
+        .parallelDo(parseLine, tableOf(strings(), pairs(ints(), ints())));
+    Iterable<String> lines = SecondarySort.sortAndApply(in, joinGroup, strings()).materialize();
+
+    ImmutableList<String> expected =
+        ImmutableList.of("one,[-5,10],[1,1],[2,-3]", "three,[0,-1]", "two,[1,7],[2,6],[4,5]");
+    assertEquals(expected, ImmutableList.copyOf(lines));
+    p.done();
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/lib/SetIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/SetIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/SetIT.java
new file mode 100644
index 0000000..d1300d2
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/SetIT.java
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Iterator;
+
+import org.apache.crunch.PCollection;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.Tuple3;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.io.At;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.crunch.types.avro.AvroTypeFamily;
+import org.apache.crunch.types.writable.WritableTypeFamily;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+import com.google.common.collect.Lists;
+
+@RunWith(value = Parameterized.class)
+public class SetIT {
+
+ private PTypeFamily typeFamily;
+
+ private Pipeline pipeline;
+ private PCollection<String> set1;
+ private PCollection<String> set2;
+
+ public SetIT(PTypeFamily typeFamily) {
+ this.typeFamily = typeFamily;
+ }
+
+ @Rule
+ public TemporaryPath tmpDir = TemporaryPaths.create();
+
+ @Parameters
+ public static Collection<Object[]> data() {
+ Object[][] data = new Object[][] { { WritableTypeFamily.getInstance() }, { AvroTypeFamily.getInstance() } };
+ return Arrays.asList(data);
+ }
+
+ @Before
+ public void setUp() throws IOException {
+ String set1InputPath = tmpDir.copyResourceFileName("set1.txt");
+ String set2InputPath = tmpDir.copyResourceFileName("set2.txt");
+ pipeline = new MRPipeline(SetIT.class, tmpDir.getDefaultConfiguration());
+ set1 = pipeline.read(At.textFile(set1InputPath, typeFamily.strings()));
+ set2 = pipeline.read(At.textFile(set2InputPath, typeFamily.strings()));
+ }
+
+ @After
+ public void tearDown() {
+ pipeline.done();
+ }
+
+ @Test
+ public void testDifference() throws Exception {
+ PCollection<String> difference = Set.difference(set1, set2);
+ assertEquals(Lists.newArrayList("b", "e"), Lists.newArrayList(difference.materialize()));
+ }
+
+ @Test
+ public void testIntersection() throws Exception {
+ PCollection<String> intersection = Set.intersection(set1, set2);
+ assertEquals(Lists.newArrayList("a", "c"), Lists.newArrayList(intersection.materialize()));
+ }
+
+ @Test
+ public void testComm() throws Exception {
+ PCollection<Tuple3<String, String, String>> comm = Set.comm(set1, set2);
+ Iterator<Tuple3<String, String, String>> i = comm.materialize().iterator();
+ checkEquals(null, null, "a", i.next());
+ checkEquals("b", null, null, i.next());
+ checkEquals(null, null, "c", i.next());
+ checkEquals(null, "d", null, i.next());
+ checkEquals("e", null, null, i.next());
+ assertFalse(i.hasNext());
+ }
+
+ private void checkEquals(String s1, String s2, String s3, Tuple3<String, String, String> tuple) {
+ assertEquals("first string", s1, tuple.first());
+ assertEquals("second string", s2, tuple.second());
+ assertEquals("third string", s3, tuple.third());
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/lib/SortByValueIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/SortByValueIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/SortByValueIT.java
new file mode 100644
index 0000000..e19c7d3
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/SortByValueIT.java
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.crunch.MapFn;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.io.From;
+import org.apache.crunch.lib.Sort.ColumnOrder;
+import org.apache.crunch.lib.Sort.Order;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.crunch.types.avro.AvroTypeFamily;
+import org.apache.crunch.types.writable.WritableTypeFamily;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.ImmutableList;
+
+/**
+ * Integration test for {@link Sort#sortPairs}: reads (letter, count) lines, sorts
+ * them by count descending and then by letter ascending, and checks the full
+ * materialized order for both the Writable and the Avro type family.
+ */
+public class SortByValueIT {
+  @Rule
+  public transient TemporaryPath tmpDir = TemporaryPaths.create();
+
+  /**
+   * Splits each input line into a (String, Long) pair. The separator is handed to
+   * {@link String#split(String)}, so it is interpreted as a regular expression.
+   */
+  private static class SplitFn extends MapFn<String, Pair<String, Long>> {
+    private final String sep;
+
+    public SplitFn(String sep) {
+      this.sep = sep;
+    }
+
+    @Override
+    public Pair<String, Long> map(String input) {
+      String[] pieces = input.split(sep);
+      return Pair.of(pieces[0], Long.valueOf(pieces[1]));
+    }
+  }
+
+  @Test
+  public void testSortByValueWritables() throws Exception {
+    // Use the rule's configuration, as the other integration tests do.
+    run(new MRPipeline(SortByValueIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance());
+  }
+
+  @Test
+  public void testSortByValueAvro() throws Exception {
+    run(new MRPipeline(SortByValueIT.class, tmpDir.getDefaultConfiguration()), AvroTypeFamily.getInstance());
+  }
+
+  /**
+   * Runs the sort-by-value scenario on the given pipeline with the given type family.
+   *
+   * @param pipeline the pipeline to read and sort with; it is shut down before returning
+   * @param ptf the type family used for the intermediate table
+   * @throws Exception if copying the resource or running the pipeline fails
+   */
+  public void run(Pipeline pipeline, PTypeFamily ptf) throws Exception {
+    try {
+      String sbv = tmpDir.copyResourceFileName("sort_by_value.txt");
+      PTable<String, Long> letterCounts = pipeline.read(From.textFile(sbv)).parallelDo(new SplitFn("\t"),
+          ptf.tableOf(ptf.strings(), ptf.longs()));
+      // Column 2 (the count) is the primary key, column 1 (the letter) breaks ties.
+      PCollection<Pair<String, Long>> sorted = Sort.sortPairs(
+          letterCounts,
+          new ColumnOrder(2, Order.DESCENDING),
+          new ColumnOrder(1, Order.ASCENDING));
+      assertEquals(
+          ImmutableList.of(Pair.of("C", 3L), Pair.of("A", 2L), Pair.of("D", 2L), Pair.of("B", 1L), Pair.of("E", 1L)),
+          ImmutableList.copyOf(sorted.materialize()));
+    } finally {
+      // Shut the pipeline down even when the assertion fails.
+      pipeline.done();
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/lib/SortIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/SortIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/SortIT.java
new file mode 100644
index 0000000..bad4864
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/SortIT.java
@@ -0,0 +1,327 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib;
+
+import static org.apache.crunch.lib.Sort.ColumnOrder.by;
+import static org.apache.crunch.lib.Sort.Order.ASCENDING;
+import static org.apache.crunch.lib.Sort.Order.DESCENDING;
+import static org.apache.crunch.test.StringWrapper.wrap;
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.Tuple3;
+import org.apache.crunch.Tuple4;
+import org.apache.crunch.TupleN;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.lib.Sort.ColumnOrder;
+import org.apache.crunch.lib.Sort.Order;
+import org.apache.crunch.test.StringWrapper;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.crunch.types.avro.AvroTypeFamily;
+import org.apache.crunch.types.avro.Avros;
+import org.apache.crunch.types.writable.WritableTypeFamily;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Integration tests for {@link Sort}, run against both the Writable and the Avro
+ * type family. Most tests only check the first record of the sorted output.
+ *
+ * Every pipeline is shut down in a {@code finally} block so that a failing
+ * assertion does not skip {@code pipeline.done()}.
+ */
+public class SortIT implements Serializable {
+  @Rule
+  public transient TemporaryPath tmpDir = TemporaryPaths.create();
+
+  /** Creates a pipeline configured from the per-test temporary directory rule. */
+  private Pipeline newPipeline() {
+    return new MRPipeline(SortIT.class, tmpDir.getDefaultConfiguration());
+  }
+
+  @Test
+  public void testWritableSortAsc() throws Exception {
+    runSingle(newPipeline(), WritableTypeFamily.getInstance(), Order.ASCENDING, "A\tand this text as well");
+  }
+
+  @Test
+  public void testWritableSortDesc() throws Exception {
+    runSingle(newPipeline(), WritableTypeFamily.getInstance(), Order.DESCENDING, "B\tthis doc has some text");
+  }
+
+  @Test
+  public void testWritableSortAscDesc() throws Exception {
+    runPair(newPipeline(), WritableTypeFamily.getInstance(), by(1, ASCENDING), by(2, DESCENDING), "A",
+        "this doc has this text");
+  }
+
+  @Test
+  public void testWritableSortSecondDescFirstAsc() throws Exception {
+    runPair(newPipeline(), WritableTypeFamily.getInstance(), by(2, DESCENDING), by(1, ASCENDING), "A",
+        "this doc has this text");
+  }
+
+  @Test
+  public void testWritableSortTripleAscDescAsc() throws Exception {
+    runTriple(newPipeline(), WritableTypeFamily.getInstance(), by(1, ASCENDING), by(2, DESCENDING),
+        by(3, ASCENDING), "A", "this", "doc");
+  }
+
+  @Test
+  public void testWritableSortQuadAscDescAscDesc() throws Exception {
+    runQuad(newPipeline(), WritableTypeFamily.getInstance(), by(1, ASCENDING), by(2, DESCENDING),
+        by(3, ASCENDING), by(4, DESCENDING), "A", "this", "doc", "has");
+  }
+
+  @Test
+  public void testWritableSortTupleNAscDesc() throws Exception {
+    runTupleN(newPipeline(), WritableTypeFamily.getInstance(),
+        new ColumnOrder[] { by(1, ASCENDING), by(2, DESCENDING) }, new String[] { "A", "this doc has this text" });
+  }
+
+  @Test
+  public void testWritableSortTable() throws Exception {
+    runTable(newPipeline(), WritableTypeFamily.getInstance(), "A");
+  }
+
+  @Test
+  public void testAvroSortAsc() throws Exception {
+    runSingle(newPipeline(), AvroTypeFamily.getInstance(), Order.ASCENDING, "A\tand this text as well");
+  }
+
+  @Test
+  public void testAvroSortDesc() throws Exception {
+    runSingle(newPipeline(), AvroTypeFamily.getInstance(), Order.DESCENDING, "B\tthis doc has some text");
+  }
+
+  @Test
+  public void testAvroSortPairAscDesc() throws Exception {
+    runPair(newPipeline(), AvroTypeFamily.getInstance(), by(1, ASCENDING), by(2, DESCENDING), "A",
+        "this doc has this text");
+  }
+
+  @Test
+  public void testAvroSortPairSecondDescFirstAsc() throws Exception {
+    runPair(newPipeline(), AvroTypeFamily.getInstance(), by(2, DESCENDING), by(1, ASCENDING), "A",
+        "this doc has this text");
+  }
+
+  @Test
+  public void testAvroSortTripleAscDescAsc() throws Exception {
+    runTriple(newPipeline(), AvroTypeFamily.getInstance(), by(1, ASCENDING), by(2, DESCENDING),
+        by(3, ASCENDING), "A", "this", "doc");
+  }
+
+  @Test
+  public void testAvroSortQuadAscDescAscDesc() throws Exception {
+    runQuad(newPipeline(), AvroTypeFamily.getInstance(), by(1, ASCENDING), by(2, DESCENDING),
+        by(3, ASCENDING), by(4, DESCENDING), "A", "this", "doc", "has");
+  }
+
+  @Test
+  public void testAvroSortTupleNAscDesc() throws Exception {
+    runTupleN(newPipeline(), AvroTypeFamily.getInstance(),
+        new ColumnOrder[] { by(1, ASCENDING), by(2, DESCENDING) }, new String[] { "A", "this doc has this text" });
+  }
+
+  /** Sorts (String, reflect-based StringWrapper) pairs ascending and checks the complete output. */
+  @Test
+  public void testAvroReflectSortPair() throws IOException {
+    Pipeline pipeline = newPipeline();
+    try {
+      pipeline.enableDebug();
+      String rsrc = tmpDir.copyResourceFileName("set2.txt");
+      PCollection<Pair<String, StringWrapper>> in = pipeline.readTextFile(rsrc)
+          .parallelDo(new MapFn<String, Pair<String, StringWrapper>>() {
+
+            @Override
+            public Pair<String, StringWrapper> map(String input) {
+              return Pair.of(input, wrap(input));
+            }
+          }, Avros.pairs(Avros.strings(), Avros.reflects(StringWrapper.class)));
+      PCollection<Pair<String, StringWrapper>> sorted = Sort.sort(in, Order.ASCENDING);
+
+      List<Pair<String, StringWrapper>> expected = Lists.newArrayList();
+      expected.add(Pair.of("a", wrap("a")));
+      expected.add(Pair.of("c", wrap("c")));
+      expected.add(Pair.of("d", wrap("d")));
+
+      assertEquals(expected, Lists.newArrayList(sorted.materialize()));
+    } finally {
+      pipeline.done();
+    }
+  }
+
+  /** Sorts a table with reflect-based StringWrapper values by key and checks the complete output. */
+  @Test
+  public void testAvroReflectSortTable() throws IOException {
+    Pipeline pipeline = newPipeline();
+    try {
+      PTable<String, StringWrapper> unsorted = pipeline.readTextFile(tmpDir.copyResourceFileName("set2.txt")).parallelDo(
+          new MapFn<String, Pair<String, StringWrapper>>() {
+
+            @Override
+            public Pair<String, StringWrapper> map(String input) {
+              return Pair.of(input, wrap(input));
+            }
+          }, Avros.tableOf(Avros.strings(), Avros.reflects(StringWrapper.class)));
+
+      PTable<String, StringWrapper> sorted = Sort.sort(unsorted);
+
+      List<Pair<String, StringWrapper>> expected = Lists.newArrayList();
+      expected.add(Pair.of("a", wrap("a")));
+      expected.add(Pair.of("c", wrap("c")));
+      expected.add(Pair.of("d", wrap("d")));
+
+      assertEquals(expected, Lists.newArrayList(sorted.materialize()));
+    } finally {
+      pipeline.done();
+    }
+  }
+
+  @Test
+  public void testAvroSortTable() throws Exception {
+    runTable(newPipeline(), AvroTypeFamily.getInstance(), "A");
+  }
+
+  /**
+   * Sorts the whole lines of docs.txt in the given order and checks the first line of the result.
+   *
+   * @param pipeline the pipeline to run; it is shut down before returning
+   * @param typeFamily the type family the lines are converted to before sorting
+   * @param order the sort direction
+   * @param firstLine the line expected at the head of the sorted output
+   */
+  private void runSingle(Pipeline pipeline, PTypeFamily typeFamily, Order order, String firstLine) throws IOException {
+    try {
+      String inputPath = tmpDir.copyResourceFileName("docs.txt");
+
+      PCollection<String> input = pipeline.readTextFile(inputPath);
+      // following turns the input from Writables to required type family
+      PCollection<String> input2 = input.parallelDo(new DoFn<String, String>() {
+        @Override
+        public void process(String input, Emitter<String> emitter) {
+          emitter.emit(input);
+        }
+      }, typeFamily.strings());
+      PCollection<String> sorted = Sort.sort(input2, order);
+      Iterable<String> lines = sorted.materialize();
+
+      assertEquals(firstLine, lines.iterator().next());
+    } finally {
+      pipeline.done();
+    }
+  }
+
+  /**
+   * Splits each line of docs.txt on tabs into a pair, sorts the pairs by the two
+   * column orders and checks both fields of the first result.
+   */
+  private void runPair(Pipeline pipeline, PTypeFamily typeFamily, ColumnOrder first, ColumnOrder second,
+      String firstField, String secondField) throws IOException {
+    try {
+      String inputPath = tmpDir.copyResourceFileName("docs.txt");
+
+      PCollection<String> input = pipeline.readTextFile(inputPath);
+      PTable<String, String> kv = input.parallelDo(new DoFn<String, Pair<String, String>>() {
+        @Override
+        public void process(String input, Emitter<Pair<String, String>> emitter) {
+          String[] split = input.split("[\t]+");
+          emitter.emit(Pair.of(split[0], split[1]));
+        }
+      }, typeFamily.tableOf(typeFamily.strings(), typeFamily.strings()));
+      PCollection<Pair<String, String>> sorted = Sort.sortPairs(kv, first, second);
+      List<Pair<String, String>> lines = Lists.newArrayList(sorted.materialize());
+      Pair<String, String> l = lines.iterator().next();
+      assertEquals(firstField, l.first());
+      assertEquals(secondField, l.second());
+    } finally {
+      pipeline.done();
+    }
+  }
+
+  /**
+   * Splits each line of docs.txt on tabs and spaces into a triple, sorts by the
+   * three column orders and checks the fields of the first result.
+   */
+  private void runTriple(Pipeline pipeline, PTypeFamily typeFamily, ColumnOrder first, ColumnOrder second,
+      ColumnOrder third, String firstField, String secondField, String thirdField) throws IOException {
+    try {
+      String inputPath = tmpDir.copyResourceFileName("docs.txt");
+
+      PCollection<String> input = pipeline.readTextFile(inputPath);
+      PCollection<Tuple3<String, String, String>> kv = input.parallelDo(
+          new DoFn<String, Tuple3<String, String, String>>() {
+            @Override
+            public void process(String input, Emitter<Tuple3<String, String, String>> emitter) {
+              String[] split = input.split("[\t ]+");
+              // Indices wrap around so that short lines still fill every field.
+              int len = split.length;
+              emitter.emit(Tuple3.of(split[0], split[1 % len], split[2 % len]));
+            }
+          }, typeFamily.triples(typeFamily.strings(), typeFamily.strings(), typeFamily.strings()));
+      PCollection<Tuple3<String, String, String>> sorted = Sort.sortTriples(kv, first, second, third);
+      List<Tuple3<String, String, String>> lines = Lists.newArrayList(sorted.materialize());
+      Tuple3<String, String, String> l = lines.iterator().next();
+      assertEquals(firstField, l.first());
+      assertEquals(secondField, l.second());
+      assertEquals(thirdField, l.third());
+    } finally {
+      pipeline.done();
+    }
+  }
+
+  /**
+   * Splits each line of docs.txt on tabs and spaces into a quad, sorts by the
+   * four column orders and checks the fields of the first result.
+   */
+  private void runQuad(Pipeline pipeline, PTypeFamily typeFamily, ColumnOrder first, ColumnOrder second,
+      ColumnOrder third, ColumnOrder fourth, String firstField, String secondField, String thirdField,
+      String fourthField) throws IOException {
+    try {
+      String inputPath = tmpDir.copyResourceFileName("docs.txt");
+
+      PCollection<String> input = pipeline.readTextFile(inputPath);
+      PCollection<Tuple4<String, String, String, String>> kv = input.parallelDo(
+          new DoFn<String, Tuple4<String, String, String, String>>() {
+            @Override
+            public void process(String input, Emitter<Tuple4<String, String, String, String>> emitter) {
+              String[] split = input.split("[\t ]+");
+              // Indices wrap around so that short lines still fill every field.
+              int len = split.length;
+              emitter.emit(Tuple4.of(split[0], split[1 % len], split[2 % len], split[3 % len]));
+            }
+          }, typeFamily.quads(typeFamily.strings(), typeFamily.strings(), typeFamily.strings(), typeFamily.strings()));
+      PCollection<Tuple4<String, String, String, String>> sorted = Sort.sortQuads(kv, first, second, third, fourth);
+      Iterable<Tuple4<String, String, String, String>> lines = sorted.materialize();
+      Tuple4<String, String, String, String> l = lines.iterator().next();
+      assertEquals(firstField, l.first());
+      assertEquals(secondField, l.second());
+      assertEquals(thirdField, l.third());
+      assertEquals(fourthField, l.fourth());
+    } finally {
+      pipeline.done();
+    }
+  }
+
+  /**
+   * Splits each line of docs.txt on tabs into a TupleN, sorts by the given column
+   * orders and checks the leading fields of the first result against {@code fields}.
+   */
+  private void runTupleN(Pipeline pipeline, PTypeFamily typeFamily, ColumnOrder[] orders, String[] fields)
+      throws IOException {
+    try {
+      String inputPath = tmpDir.copyResourceFileName("docs.txt");
+
+      PCollection<String> input = pipeline.readTextFile(inputPath);
+      // One string type per sort column.
+      PType[] types = new PType[orders.length];
+      Arrays.fill(types, typeFamily.strings());
+      PCollection<TupleN> kv = input.parallelDo(new DoFn<String, TupleN>() {
+        @Override
+        public void process(String input, Emitter<TupleN> emitter) {
+          String[] split = input.split("[\t]+");
+          emitter.emit(new TupleN(split));
+        }
+      }, typeFamily.tuples(types));
+      PCollection<TupleN> sorted = Sort.sortTuples(kv, orders);
+      Iterable<TupleN> lines = sorted.materialize();
+      TupleN l = lines.iterator().next();
+      int i = 0;
+      for (String field : fields) {
+        assertEquals(field, l.get(i++));
+      }
+    } finally {
+      pipeline.done();
+    }
+  }
+
+  /**
+   * Splits each line of docs.txt on tabs into a table entry, sorts the table with
+   * {@code Sort.sort} and checks the key of the first result.
+   */
+  private void runTable(Pipeline pipeline, PTypeFamily typeFamily, String firstKey) throws IOException {
+    try {
+      String inputPath = tmpDir.copyResourceFileName("docs.txt");
+
+      PCollection<String> input = pipeline.readTextFile(inputPath);
+      PTable<String, String> table = input.parallelDo(new DoFn<String, Pair<String, String>>() {
+        @Override
+        public void process(String input, Emitter<Pair<String, String>> emitter) {
+          String[] split = input.split("[\t]+");
+          emitter.emit(Pair.of(split[0], split[1]));
+        }
+      }, typeFamily.tableOf(typeFamily.strings(), typeFamily.strings()));
+
+      PTable<String, String> sorted = Sort.sort(table);
+      Iterable<Pair<String, String>> lines = sorted.materialize();
+      Pair<String, String> l = lines.iterator().next();
+      assertEquals(firstKey, l.first());
+    } finally {
+      pipeline.done();
+    }
+  }
+
+}
[20/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/io/avro/AvroFileSourceTargetIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/io/avro/AvroFileSourceTargetIT.java b/crunch/src/it/java/org/apache/crunch/io/avro/AvroFileSourceTargetIT.java
deleted file mode 100644
index 671b920..0000000
--- a/crunch/src/it/java/org/apache/crunch/io/avro/AvroFileSourceTargetIT.java
+++ /dev/null
@@ -1,140 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.avro;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.Serializable;
-import java.util.List;
-
-import org.apache.avro.Schema;
-import org.apache.avro.file.DataFileWriter;
-import org.apache.avro.generic.GenericData;
-import org.apache.avro.generic.GenericData.Record;
-import org.apache.avro.generic.GenericDatumWriter;
-import org.apache.avro.generic.GenericRecord;
-import org.apache.avro.reflect.ReflectData;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.io.At;
-import org.apache.crunch.test.Person;
-import org.apache.crunch.test.StringWrapper;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.avro.Avros;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-
-@SuppressWarnings("serial")
-public class AvroFileSourceTargetIT implements Serializable {
-
- private transient File avroFile;
- @Rule
- public transient TemporaryPath tmpDir = TemporaryPaths.create();
-
- @Before
- public void setUp() throws IOException {
- avroFile = tmpDir.getFile("test.avro");
- }
-
- private void populateGenericFile(List<GenericRecord> genericRecords, Schema schema) throws IOException {
- FileOutputStream outputStream = new FileOutputStream(this.avroFile);
- GenericDatumWriter<GenericRecord> genericDatumWriter = new GenericDatumWriter<GenericRecord>(schema);
-
- DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(genericDatumWriter);
- dataFileWriter.create(schema, outputStream);
-
- for (GenericRecord record : genericRecords) {
- dataFileWriter.append(record);
- }
-
- dataFileWriter.close();
- outputStream.close();
-
- }
-
- @Test
- public void testSpecific() throws IOException {
- GenericRecord savedRecord = new GenericData.Record(Person.SCHEMA$);
- savedRecord.put("name", "John Doe");
- savedRecord.put("age", 42);
- savedRecord.put("siblingnames", Lists.newArrayList("Jimmy", "Jane"));
- populateGenericFile(Lists.newArrayList(savedRecord), Person.SCHEMA$);
-
- Pipeline pipeline = new MRPipeline(AvroFileSourceTargetIT.class, tmpDir.getDefaultConfiguration());
- PCollection<Person> genericCollection = pipeline.read(At.avroFile(avroFile.getAbsolutePath(),
- Avros.records(Person.class)));
-
- List<Person> personList = Lists.newArrayList(genericCollection.materialize());
-
- Person expectedPerson = new Person();
- expectedPerson.name = "John Doe";
- expectedPerson.age = 42;
-
- List<CharSequence> siblingNames = Lists.newArrayList();
- siblingNames.add("Jimmy");
- siblingNames.add("Jane");
- expectedPerson.siblingnames = siblingNames;
-
- assertEquals(Lists.newArrayList(expectedPerson), Lists.newArrayList(personList));
- }
-
- @Test
- public void testGeneric() throws IOException {
- String genericSchemaJson = Person.SCHEMA$.toString().replace("Person", "GenericPerson");
- Schema genericPersonSchema = new Schema.Parser().parse(genericSchemaJson);
- GenericRecord savedRecord = new GenericData.Record(genericPersonSchema);
- savedRecord.put("name", "John Doe");
- savedRecord.put("age", 42);
- savedRecord.put("siblingnames", Lists.newArrayList("Jimmy", "Jane"));
- populateGenericFile(Lists.newArrayList(savedRecord), genericPersonSchema);
-
- Pipeline pipeline = new MRPipeline(AvroFileSourceTargetIT.class, tmpDir.getDefaultConfiguration());
- PCollection<Record> genericCollection = pipeline.read(At.avroFile(avroFile.getAbsolutePath(),
- Avros.generics(genericPersonSchema)));
-
- List<Record> recordList = Lists.newArrayList(genericCollection.materialize());
-
- assertEquals(Lists.newArrayList(savedRecord), Lists.newArrayList(recordList));
- }
-
- @Test
- public void testReflect() throws IOException {
- Schema pojoPersonSchema = ReflectData.get().getSchema(StringWrapper.class);
- GenericRecord savedRecord = new GenericData.Record(pojoPersonSchema);
- savedRecord.put("value", "stringvalue");
- populateGenericFile(Lists.newArrayList(savedRecord), pojoPersonSchema);
-
- Pipeline pipeline = new MRPipeline(AvroFileSourceTargetIT.class, tmpDir.getDefaultConfiguration());
- PCollection<StringWrapper> stringValueCollection = pipeline.read(At.avroFile(avroFile.getAbsolutePath(),
- Avros.reflects(StringWrapper.class)));
-
- List<StringWrapper> recordList = Lists.newArrayList(stringValueCollection.materialize());
-
- assertEquals(1, recordList.size());
- StringWrapper stringWrapper = recordList.get(0);
- assertEquals("stringvalue", stringWrapper.getValue());
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/io/avro/AvroPipelineIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/io/avro/AvroPipelineIT.java b/crunch/src/it/java/org/apache/crunch/io/avro/AvroPipelineIT.java
deleted file mode 100644
index 29bf4f5..0000000
--- a/crunch/src/it/java/org/apache/crunch/io/avro/AvroPipelineIT.java
+++ /dev/null
@@ -1,95 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with this
- * work for additional information regarding copyright ownership. The ASF
- * licenses this file to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
- */
-package org.apache.crunch.io.avro;
-
-import static org.junit.Assert.assertTrue;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.Serializable;
-import java.util.List;
-
-import org.apache.avro.Schema;
-import org.apache.avro.file.DataFileWriter;
-import org.apache.avro.generic.GenericData;
-import org.apache.avro.generic.GenericDatumWriter;
-import org.apache.avro.generic.GenericRecord;
-import org.apache.commons.io.FileUtils;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.Target;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.io.At;
-import org.apache.crunch.io.To;
-import org.apache.crunch.test.Person;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.avro.Avros;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-
-public class AvroPipelineIT implements Serializable {
-
- private transient File avroFile;
- @Rule
- public transient TemporaryPath tmpDir = TemporaryPaths.create();
-
- @Before
- public void setUp() throws IOException {
- avroFile = tmpDir.getFile("test.avro");
- }
-
- private void populateGenericFile(List<GenericRecord> genericRecords, Schema schema) throws IOException {
- FileOutputStream outputStream = new FileOutputStream(this.avroFile);
- GenericDatumWriter<GenericRecord> genericDatumWriter = new GenericDatumWriter<GenericRecord>(schema);
-
- DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(genericDatumWriter);
- dataFileWriter.create(schema, outputStream);
-
- for (GenericRecord record : genericRecords) {
- dataFileWriter.append(record);
- }
-
- dataFileWriter.close();
- outputStream.close();
-
- }
-
- @Test
- public void toTextShouldWriteAvroDataAsDatumText() throws Exception {
- GenericRecord savedRecord = new GenericData.Record(Person.SCHEMA$);
- savedRecord.put("name", "John Doe");
- savedRecord.put("age", 42);
- savedRecord.put("siblingnames", Lists.newArrayList("Jimmy", "Jane"));
- populateGenericFile(Lists.newArrayList(savedRecord), Person.SCHEMA$);
-
- Pipeline pipeline = new MRPipeline(AvroFileSourceTargetIT.class, tmpDir.getDefaultConfiguration());
- PCollection<Person> genericCollection = pipeline.read(At.avroFile(avroFile.getAbsolutePath(),
- Avros.records(Person.class)));
- File outputFile = tmpDir.getFile("output");
- Target textFile = To.textFile(outputFile.getAbsolutePath());
- pipeline.write(genericCollection, textFile);
- pipeline.run();
- Person person = genericCollection.materialize().iterator().next();
- String outputString = FileUtils.readFileToString(new File(outputFile, "part-m-00000"));
- assertTrue(outputString.contains(person.toString()));
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/io/avro/AvroReflectIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/io/avro/AvroReflectIT.java b/crunch/src/it/java/org/apache/crunch/io/avro/AvroReflectIT.java
deleted file mode 100644
index 7a90517..0000000
--- a/crunch/src/it/java/org/apache/crunch/io/avro/AvroReflectIT.java
+++ /dev/null
@@ -1,109 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.avro;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.IOException;
-import java.io.Serializable;
-import java.util.Collections;
-import java.util.List;
-
-import org.apache.crunch.MapFn;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.Pair;
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.lib.Aggregate;
-import org.apache.crunch.test.Person;
-import org.apache.crunch.test.StringWrapper;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.avro.Avros;
-import org.junit.Assume;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-
-public class AvroReflectIT implements Serializable {
-
- @Rule
- public transient TemporaryPath tmpDir = TemporaryPaths.create();
-
- @Test
- public void testReflection() throws IOException {
- Pipeline pipeline = new MRPipeline(AvroReflectIT.class, tmpDir.getDefaultConfiguration());
- PCollection<StringWrapper> stringWrapperCollection = pipeline.readTextFile(tmpDir.copyResourceFileName("set1.txt"))
- .parallelDo(new MapFn<String, StringWrapper>() {
-
- @Override
- public StringWrapper map(String input) {
- StringWrapper stringWrapper = new StringWrapper();
- stringWrapper.setValue(input);
- return stringWrapper;
- }
- }, Avros.reflects(StringWrapper.class));
-
- List<StringWrapper> stringWrappers = Lists.newArrayList(stringWrapperCollection.materialize());
-
- pipeline.done();
-
- assertEquals(Lists.newArrayList(new StringWrapper("b"), new StringWrapper("c"), new StringWrapper("a"),
- new StringWrapper("e")), stringWrappers);
-
- }
-
- // Verify that running with a combination of reflect and specific schema
- // doesn't crash
- @Test
- public void testCombinationOfReflectionAndSpecific() throws IOException {
- Assume.assumeTrue(Avros.CAN_COMBINE_SPECIFIC_AND_REFLECT_SCHEMAS);
- Pipeline pipeline = new MRPipeline(AvroReflectIT.class, tmpDir.getDefaultConfiguration());
- PCollection<Pair<StringWrapper, Person>> hybridPairCollection = pipeline.readTextFile(
- tmpDir.copyResourceFileName("set1.txt")).parallelDo(new MapFn<String, Pair<StringWrapper, Person>>() {
-
- @Override
- public Pair<StringWrapper, Person> map(String input) {
- Person person = new Person();
- person.name = input;
- person.age = 42;
- person.siblingnames = Lists.<CharSequence> newArrayList(input);
-
- return Pair.of(new StringWrapper(input), person);
- }
- }, Avros.pairs(Avros.reflects(StringWrapper.class), Avros.records(Person.class)));
-
- PCollection<Pair<String, Long>> countCollection = Aggregate.count(hybridPairCollection).parallelDo(
- new MapFn<Pair<Pair<StringWrapper, Person>, Long>, Pair<String, Long>>() {
-
- @Override
- public Pair<String, Long> map(Pair<Pair<StringWrapper, Person>, Long> input) {
- return Pair.of(input.first().first().getValue(), input.second());
- }
- }, Avros.pairs(Avros.strings(), Avros.longs()));
-
- List<Pair<String, Long>> materialized = Lists.newArrayList(countCollection.materialize());
- List<Pair<String, Long>> expected = Lists.newArrayList(Pair.of("a", 1L), Pair.of("b", 1L), Pair.of("c", 1L),
- Pair.of("e", 1L));
- Collections.sort(materialized);
-
- assertEquals(expected, materialized);
- pipeline.done();
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/io/avro/AvroWritableIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/io/avro/AvroWritableIT.java b/crunch/src/it/java/org/apache/crunch/io/avro/AvroWritableIT.java
deleted file mode 100644
index cbb7fde..0000000
--- a/crunch/src/it/java/org/apache/crunch/io/avro/AvroWritableIT.java
+++ /dev/null
@@ -1,89 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.avro;
-
-import static org.apache.crunch.types.avro.Avros.ints;
-import static org.apache.crunch.types.avro.Avros.tableOf;
-import static org.apache.crunch.types.avro.Avros.writables;
-import static org.junit.Assert.assertEquals;
-
-import java.io.Serializable;
-import java.util.Map;
-
-import org.apache.crunch.CombineFn;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.MapFn;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.Pair;
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.hadoop.io.DoubleWritable;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.Maps;
-
-/**
- * Verify handling of both a ByteBuffer and byte array as input from an Avro job (depending
- * on the version of Avro being used).
- */
-public class AvroWritableIT implements Serializable {
-
- @Rule
- public transient TemporaryPath tmpDir = TemporaryPaths.create();
-
- @Test
- public void testAvroBasedWritablePipeline() throws Exception {
- String customersInputPath = tmpDir.copyResourceFileName("customers.txt");
- Pipeline pipeline = new MRPipeline(AvroWritableIT.class, tmpDir.getDefaultConfiguration());
- pipeline.enableDebug();
- PCollection<String> customerLines = pipeline.readTextFile(customersInputPath);
- Map<Integer, DoubleWritable> outputMap = customerLines.parallelDo(
- new MapFn<String, Pair<Integer, DoubleWritable>>() {
- @Override
- public Pair<Integer, DoubleWritable> map(String input) {
- int len = input.length();
- return Pair.of(len, new DoubleWritable(len));
- }
- }, tableOf(ints(), writables(DoubleWritable.class)))
- .groupByKey()
- .combineValues(new CombineFn<Integer, DoubleWritable>() {
- @Override
- public void process(Pair<Integer, Iterable<DoubleWritable>> input,
- Emitter<Pair<Integer, DoubleWritable>> emitter) {
- double sum = 0.0;
- for (DoubleWritable dw : input.second()) {
- sum += dw.get();
- }
- emitter.emit(Pair.of(input.first(), new DoubleWritable(sum)));
- }
- })
- .materializeToMap();
-
- Map<Integer, DoubleWritable> expectedMap = Maps.newHashMap();
- expectedMap.put(17, new DoubleWritable(17.0));
- expectedMap.put(16, new DoubleWritable(16.0));
- expectedMap.put(12, new DoubleWritable(24.0));
-
- assertEquals(expectedMap, outputMap);
-
- pipeline.done();
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/lib/AggregateIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/lib/AggregateIT.java b/crunch/src/it/java/org/apache/crunch/lib/AggregateIT.java
deleted file mode 100644
index 56ee3ac..0000000
--- a/crunch/src/it/java/org/apache/crunch/lib/AggregateIT.java
+++ /dev/null
@@ -1,231 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib;
-
-import static org.apache.crunch.types.writable.Writables.strings;
-import static org.apache.crunch.types.writable.Writables.tableOf;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import java.io.IOException;
-import java.util.Collection;
-import java.util.Map;
-
-import org.apache.crunch.MapFn;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.impl.mem.MemPipeline;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.test.Employee;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.crunch.types.avro.AvroTypeFamily;
-import org.apache.crunch.types.avro.Avros;
-import org.apache.crunch.types.writable.WritableTypeFamily;
-import org.apache.crunch.types.writable.Writables;
-import org.apache.hadoop.io.Text;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Lists;
-
-public class AggregateIT {
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
-
- @Test
- public void testWritables() throws Exception {
- Pipeline pipeline = new MRPipeline(AggregateIT.class, tmpDir.getDefaultConfiguration());
- String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
- PCollection<String> shakes = pipeline.readTextFile(shakesInputPath);
- runMinMax(shakes, WritableTypeFamily.getInstance());
- pipeline.done();
- }
-
- @Test
- public void testAvro() throws Exception {
- Pipeline pipeline = new MRPipeline(AggregateIT.class, tmpDir.getDefaultConfiguration());
- String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
- PCollection<String> shakes = pipeline.readTextFile(shakesInputPath);
- runMinMax(shakes, AvroTypeFamily.getInstance());
- pipeline.done();
- }
-
- @Test
- public void testInMemoryAvro() throws Exception {
- PCollection<String> someText = MemPipeline.collectionOf("first line", "second line", "third line");
- runMinMax(someText, AvroTypeFamily.getInstance());
- }
-
- public static void runMinMax(PCollection<String> shakes, PTypeFamily family) throws Exception {
- PCollection<Integer> lengths = shakes.parallelDo(new MapFn<String, Integer>() {
- @Override
- public Integer map(String input) {
- return input.length();
- }
- }, family.ints());
- PCollection<Integer> negLengths = lengths.parallelDo(new MapFn<Integer, Integer>() {
- @Override
- public Integer map(Integer input) {
- return -input;
- }
- }, family.ints());
- Integer maxLengths = Aggregate.max(lengths).getValue();
- Integer minLengths = Aggregate.min(negLengths).getValue();
- assertTrue(maxLengths != null);
- assertTrue(minLengths != null);
- assertEquals(maxLengths.intValue(), -minLengths.intValue());
- }
-
- private static class SplitFn extends MapFn<String, Pair<String, String>> {
- @Override
- public Pair<String, String> map(String input) {
- String[] p = input.split("\\s+");
- return Pair.of(p[0], p[1]);
- }
- }
-
- @Test
- public void testCollectUrls() throws Exception {
- Pipeline p = new MRPipeline(AggregateIT.class, tmpDir.getDefaultConfiguration());
- String urlsInputPath = tmpDir.copyResourceFileName("urls.txt");
- PTable<String, Collection<String>> urls = Aggregate.collectValues(p.readTextFile(urlsInputPath).parallelDo(
- new SplitFn(), tableOf(strings(), strings())));
- for (Pair<String, Collection<String>> e : urls.materialize()) {
- String key = e.first();
- int expectedSize = 0;
- if ("www.A.com".equals(key)) {
- expectedSize = 4;
- } else if ("www.B.com".equals(key) || "www.F.com".equals(key)) {
- expectedSize = 2;
- } else if ("www.C.com".equals(key) || "www.D.com".equals(key) || "www.E.com".equals(key)) {
- expectedSize = 1;
- }
- assertEquals("Checking key = " + key, expectedSize, e.second().size());
- p.done();
- }
- }
-
- @Test
- public void testTopN() throws Exception {
- PTableType<String, Integer> ptype = Avros.tableOf(Avros.strings(), Avros.ints());
- PTable<String, Integer> counts = MemPipeline.typedTableOf(ptype, "foo", 12, "bar", 17, "baz", 29);
-
- PTable<String, Integer> top2 = Aggregate.top(counts, 2, true);
- assertEquals(ImmutableList.of(Pair.of("baz", 29), Pair.of("bar", 17)), top2.materialize());
-
- PTable<String, Integer> bottom2 = Aggregate.top(counts, 2, false);
- assertEquals(ImmutableList.of(Pair.of("foo", 12), Pair.of("bar", 17)), bottom2.materialize());
- }
-
- @Test
- public void testCollectValues_Writables() throws IOException {
- Pipeline pipeline = new MRPipeline(AggregateIT.class, tmpDir.getDefaultConfiguration());
- Map<Integer, Collection<Text>> collectionMap = pipeline.readTextFile(tmpDir.copyResourceFileName("set2.txt"))
- .parallelDo(new MapStringToTextPair(), Writables.tableOf(Writables.ints(), Writables.writables(Text.class)))
- .collectValues().materializeToMap();
-
- assertEquals(1, collectionMap.size());
-
- assertTrue(collectionMap.get(1).containsAll(Lists.newArrayList(new Text("c"), new Text("d"), new Text("a"))));
- }
-
- @Test
- public void testCollectValues_Avro() throws IOException {
-
- MapStringToEmployeePair mapFn = new MapStringToEmployeePair();
- Pipeline pipeline = new MRPipeline(AggregateIT.class, tmpDir.getDefaultConfiguration());
- Map<Integer, Collection<Employee>> collectionMap = pipeline.readTextFile(tmpDir.copyResourceFileName("set2.txt"))
- .parallelDo(mapFn, Avros.tableOf(Avros.ints(), Avros.records(Employee.class))).collectValues()
- .materializeToMap();
-
- assertEquals(1, collectionMap.size());
-
- Employee empC = mapFn.map("c").second();
- Employee empD = mapFn.map("d").second();
- Employee empA = mapFn.map("a").second();
-
- assertTrue(collectionMap.get(1).containsAll(Lists.newArrayList(empC, empD, empA)));
- }
-
- private static class MapStringToTextPair extends MapFn<String, Pair<Integer, Text>> {
- @Override
- public Pair<Integer, Text> map(String input) {
- return Pair.of(1, new Text(input));
- }
- }
-
- private static class MapStringToEmployeePair extends MapFn<String, Pair<Integer, Employee>> {
- @Override
- public Pair<Integer, Employee> map(String input) {
- Employee emp = new Employee();
- emp.name = input;
- emp.salary = 0;
- emp.department = "";
- return Pair.of(1, emp);
- }
- }
-
- public static class PojoText {
- private String value;
-
- public PojoText() {
- this("");
- }
-
- public PojoText(String value) {
- this.value = value;
- }
-
- public String getValue() {
- return value;
- }
-
- public void setValue(String value) {
- this.value = value;
- }
-
- @Override
- public String toString() {
- return String.format("PojoText<%s>", this.value);
- }
-
- @Override
- public boolean equals(Object obj) {
- if (this == obj)
- return true;
- if (obj == null)
- return false;
- if (getClass() != obj.getClass())
- return false;
- PojoText other = (PojoText) obj;
- if (value == null) {
- if (other.value != null)
- return false;
- } else if (!value.equals(other.value))
- return false;
- return true;
- }
-
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/lib/AvroTypeSortIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/lib/AvroTypeSortIT.java b/crunch/src/it/java/org/apache/crunch/lib/AvroTypeSortIT.java
deleted file mode 100644
index a832a5d..0000000
--- a/crunch/src/it/java/org/apache/crunch/lib/AvroTypeSortIT.java
+++ /dev/null
@@ -1,145 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib;
-
-import static junit.framework.Assert.assertEquals;
-import static org.apache.crunch.types.avro.Avros.ints;
-import static org.apache.crunch.types.avro.Avros.records;
-import static org.apache.crunch.types.avro.Avros.strings;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.Serializable;
-import java.util.List;
-
-import org.apache.avro.file.DataFileWriter;
-import org.apache.avro.specific.SpecificDatumWriter;
-import org.apache.crunch.MapFn;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.io.At;
-import org.apache.crunch.test.Person;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-
-/**
- * Test sorting Avro types by selected inner field
- */
-public class AvroTypeSortIT implements Serializable {
-
- private static final long serialVersionUID = 1344118240353796561L;
-
- private transient File avroFile;
- @Rule
- public transient TemporaryPath tmpDir = TemporaryPaths.create();
-
- @Before
- public void setUp() throws IOException {
- avroFile = File.createTempFile("avrotest", ".avro");
- }
-
- @After
- public void tearDown() {
- avroFile.delete();
- }
-
- @Test
- public void testSortAvroTypesBySelectedFields() throws Exception {
-
- MRPipeline pipeline = new MRPipeline(AvroTypeSortIT.class, tmpDir.getDefaultConfiguration());
-
- Person ccc10 = createPerson("CCC", 10);
- Person bbb20 = createPerson("BBB", 20);
- Person aaa30 = createPerson("AAA", 30);
-
- writeAvroFile(Lists.newArrayList(ccc10, bbb20, aaa30), avroFile);
-
- PCollection<Person> unsorted = pipeline.read(At.avroFile(avroFile.getAbsolutePath(), records(Person.class)));
-
- // Sort by Name
- MapFn<Person, String> nameExtractor = new MapFn<Person, String>() {
-
- @Override
- public String map(Person input) {
- return input.name.toString();
- }
- };
-
- PCollection<Person> sortedByName = unsorted.by(nameExtractor, strings()).groupByKey().ungroup().values();
-
- List<Person> sortedByNameList = Lists.newArrayList(sortedByName.materialize());
-
- assertEquals(3, sortedByNameList.size());
- assertEquals(aaa30, sortedByNameList.get(0));
- assertEquals(bbb20, sortedByNameList.get(1));
- assertEquals(ccc10, sortedByNameList.get(2));
-
- // Sort by Age
-
- MapFn<Person, Integer> ageExtractor = new MapFn<Person, Integer>() {
-
- @Override
- public Integer map(Person input) {
- return input.age;
- }
- };
-
- PCollection<Person> sortedByAge = unsorted.by(ageExtractor, ints()).groupByKey().ungroup().values();
-
- List<Person> sortedByAgeList = Lists.newArrayList(sortedByAge.materialize());
-
- assertEquals(3, sortedByAgeList.size());
- assertEquals(ccc10, sortedByAgeList.get(0));
- assertEquals(bbb20, sortedByAgeList.get(1));
- assertEquals(aaa30, sortedByAgeList.get(2));
-
- pipeline.done();
- }
-
- private void writeAvroFile(List<Person> people, File avroFile) throws IOException {
-
- FileOutputStream outputStream = new FileOutputStream(avroFile);
- SpecificDatumWriter<Person> writer = new SpecificDatumWriter<Person>(Person.class);
-
- DataFileWriter<Person> dataFileWriter = new DataFileWriter<Person>(writer);
- dataFileWriter.create(Person.SCHEMA$, outputStream);
- for (Person person : people) {
- dataFileWriter.append(person);
- }
- dataFileWriter.close();
- outputStream.close();
- }
-
- private Person createPerson(String name, int age) throws IOException {
-
- Person person = new Person();
- person.age = age;
- person.name = name;
- List<CharSequence> siblingNames = Lists.newArrayList();
- person.siblingnames = siblingNames;
-
- return person;
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/lib/CogroupIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/lib/CogroupIT.java b/crunch/src/it/java/org/apache/crunch/lib/CogroupIT.java
deleted file mode 100644
index 4b28da7..0000000
--- a/crunch/src/it/java/org/apache/crunch/lib/CogroupIT.java
+++ /dev/null
@@ -1,112 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib;
-
-import static org.hamcrest.Matchers.is;
-import static org.junit.Assert.assertThat;
-
-import java.io.IOException;
-import java.util.Collection;
-import java.util.Map;
-
-import org.apache.crunch.DoFn;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.test.Tests;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.crunch.types.avro.AvroTypeFamily;
-import org.apache.crunch.types.writable.WritableTypeFamily;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.ImmutableMap;
-
-
-public class CogroupIT {
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
- private MRPipeline pipeline;
- private PCollection<String> lines1;
- private PCollection<String> lines2;
-
-
- @Before
- public void setUp() throws IOException {
- pipeline = new MRPipeline(CogroupIT.class, tmpDir.getDefaultConfiguration());
- lines1 = pipeline.readTextFile(tmpDir.copyResourceFileName(Tests.resource(this, "src1.txt")));
- lines2 = pipeline.readTextFile(tmpDir.copyResourceFileName(Tests.resource(this, "src2.txt")));
- }
-
- @After
- public void tearDown() {
- pipeline.done();
- }
-
- @Test
- public void testCogroupWritables() {
- runCogroup(WritableTypeFamily.getInstance());
- }
-
- @Test
- public void testCogroupAvro() {
- runCogroup(AvroTypeFamily.getInstance());
- }
-
- public void runCogroup(PTypeFamily ptf) {
- PTableType<String, String> tt = ptf.tableOf(ptf.strings(), ptf.strings());
-
- PTable<String, String> kv1 = lines1.parallelDo("kv1", new KeyValueSplit(), tt);
- PTable<String, String> kv2 = lines2.parallelDo("kv2", new KeyValueSplit(), tt);
-
- PTable<String, Pair<Collection<String>, Collection<String>>> cg = Cogroup.cogroup(kv1, kv2);
-
- Map<String, Pair<Collection<String>, Collection<String>>> actual = cg.materializeToMap();
-
- Map<String, Pair<Collection<String>, Collection<String>>> expected = ImmutableMap.of(
- "a", Pair.of(coll("1-1", "1-4"), coll()),
- "b", Pair.of(coll("1-2"), coll("2-1")),
- "c", Pair.of(coll("1-3"), coll("2-2", "2-3")),
- "d", Pair.of(coll(), coll("2-4"))
- );
-
- assertThat(actual, is(expected));
- }
-
-
- private static class KeyValueSplit extends DoFn<String, Pair<String, String>> {
- @Override
- public void process(String input, Emitter<Pair<String, String>> emitter) {
- String[] fields = input.split(",");
- emitter.emit(Pair.of(fields[0], fields[1]));
- }
- }
-
- private static Collection<String> coll(String... values) {
- return ImmutableList.copyOf(values);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/lib/SecondarySortIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/lib/SecondarySortIT.java b/crunch/src/it/java/org/apache/crunch/lib/SecondarySortIT.java
deleted file mode 100644
index 242f621..0000000
--- a/crunch/src/it/java/org/apache/crunch/lib/SecondarySortIT.java
+++ /dev/null
@@ -1,65 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib;
-
-import static org.apache.crunch.types.avro.Avros.*;
-import static org.junit.Assert.assertEquals;
-
-import java.io.Serializable;
-
-import org.apache.crunch.MapFn;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.io.From;
-import org.apache.crunch.test.CrunchTestSupport;
-import org.junit.Test;
-
-import com.google.common.base.Joiner;
-import com.google.common.collect.ImmutableList;
-
-
-public class SecondarySortIT extends CrunchTestSupport implements Serializable {
-
- @Test
- public void testSecondarySort() throws Exception {
- Pipeline p = new MRPipeline(SecondarySortIT.class, tempDir.getDefaultConfiguration());
- String inputFile = tempDir.copyResourceFileName("secondary_sort_input.txt");
-
- PTable<String, Pair<Integer, Integer>> in = p.read(From.textFile(inputFile))
- .parallelDo(new MapFn<String, Pair<String, Pair<Integer, Integer>>>() {
- @Override
- public Pair<String, Pair<Integer, Integer>> map(String input) {
- String[] pieces = input.split(",");
- return Pair.of(pieces[0],
- Pair.of(Integer.valueOf(pieces[1].trim()), Integer.valueOf(pieces[2].trim())));
- }
- }, tableOf(strings(), pairs(ints(), ints())));
- Iterable<String> lines = SecondarySort.sortAndApply(in, new MapFn<Pair<String, Iterable<Pair<Integer, Integer>>>, String>() {
- @Override
- public String map(Pair<String, Iterable<Pair<Integer, Integer>>> input) {
- Joiner j = Joiner.on(',');
- return j.join(input.first(), j.join(input.second()));
- }
- }, strings()).materialize();
- assertEquals(ImmutableList.of("one,[-5,10],[1,1],[2,-3]", "three,[0,-1]", "two,[1,7],[2,6],[4,5]"),
- ImmutableList.copyOf(lines));
- p.done();
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/lib/SetIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/lib/SetIT.java b/crunch/src/it/java/org/apache/crunch/lib/SetIT.java
deleted file mode 100644
index d1300d2..0000000
--- a/crunch/src/it/java/org/apache/crunch/lib/SetIT.java
+++ /dev/null
@@ -1,114 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Iterator;
-
-import org.apache.crunch.PCollection;
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.Tuple3;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.io.At;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.crunch.types.avro.AvroTypeFamily;
-import org.apache.crunch.types.writable.WritableTypeFamily;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
-import org.junit.runners.Parameterized.Parameters;
-
-import com.google.common.collect.Lists;
-
-@RunWith(value = Parameterized.class)
-public class SetIT {
-
- private PTypeFamily typeFamily;
-
- private Pipeline pipeline;
- private PCollection<String> set1;
- private PCollection<String> set2;
-
- public SetIT(PTypeFamily typeFamily) {
- this.typeFamily = typeFamily;
- }
-
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
-
- @Parameters
- public static Collection<Object[]> data() {
- Object[][] data = new Object[][] { { WritableTypeFamily.getInstance() }, { AvroTypeFamily.getInstance() } };
- return Arrays.asList(data);
- }
-
- @Before
- public void setUp() throws IOException {
- String set1InputPath = tmpDir.copyResourceFileName("set1.txt");
- String set2InputPath = tmpDir.copyResourceFileName("set2.txt");
- pipeline = new MRPipeline(SetIT.class, tmpDir.getDefaultConfiguration());
- set1 = pipeline.read(At.textFile(set1InputPath, typeFamily.strings()));
- set2 = pipeline.read(At.textFile(set2InputPath, typeFamily.strings()));
- }
-
- @After
- public void tearDown() {
- pipeline.done();
- }
-
- @Test
- public void testDifference() throws Exception {
- PCollection<String> difference = Set.difference(set1, set2);
- assertEquals(Lists.newArrayList("b", "e"), Lists.newArrayList(difference.materialize()));
- }
-
- @Test
- public void testIntersection() throws Exception {
- PCollection<String> intersection = Set.intersection(set1, set2);
- assertEquals(Lists.newArrayList("a", "c"), Lists.newArrayList(intersection.materialize()));
- }
-
- @Test
- public void testComm() throws Exception {
- PCollection<Tuple3<String, String, String>> comm = Set.comm(set1, set2);
- Iterator<Tuple3<String, String, String>> i = comm.materialize().iterator();
- checkEquals(null, null, "a", i.next());
- checkEquals("b", null, null, i.next());
- checkEquals(null, null, "c", i.next());
- checkEquals(null, "d", null, i.next());
- checkEquals("e", null, null, i.next());
- assertFalse(i.hasNext());
- }
-
- private void checkEquals(String s1, String s2, String s3, Tuple3<String, String, String> tuple) {
- assertEquals("first string", s1, tuple.first());
- assertEquals("second string", s2, tuple.second());
- assertEquals("third string", s3, tuple.third());
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/lib/SortByValueIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/lib/SortByValueIT.java b/crunch/src/it/java/org/apache/crunch/lib/SortByValueIT.java
deleted file mode 100644
index e19c7d3..0000000
--- a/crunch/src/it/java/org/apache/crunch/lib/SortByValueIT.java
+++ /dev/null
@@ -1,84 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib;
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.crunch.MapFn;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.io.From;
-import org.apache.crunch.lib.Sort.ColumnOrder;
-import org.apache.crunch.lib.Sort.Order;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.crunch.types.avro.AvroTypeFamily;
-import org.apache.crunch.types.writable.WritableTypeFamily;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.ImmutableList;
-
-/**
- *
- */
-public class SortByValueIT {
- @Rule
- public transient TemporaryPath tmpDir = TemporaryPaths.create();
-
- private static class SplitFn extends MapFn<String, Pair<String, Long>> {
- private String sep;
-
- public SplitFn(String sep) {
- this.sep = sep;
- }
-
- @Override
- public Pair<String, Long> map(String input) {
- String[] pieces = input.split(sep);
- return Pair.of(pieces[0], Long.valueOf(pieces[1]));
- }
- }
-
- @Test
- public void testSortByValueWritables() throws Exception {
- run(new MRPipeline(SortByValueIT.class), WritableTypeFamily.getInstance());
- }
-
- @Test
- public void testSortByValueAvro() throws Exception {
- run(new MRPipeline(SortByValueIT.class), AvroTypeFamily.getInstance());
- }
-
- public void run(Pipeline pipeline, PTypeFamily ptf) throws Exception {
- String sbv = tmpDir.copyResourceFileName("sort_by_value.txt");
- PTable<String, Long> letterCounts = pipeline.read(From.textFile(sbv)).parallelDo(new SplitFn("\t"),
- ptf.tableOf(ptf.strings(), ptf.longs()));
- PCollection<Pair<String, Long>> sorted = Sort.sortPairs(
- letterCounts,
- new ColumnOrder(2, Order.DESCENDING),
- new ColumnOrder(1, Order.ASCENDING));
- assertEquals(
- ImmutableList.of(Pair.of("C", 3L), Pair.of("A", 2L), Pair.of("D", 2L), Pair.of("B", 1L), Pair.of("E", 1L)),
- ImmutableList.copyOf(sorted.materialize()));
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/lib/SortIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/lib/SortIT.java b/crunch/src/it/java/org/apache/crunch/lib/SortIT.java
deleted file mode 100644
index bad4864..0000000
--- a/crunch/src/it/java/org/apache/crunch/lib/SortIT.java
+++ /dev/null
@@ -1,327 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib;
-
-import static org.apache.crunch.lib.Sort.ColumnOrder.by;
-import static org.apache.crunch.lib.Sort.Order.ASCENDING;
-import static org.apache.crunch.lib.Sort.Order.DESCENDING;
-import static org.apache.crunch.test.StringWrapper.wrap;
-import static org.junit.Assert.assertEquals;
-
-import java.io.IOException;
-import java.io.Serializable;
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.crunch.DoFn;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.MapFn;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.Tuple3;
-import org.apache.crunch.Tuple4;
-import org.apache.crunch.TupleN;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.lib.Sort.ColumnOrder;
-import org.apache.crunch.lib.Sort.Order;
-import org.apache.crunch.test.StringWrapper;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.crunch.types.avro.AvroTypeFamily;
-import org.apache.crunch.types.avro.Avros;
-import org.apache.crunch.types.writable.WritableTypeFamily;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-
-/**
- * Integration tests for {@link Sort} over single values, pairs, triples, quads, {@link TupleN}s
- * and tables, each run with the Writable and the Avro type family.
- *
- * <p>The class is {@link Serializable} because the anonymous functions created in its instance
- * methods hold a reference to the enclosing test instance; {@code tmpDir} is therefore transient.
- */
-public class SortIT implements Serializable {
-  @Rule
-  public transient TemporaryPath tmpDir = TemporaryPaths.create();
-
-  /** Creates a fresh MapReduce pipeline using the configuration supplied by {@code tmpDir}. */
-  private Pipeline newPipeline() {
-    return new MRPipeline(SortIT.class, tmpDir.getDefaultConfiguration());
-  }
-
-  @Test
-  public void testWritableSortAsc() throws Exception {
-    runSingle(newPipeline(), WritableTypeFamily.getInstance(), Order.ASCENDING, "A\tand this text as well");
-  }
-
-  @Test
-  public void testWritableSortDesc() throws Exception {
-    runSingle(newPipeline(), WritableTypeFamily.getInstance(), Order.DESCENDING, "B\tthis doc has some text");
-  }
-
-  @Test
-  public void testWritableSortAscDesc() throws Exception {
-    runPair(newPipeline(), WritableTypeFamily.getInstance(), by(1, ASCENDING), by(2, DESCENDING), "A",
-        "this doc has this text");
-  }
-
-  @Test
-  public void testWritableSortSecondDescFirstAsc() throws Exception {
-    runPair(newPipeline(), WritableTypeFamily.getInstance(), by(2, DESCENDING), by(1, ASCENDING), "A",
-        "this doc has this text");
-  }
-
-  @Test
-  public void testWritableSortTripleAscDescAsc() throws Exception {
-    runTriple(newPipeline(), WritableTypeFamily.getInstance(), by(1, ASCENDING), by(2, DESCENDING),
-        by(3, ASCENDING), "A", "this", "doc");
-  }
-
-  @Test
-  public void testWritableSortQuadAscDescAscDesc() throws Exception {
-    runQuad(newPipeline(), WritableTypeFamily.getInstance(), by(1, ASCENDING), by(2, DESCENDING),
-        by(3, ASCENDING), by(4, DESCENDING), "A", "this", "doc", "has");
-  }
-
-  @Test
-  public void testWritableSortTupleNAscDesc() throws Exception {
-    runTupleN(newPipeline(), WritableTypeFamily.getInstance(),
-        new ColumnOrder[] { by(1, ASCENDING), by(2, DESCENDING) }, new String[] { "A", "this doc has this text" });
-  }
-
-  @Test
-  public void testWritableSortTable() throws Exception {
-    runTable(newPipeline(), WritableTypeFamily.getInstance(), "A");
-  }
-
-  @Test
-  public void testAvroSortAsc() throws Exception {
-    runSingle(newPipeline(), AvroTypeFamily.getInstance(), Order.ASCENDING, "A\tand this text as well");
-  }
-
-  @Test
-  public void testAvroSortDesc() throws Exception {
-    runSingle(newPipeline(), AvroTypeFamily.getInstance(), Order.DESCENDING, "B\tthis doc has some text");
-  }
-
-  @Test
-  public void testAvroSortPairAscDesc() throws Exception {
-    runPair(newPipeline(), AvroTypeFamily.getInstance(), by(1, ASCENDING), by(2, DESCENDING), "A",
-        "this doc has this text");
-  }
-
-  @Test
-  public void testAvroSortPairSecondDescFirstAsc() throws Exception {
-    runPair(newPipeline(), AvroTypeFamily.getInstance(), by(2, DESCENDING), by(1, ASCENDING), "A",
-        "this doc has this text");
-  }
-
-  @Test
-  public void testAvroSortTripleAscDescAsc() throws Exception {
-    runTriple(newPipeline(), AvroTypeFamily.getInstance(), by(1, ASCENDING), by(2, DESCENDING),
-        by(3, ASCENDING), "A", "this", "doc");
-  }
-
-  @Test
-  public void testAvroSortQuadAscDescAscDesc() throws Exception {
-    runQuad(newPipeline(), AvroTypeFamily.getInstance(), by(1, ASCENDING), by(2, DESCENDING),
-        by(3, ASCENDING), by(4, DESCENDING), "A", "this", "doc", "has");
-  }
-
-  @Test
-  public void testAvroSortTupleNAscDesc() throws Exception {
-    runTupleN(newPipeline(), AvroTypeFamily.getInstance(),
-        new ColumnOrder[] { by(1, ASCENDING), by(2, DESCENDING) }, new String[] { "A", "this doc has this text" });
-  }
-
-  @Test
-  public void testAvroReflectSortPair() throws IOException {
-    Pipeline pipeline = newPipeline();
-    try {
-      pipeline.enableDebug();
-      String rsrc = tmpDir.copyResourceFileName("set2.txt");
-      PCollection<Pair<String, StringWrapper>> in = pipeline.readTextFile(rsrc)
-          .parallelDo(new MapFn<String, Pair<String, StringWrapper>>() {
-
-            @Override
-            public Pair<String, StringWrapper> map(String input) {
-              return Pair.of(input, wrap(input));
-            }
-          }, Avros.pairs(Avros.strings(), Avros.reflects(StringWrapper.class)));
-      PCollection<Pair<String, StringWrapper>> sorted = Sort.sort(in, Order.ASCENDING);
-
-      List<Pair<String, StringWrapper>> expected = Lists.newArrayList();
-      expected.add(Pair.of("a", wrap("a")));
-      expected.add(Pair.of("c", wrap("c")));
-      expected.add(Pair.of("d", wrap("d")));
-
-      assertEquals(expected, Lists.newArrayList(sorted.materialize()));
-    } finally {
-      pipeline.done();
-    }
-  }
-
-  @Test
-  public void testAvroReflectSortTable() throws IOException {
-    Pipeline pipeline = newPipeline();
-    try {
-      PTable<String, StringWrapper> unsorted = pipeline.readTextFile(tmpDir.copyResourceFileName("set2.txt"))
-          .parallelDo(new MapFn<String, Pair<String, StringWrapper>>() {
-
-            @Override
-            public Pair<String, StringWrapper> map(String input) {
-              return Pair.of(input, wrap(input));
-            }
-          }, Avros.tableOf(Avros.strings(), Avros.reflects(StringWrapper.class)));
-
-      PTable<String, StringWrapper> sorted = Sort.sort(unsorted);
-
-      List<Pair<String, StringWrapper>> expected = Lists.newArrayList();
-      expected.add(Pair.of("a", wrap("a")));
-      expected.add(Pair.of("c", wrap("c")));
-      expected.add(Pair.of("d", wrap("d")));
-
-      assertEquals(expected, Lists.newArrayList(sorted.materialize()));
-    } finally {
-      pipeline.done();
-    }
-  }
-
-  @Test
-  public void testAvroSortTable() throws Exception {
-    runTable(newPipeline(), AvroTypeFamily.getInstance(), "A");
-  }
-
-  /**
-   * Sorts the lines of {@code docs.txt} as plain strings and checks the first line of the result.
-   * The pipeline is always finished, even when the assertion fails.
-   */
-  private void runSingle(Pipeline pipeline, PTypeFamily typeFamily, Order order, String firstLine) throws IOException {
-    try {
-      String inputPath = tmpDir.copyResourceFileName("docs.txt");
-
-      PCollection<String> input = pipeline.readTextFile(inputPath);
-      // following turns the input from Writables to required type family
-      PCollection<String> input2 = input.parallelDo(new DoFn<String, String>() {
-        @Override
-        public void process(String input, Emitter<String> emitter) {
-          emitter.emit(input);
-        }
-      }, typeFamily.strings());
-      PCollection<String> sorted = Sort.sort(input2, order);
-      Iterable<String> lines = sorted.materialize();
-
-      assertEquals(firstLine, lines.iterator().next());
-    } finally {
-      pipeline.done();
-    }
-  }
-
-  /**
-   * Splits each line of {@code docs.txt} on tabs into a pair, sorts by the two column orders and
-   * checks both fields of the first result.
-   */
-  private void runPair(Pipeline pipeline, PTypeFamily typeFamily, ColumnOrder first, ColumnOrder second,
-      String firstField, String secondField) throws IOException {
-    try {
-      String inputPath = tmpDir.copyResourceFileName("docs.txt");
-
-      PCollection<String> input = pipeline.readTextFile(inputPath);
-      PTable<String, String> kv = input.parallelDo(new DoFn<String, Pair<String, String>>() {
-        @Override
-        public void process(String input, Emitter<Pair<String, String>> emitter) {
-          String[] split = input.split("[\t]+");
-          emitter.emit(Pair.of(split[0], split[1]));
-        }
-      }, typeFamily.tableOf(typeFamily.strings(), typeFamily.strings()));
-      PCollection<Pair<String, String>> sorted = Sort.sortPairs(kv, first, second);
-      List<Pair<String, String>> lines = Lists.newArrayList(sorted.materialize());
-      Pair<String, String> l = lines.iterator().next();
-      assertEquals(firstField, l.first());
-      assertEquals(secondField, l.second());
-    } finally {
-      pipeline.done();
-    }
-  }
-
-  /**
-   * Splits each line of {@code docs.txt} on tabs and spaces into a triple, sorts by the three
-   * column orders and checks the first result.
-   */
-  private void runTriple(Pipeline pipeline, PTypeFamily typeFamily, ColumnOrder first, ColumnOrder second,
-      ColumnOrder third, String firstField, String secondField, String thirdField) throws IOException {
-    try {
-      String inputPath = tmpDir.copyResourceFileName("docs.txt");
-
-      PCollection<String> input = pipeline.readTextFile(inputPath);
-      PCollection<Tuple3<String, String, String>> kv = input.parallelDo(
-          new DoFn<String, Tuple3<String, String, String>>() {
-            @Override
-            public void process(String input, Emitter<Tuple3<String, String, String>> emitter) {
-              String[] split = input.split("[\t ]+");
-              // The modulo wraps the index so lines with fewer tokens never go out of bounds.
-              int len = split.length;
-              emitter.emit(Tuple3.of(split[0], split[1 % len], split[2 % len]));
-            }
-          }, typeFamily.triples(typeFamily.strings(), typeFamily.strings(), typeFamily.strings()));
-      PCollection<Tuple3<String, String, String>> sorted = Sort.sortTriples(kv, first, second, third);
-      List<Tuple3<String, String, String>> lines = Lists.newArrayList(sorted.materialize());
-      Tuple3<String, String, String> l = lines.iterator().next();
-      assertEquals(firstField, l.first());
-      assertEquals(secondField, l.second());
-      assertEquals(thirdField, l.third());
-    } finally {
-      pipeline.done();
-    }
-  }
-
-  /**
-   * Splits each line of {@code docs.txt} on tabs and spaces into a quad, sorts by the four column
-   * orders and checks the first result.
-   */
-  private void runQuad(Pipeline pipeline, PTypeFamily typeFamily, ColumnOrder first, ColumnOrder second,
-      ColumnOrder third, ColumnOrder fourth, String firstField, String secondField, String thirdField,
-      String fourthField) throws IOException {
-    try {
-      String inputPath = tmpDir.copyResourceFileName("docs.txt");
-
-      PCollection<String> input = pipeline.readTextFile(inputPath);
-      PCollection<Tuple4<String, String, String, String>> kv = input.parallelDo(
-          new DoFn<String, Tuple4<String, String, String, String>>() {
-            @Override
-            public void process(String input, Emitter<Tuple4<String, String, String, String>> emitter) {
-              String[] split = input.split("[\t ]+");
-              // The modulo wraps the index so lines with fewer tokens never go out of bounds.
-              int len = split.length;
-              emitter.emit(Tuple4.of(split[0], split[1 % len], split[2 % len], split[3 % len]));
-            }
-          }, typeFamily.quads(typeFamily.strings(), typeFamily.strings(), typeFamily.strings(), typeFamily.strings()));
-      PCollection<Tuple4<String, String, String, String>> sorted = Sort.sortQuads(kv, first, second, third, fourth);
-      Iterable<Tuple4<String, String, String, String>> lines = sorted.materialize();
-      Tuple4<String, String, String, String> l = lines.iterator().next();
-      assertEquals(firstField, l.first());
-      assertEquals(secondField, l.second());
-      assertEquals(thirdField, l.third());
-      assertEquals(fourthField, l.fourth());
-    } finally {
-      pipeline.done();
-    }
-  }
-
-  /**
-   * Splits each line of {@code docs.txt} on tabs into a {@link TupleN}, sorts by {@code orders}
-   * and checks the leading fields of the first result against {@code fields}.
-   */
-  private void runTupleN(Pipeline pipeline, PTypeFamily typeFamily, ColumnOrder[] orders, String[] fields)
-      throws IOException {
-    try {
-      String inputPath = tmpDir.copyResourceFileName("docs.txt");
-
-      PCollection<String> input = pipeline.readTextFile(inputPath);
-      // One string type per sort column.
-      PType<?>[] types = new PType<?>[orders.length];
-      Arrays.fill(types, typeFamily.strings());
-      PCollection<TupleN> kv = input.parallelDo(new DoFn<String, TupleN>() {
-        @Override
-        public void process(String input, Emitter<TupleN> emitter) {
-          String[] split = input.split("[\t]+");
-          emitter.emit(new TupleN(split));
-        }
-      }, typeFamily.tuples(types));
-      PCollection<TupleN> sorted = Sort.sortTuples(kv, orders);
-      Iterable<TupleN> lines = sorted.materialize();
-      TupleN l = lines.iterator().next();
-      int i = 0;
-      for (String field : fields) {
-        assertEquals(field, l.get(i++));
-      }
-    } finally {
-      pipeline.done();
-    }
-  }
-
-  /**
-   * Splits each line of {@code docs.txt} on tabs into a table, sorts it with {@code Sort.sort}
-   * and checks the key of the first result.
-   */
-  private void runTable(Pipeline pipeline, PTypeFamily typeFamily, String firstKey) throws IOException {
-    try {
-      String inputPath = tmpDir.copyResourceFileName("docs.txt");
-
-      PCollection<String> input = pipeline.readTextFile(inputPath);
-      PTable<String, String> table = input.parallelDo(new DoFn<String, Pair<String, String>>() {
-        @Override
-        public void process(String input, Emitter<Pair<String, String>> emitter) {
-          String[] split = input.split("[\t]+");
-          emitter.emit(Pair.of(split[0], split[1]));
-        }
-      }, typeFamily.tableOf(typeFamily.strings(), typeFamily.strings()));
-
-      PTable<String, String> sorted = Sort.sort(table);
-      Iterable<Pair<String, String>> lines = sorted.materialize();
-      Pair<String, String> l = lines.iterator().next();
-      assertEquals(firstKey, l.first());
-    } finally {
-      pipeline.done();
-    }
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/lib/SpecificAvroGroupByIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/lib/SpecificAvroGroupByIT.java b/crunch/src/it/java/org/apache/crunch/lib/SpecificAvroGroupByIT.java
deleted file mode 100644
index 5292353..0000000
--- a/crunch/src/it/java/org/apache/crunch/lib/SpecificAvroGroupByIT.java
+++ /dev/null
@@ -1,119 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib;
-
-import static junit.framework.Assert.assertEquals;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.Serializable;
-import java.util.List;
-
-import org.apache.avro.file.DataFileWriter;
-import org.apache.avro.specific.SpecificDatumWriter;
-import org.apache.crunch.MapFn;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.io.At;
-import org.apache.crunch.test.Person;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.avro.Avros;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-
-/**
- * Test {@link org.apache.crunch.types.avro.SafeAvroSerialization} with Specific Avro types
- */
-public class SpecificAvroGroupByIT implements Serializable {
-
-  private static final long serialVersionUID = 1344118240353796561L;
-
-  // Scratch Avro input file, created before and deleted after each test.
-  private transient File avroFile;
-  @Rule
-  public transient TemporaryPath tmpDir = TemporaryPaths.create();
-
-
-  @Before
-  public void setUp() throws IOException {
-    avroFile = File.createTempFile("avrotest", ".avro");
-  }
-
-  @After
-  public void tearDown() {
-    avroFile.delete();
-  }
-
-  @Test
-  public void testGrouByWithSpecificAvroType() throws Exception {
-    MRPipeline pipeline = new MRPipeline(SpecificAvroGroupByIT.class, tmpDir.getDefaultConfiguration());
-    testSpecificAvro(pipeline);
-  }
-
-  /**
-   * Writes one {@link Person} record to the scratch file, keys it by name, round-trips it through
-   * {@code groupByKey().ungroup()} and checks the count and runtime types of the result.
-   *
-   * @param pipeline the pipeline to run; it is always finished via {@code done()}, even on failure
-   */
-  public void testSpecificAvro(MRPipeline pipeline) throws Exception {
-    try {
-      createPersonAvroFile(avroFile);
-
-      PCollection<Person> unsorted = pipeline.read(At.avroFile(avroFile.getAbsolutePath(), Avros.records(Person.class)));
-
-      PTable<String, Person> sorted = unsorted.parallelDo(new MapFn<Person, Pair<String, Person>>() {
-
-        @Override
-        public Pair<String, Person> map(Person input) {
-          String key = input.name.toString();
-          return Pair.of(key, input);
-
-        }
-      }, Avros.tableOf(Avros.strings(), Avros.records(Person.class))).groupByKey().ungroup();
-
-      List<Pair<String, Person>> outputPersonList = Lists.newArrayList(sorted.materialize());
-
-      assertEquals(1, outputPersonList.size());
-      assertEquals(String.class, outputPersonList.get(0).first().getClass());
-      assertEquals(Person.class, outputPersonList.get(0).second().getClass());
-    } finally {
-      pipeline.done();
-    }
-  }
-
-  /**
-   * Writes a single {@link Person} ("Bob", age 40, two sibling names) to {@code avroFile} as an
-   * Avro data file.
-   *
-   * @throws IOException if the file cannot be opened or written
-   */
-  private void createPersonAvroFile(File avroFile) throws IOException {
-
-    Person person = new Person();
-    person.age = 40;
-    person.name = "Bob";
-    List<CharSequence> siblingNames = Lists.newArrayList();
-    siblingNames.add("Bob" + "1");
-    siblingNames.add("Bob" + "2");
-    person.siblingnames = siblingNames;
-
-    SpecificDatumWriter<Person> writer = new SpecificDatumWriter<Person>(Person.class);
-
-    FileOutputStream outputStream = new FileOutputStream(avroFile);
-    try {
-      DataFileWriter<Person> dataFileWriter = new DataFileWriter<Person>(writer);
-      dataFileWriter.create(Person.SCHEMA$, outputStream);
-      dataFileWriter.append(person);
-      dataFileWriter.close();
-    } finally {
-      // Release the file handle even if create/append throws.
-      outputStream.close();
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/lib/join/FullOuterJoinIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/lib/join/FullOuterJoinIT.java b/crunch/src/it/java/org/apache/crunch/lib/join/FullOuterJoinIT.java
deleted file mode 100644
index 63d594d..0000000
--- a/crunch/src/it/java/org/apache/crunch/lib/join/FullOuterJoinIT.java
+++ /dev/null
@@ -1,51 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib.join;
-
-import static org.junit.Assert.assertTrue;
-
-import org.apache.crunch.Pair;
-import org.apache.crunch.types.PTypeFamily;
-
-public class FullOuterJoinIT extends JoinTester {
- @Override
- public void assertPassed(Iterable<Pair<String, Long>> lines) {
- boolean passed1 = false;
- boolean passed2 = false;
- boolean passed3 = false;
- for (Pair<String, Long> line : lines) {
- if ("wretched".equals(line.first()) && 24 == line.second()) {
- passed1 = true;
- }
- if ("againe".equals(line.first()) && 10 == line.second()) {
- passed2 = true;
- }
- if ("Montparnasse.".equals(line.first()) && 2 == line.second()) {
- passed3 = true;
- }
- }
- assertTrue(passed1);
- assertTrue(passed2);
- assertTrue(passed3);
- }
-
- @Override
- protected JoinFn<String, Long, Long> getJoinFn(PTypeFamily typeFamily) {
- return new FullOuterJoinFn<String, Long, Long>(typeFamily.strings(), typeFamily.longs());
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/lib/join/InnerJoinIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/lib/join/InnerJoinIT.java b/crunch/src/it/java/org/apache/crunch/lib/join/InnerJoinIT.java
deleted file mode 100644
index 4759050..0000000
--- a/crunch/src/it/java/org/apache/crunch/lib/join/InnerJoinIT.java
+++ /dev/null
@@ -1,51 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib.join;
-
-import static org.junit.Assert.assertTrue;
-
-import org.apache.crunch.Pair;
-import org.apache.crunch.types.PTypeFamily;
-
-public class InnerJoinIT extends JoinTester {
- @Override
- public void assertPassed(Iterable<Pair<String, Long>> lines) {
- boolean passed1 = false;
- boolean passed2 = true;
- boolean passed3 = true;
- for (Pair<String, Long> line : lines) {
- if ("wretched".equals(line.first()) && 24 == line.second()) {
- passed1 = true;
- }
- if ("againe".equals(line.first())) {
- passed2 = false;
- }
- if ("Montparnasse.".equals(line.first())) {
- passed3 = false;
- }
- }
- assertTrue(passed1);
- assertTrue(passed2);
- assertTrue(passed3);
- }
-
- @Override
- protected JoinFn<String, Long, Long> getJoinFn(PTypeFamily typeFamily) {
- return new InnerJoinFn<String, Long, Long>(typeFamily.strings(), typeFamily.longs());
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/lib/join/JoinTester.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/lib/join/JoinTester.java b/crunch/src/it/java/org/apache/crunch/lib/join/JoinTester.java
deleted file mode 100644
index 3e8ffda..0000000
--- a/crunch/src/it/java/org/apache/crunch/lib/join/JoinTester.java
+++ /dev/null
@@ -1,108 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib.join;
-
-import java.io.IOException;
-import java.io.Serializable;
-
-import org.apache.crunch.DoFn;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.lib.Aggregate;
-import org.apache.crunch.lib.Join;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.crunch.types.avro.AvroTypeFamily;
-import org.apache.crunch.types.writable.WritableTypeFamily;
-import org.junit.Rule;
-import org.junit.Test;
-
-/**
- * Base class for the join integration tests. It counts the words of {@code shakes.txt} and
- * {@code maugham.txt}, joins the two count tables with the {@link JoinFn} supplied by the
- * subclass, sums the two counts per word and hands the result to {@link #assertPassed}.
- */
-public abstract class JoinTester implements Serializable {
-  @Rule
-  public transient TemporaryPath tmpDir = TemporaryPaths.create();
-
-  /** Emits every whitespace-separated token of an input line. */
-  private static class WordSplit extends DoFn<String, String> {
-    @Override
-    public void process(String input, Emitter<String> emitter) {
-      for (String word : input.split("\\s+")) {
-        emitter.emit(word);
-      }
-    }
-  }
-
-  /**
-   * Builds the word-count join of the two inputs.
-   *
-   * @param w1 lines of the first text
-   * @param w2 lines of the second text
-   * @param ptf type family used for all intermediate and result types
-   * @return each joined word with the sum of its two counts; a missing side counts as 0
-   */
-  protected PTable<String, Long> join(PCollection<String> w1, PCollection<String> w2, PTypeFamily ptf) {
-    PTableType<String, Long> ntt = ptf.tableOf(ptf.strings(), ptf.longs());
-    PTable<String, Long> ws1 = Aggregate.count(w1.parallelDo("ws1", new WordSplit(), ptf.strings()));
-    PTable<String, Long> ws2 = Aggregate.count(w2.parallelDo("ws2", new WordSplit(), ptf.strings()));
-
-    PTable<String, Pair<Long, Long>> join = Join.join(ws1, ws2, getJoinFn(ptf));
-
-    PTable<String, Long> sums = join.parallelDo("cnt", new DoFn<Pair<String, Pair<Long, Long>>, Pair<String, Long>>() {
-      @Override
-      public void process(Pair<String, Pair<Long, Long>> input, Emitter<Pair<String, Long>> emitter) {
-        Pair<Long, Long> pair = input.second();
-        // Either side may be null, so treat a missing count as 0.
-        long sum = (pair.first() != null ? pair.first() : 0) + (pair.second() != null ? pair.second() : 0);
-        emitter.emit(Pair.of(input.first(), sum));
-      }
-    }, ntt);
-
-    return sums;
-  }
-
-  /**
-   * Runs the join on the two bundled texts and checks it with {@link #assertPassed}. The pipeline
-   * is always finished via {@code done()}, even when the check fails.
-   */
-  protected void run(Pipeline pipeline, PTypeFamily typeFamily) throws IOException {
-    try {
-      String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
-      String maughamInputPath = tmpDir.copyResourceFileName("maugham.txt");
-
-      PCollection<String> shakespeare = pipeline.readTextFile(shakesInputPath);
-      PCollection<String> maugham = pipeline.readTextFile(maughamInputPath);
-      PTable<String, Long> joined = join(shakespeare, maugham, typeFamily);
-      Iterable<Pair<String, Long>> lines = joined.materialize();
-
-      assertPassed(lines);
-    } finally {
-      pipeline.done();
-    }
-  }
-
-  @Test
-  public void testWritableJoin() throws Exception {
-    // Use the concrete test class rather than hard-coding one particular subclass.
-    run(new MRPipeline(getClass(), tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance());
-  }
-
-  @Test
-  public void testAvroJoin() throws Exception {
-    run(new MRPipeline(getClass(), tmpDir.getDefaultConfiguration()), AvroTypeFamily.getInstance());
-  }
-
-  /**
-   * Used to check that the result of the join makes sense.
-   *
-   * @param lines
-   *          The result of the join.
-   */
-  public abstract void assertPassed(Iterable<Pair<String, Long>> lines);
-
-  /**
-   * @return The JoinFn to use.
-   */
-  protected abstract JoinFn<String, Long, Long> getJoinFn(PTypeFamily typeFamily);
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/lib/join/LeftOuterJoinIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/lib/join/LeftOuterJoinIT.java b/crunch/src/it/java/org/apache/crunch/lib/join/LeftOuterJoinIT.java
deleted file mode 100644
index 4ad2a81..0000000
--- a/crunch/src/it/java/org/apache/crunch/lib/join/LeftOuterJoinIT.java
+++ /dev/null
@@ -1,51 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib.join;
-
-import static org.junit.Assert.assertTrue;
-
-import org.apache.crunch.Pair;
-import org.apache.crunch.types.PTypeFamily;
-
-public class LeftOuterJoinIT extends JoinTester {
- @Override
- public void assertPassed(Iterable<Pair<String, Long>> lines) {
- boolean passed1 = false;
- boolean passed2 = false;
- boolean passed3 = true;
- for (Pair<String, Long> line : lines) {
- if ("wretched".equals(line.first()) && 24 == line.second()) {
- passed1 = true;
- }
- if ("againe".equals(line.first()) && 10 == line.second()) {
- passed2 = true;
- }
- if ("Montparnasse.".equals(line.first())) {
- passed3 = false;
- }
- }
- assertTrue(passed1);
- assertTrue(passed2);
- assertTrue(passed3);
- }
-
- @Override
- protected JoinFn<String, Long, Long> getJoinFn(PTypeFamily typeFamily) {
- return new LeftOuterJoinFn<String, Long, Long>(typeFamily.strings(), typeFamily.longs());
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/lib/join/MapsideJoinIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/lib/join/MapsideJoinIT.java b/crunch/src/it/java/org/apache/crunch/lib/join/MapsideJoinIT.java
deleted file mode 100644
index 8bb5586..0000000
--- a/crunch/src/it/java/org/apache/crunch/lib/join/MapsideJoinIT.java
+++ /dev/null
@@ -1,158 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib.join;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import java.io.IOException;
-import java.util.Collections;
-import java.util.List;
-
-import org.apache.crunch.MapFn;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.PipelineResult;
-import org.apache.crunch.fn.FilterFns;
-import org.apache.crunch.fn.MapValuesFn;
-import org.apache.crunch.impl.mem.MemPipeline;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.writable.Writables;
-import org.junit.AfterClass;
-import org.junit.BeforeClass;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-
-public class MapsideJoinIT {
-
- private static String saveTempDir;
-
- @BeforeClass
- public static void setUpClass(){
-
- // Ensure a consistent temporary directory for use of the DistributedCache.
-
- // The DistributedCache technically isn't supported when running in local mode, and the default
- // temporary directiory "/tmp" is used as its location. This typically only causes an issue when
- // running integration tests on Mac OS X, as OS X doesn't use "/tmp" as it's default temporary
- // directory. The following call ensures that "/tmp" is used as the temporary directory on all platforms.
- saveTempDir = System.setProperty("java.io.tmpdir", "/tmp");
- }
-
- @AfterClass
- public static void tearDownClass(){
- System.setProperty("java.io.tmpdir", saveTempDir);
- }
-
- private static class LineSplitter extends MapFn<String, Pair<Integer, String>> {
- @Override
- public Pair<Integer, String> map(String input) {
- String[] fields = input.split("\\|");
- return Pair.of(Integer.parseInt(fields[0]), fields[1]);
- }
- }
-
- private static class CapOrdersFn extends MapValuesFn<Integer, String, String> {
- @Override
- public String map(String v) {
- return v.toUpperCase();
- }
- }
-
- private static class ConcatValuesFn extends MapValuesFn<Integer, Pair<String, String>, String> {
- @Override
- public String map(Pair<String, String> v) {
- return v.toString();
- }
- }
-
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
-
- @Test
- public void testMapSideJoin_MemPipeline() {
- runMapsideJoin(MemPipeline.getInstance(), true);
- }
-
- @Test
- public void testMapsideJoin_RightSideIsEmpty() throws IOException {
- MRPipeline pipeline = new MRPipeline(MapsideJoinIT.class, tmpDir.getDefaultConfiguration());
- PTable<Integer, String> customerTable = readTable(pipeline, "customers.txt");
- PTable<Integer, String> orderTable = readTable(pipeline, "orders.txt");
-
- PTable<Integer, String> filteredOrderTable = orderTable
- .parallelDo(FilterFns.<Pair<Integer, String>>REJECT_ALL(), orderTable.getPTableType());
-
- PTable<Integer, Pair<String, String>> joined = MapsideJoin.join(customerTable, filteredOrderTable);
-
- List<Pair<Integer, Pair<String, String>>> materializedJoin = Lists.newArrayList(joined.materialize());
-
- assertTrue(materializedJoin.isEmpty());
- }
-
- @Test
- public void testMapsideJoin() throws IOException {
- runMapsideJoin(new MRPipeline(MapsideJoinIT.class, tmpDir.getDefaultConfiguration()), false);
- }
-
- private void runMapsideJoin(Pipeline pipeline, boolean inMemory) {
- PTable<Integer, String> customerTable = readTable(pipeline, "customers.txt");
- PTable<Integer, String> orderTable = readTable(pipeline, "orders.txt");
-
- PTable<Integer, String> custOrders = MapsideJoin.join(customerTable, orderTable)
- .parallelDo("concat", new ConcatValuesFn(), Writables.tableOf(Writables.ints(), Writables.strings()));
-
- PTable<Integer, String> ORDER_TABLE = orderTable.parallelDo(new CapOrdersFn(), orderTable.getPTableType());
-
- PTable<Integer, Pair<String, String>> joined = MapsideJoin.join(custOrders, ORDER_TABLE);
-
- List<Pair<Integer, Pair<String, String>>> expectedJoinResult = Lists.newArrayList();
- expectedJoinResult.add(Pair.of(111, Pair.of("[John Doe,Corn flakes]", "CORN FLAKES")));
- expectedJoinResult.add(Pair.of(222, Pair.of("[Jane Doe,Toilet paper]", "TOILET PAPER")));
- expectedJoinResult.add(Pair.of(222, Pair.of("[Jane Doe,Toilet paper]", "TOILET PLUNGER")));
- expectedJoinResult.add(Pair.of(222, Pair.of("[Jane Doe,Toilet plunger]", "TOILET PAPER")));
- expectedJoinResult.add(Pair.of(222, Pair.of("[Jane Doe,Toilet plunger]", "TOILET PLUNGER")));
- expectedJoinResult.add(Pair.of(333, Pair.of("[Someone Else,Toilet brush]", "TOILET BRUSH")));
- Iterable<Pair<Integer, Pair<String, String>>> iter = joined.materialize();
-
- PipelineResult res = pipeline.run();
- if (!inMemory) {
- assertEquals(2, res.getStageResults().size());
- }
-
- List<Pair<Integer, Pair<String, String>>> joinedResultList = Lists.newArrayList(iter);
- Collections.sort(joinedResultList);
-
- assertEquals(expectedJoinResult, joinedResultList);
- }
-
- private PTable<Integer, String> readTable(Pipeline pipeline, String filename) {
- try {
- return pipeline.readTextFile(tmpDir.copyResourceFileName(filename)).parallelDo("asTable", new LineSplitter(),
- Writables.tableOf(Writables.ints(), Writables.strings()));
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
-
-}
[10/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/CrunchOutputs.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/CrunchOutputs.java b/crunch/src/main/java/org/apache/crunch/io/CrunchOutputs.java
deleted file mode 100644
index ccf4fb5..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/CrunchOutputs.java
+++ /dev/null
@@ -1,184 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io;
-
-import org.apache.crunch.CrunchRuntimeException;
-import org.apache.crunch.hadoop.mapreduce.TaskAttemptContextFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.OutputFormat;
-import org.apache.hadoop.mapreduce.RecordWriter;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.TaskInputOutputContext;
-import org.apache.hadoop.util.ReflectionUtils;
-
-import com.google.common.base.Joiner;
-import com.google.common.base.Splitter;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-
-import java.io.IOException;
-import java.util.List;
-import java.util.Map;
-
-/**
- * An analogue of {@link CrunchInputs} for handling multiple {@code OutputFormat} instances
- * writing to multiple files within a single MapReduce job.
- */
-public class CrunchOutputs<K, V> {
- public static final String CRUNCH_OUTPUTS = "crunch.outputs.dir";
-
- private static final char RECORD_SEP = ',';
- private static final char FIELD_SEP = ';';
- private static final Joiner JOINER = Joiner.on(FIELD_SEP);
- private static final Splitter SPLITTER = Splitter.on(FIELD_SEP);
-
- public static void addNamedOutput(Job job, String name,
- Class<? extends OutputFormat> outputFormatClass,
- Class keyClass, Class valueClass) {
- addNamedOutput(job, name, FormatBundle.forOutput(outputFormatClass), keyClass, valueClass);
- }
-
- public static void addNamedOutput(Job job, String name,
- FormatBundle<? extends OutputFormat> outputBundle,
- Class keyClass, Class valueClass) {
- Configuration conf = job.getConfiguration();
- String inputs = JOINER.join(name, outputBundle.serialize(), keyClass.getName(), valueClass.getName());
- String existing = conf.get(CRUNCH_OUTPUTS);
- conf.set(CRUNCH_OUTPUTS, existing == null ? inputs : existing + RECORD_SEP + inputs);
- }
-
- private static class OutputConfig<K, V> {
- public FormatBundle<OutputFormat<K, V>> bundle;
- public Class<K> keyClass;
- public Class<V> valueClass;
-
- public OutputConfig(FormatBundle<OutputFormat<K, V>> bundle,
- Class<K> keyClass, Class<V> valueClass) {
- this.bundle = bundle;
- this.keyClass = keyClass;
- this.valueClass = valueClass;
- }
- }
-
- private static Map<String, OutputConfig> getNamedOutputs(
- TaskInputOutputContext<?, ?, ?, ?> context) {
- Map<String, OutputConfig> out = Maps.newHashMap();
- Configuration conf = context.getConfiguration();
- for (String input : Splitter.on(RECORD_SEP).split(conf.get(CRUNCH_OUTPUTS))) {
- List<String> fields = Lists.newArrayList(SPLITTER.split(input));
- String name = fields.get(0);
- FormatBundle<OutputFormat> bundle = FormatBundle.fromSerialized(fields.get(1),
- OutputFormat.class);
- try {
- Class<?> keyClass = Class.forName(fields.get(2));
- Class<?> valueClass = Class.forName(fields.get(3));
- out.put(name, new OutputConfig(bundle, keyClass, valueClass));
- } catch (ClassNotFoundException e) {
- throw new CrunchRuntimeException(e);
- }
- }
- return out;
- }
-
- private static final String BASE_OUTPUT_NAME = "mapreduce.output.basename";
- private static final String COUNTERS_GROUP = CrunchOutputs.class.getName();
-
- private TaskInputOutputContext<?, ?, K, V> baseContext;
- private Map<String, OutputConfig> namedOutputs;
- private Map<String, RecordWriter<K, V>> recordWriters;
- private Map<String, TaskAttemptContext> taskContextCache;
-
- /**
- * Creates and initializes multiple outputs support,
- * it should be instantiated in the Mapper/Reducer setup method.
- *
- * @param context the TaskInputOutputContext object
- */
- public CrunchOutputs(TaskInputOutputContext<?, ?, K, V> context) {
- this.baseContext = context;
- namedOutputs = getNamedOutputs(context);
- recordWriters = Maps.newHashMap();
- taskContextCache = Maps.newHashMap();
- }
-
- @SuppressWarnings("unchecked")
- public void write(String namedOutput, K key, V value)
- throws IOException, InterruptedException {
- if (!namedOutputs.containsKey(namedOutput)) {
- throw new IllegalArgumentException("Undefined named output '" +
- namedOutput + "'");
- }
- TaskAttemptContext taskContext = getContext(namedOutput);
- baseContext.getCounter(COUNTERS_GROUP, namedOutput).increment(1);
- getRecordWriter(taskContext, namedOutput).write(key, value);
- }
-
- public void close() throws IOException, InterruptedException {
- for (RecordWriter<?, ?> writer : recordWriters.values()) {
- writer.close(baseContext);
- }
- }
-
- private TaskAttemptContext getContext(String nameOutput) throws IOException {
- TaskAttemptContext taskContext = taskContextCache.get(nameOutput);
- if (taskContext != null) {
- return taskContext;
- }
-
- // The following trick leverages the instantiation of a record writer via
- // the job thus supporting arbitrary output formats.
- OutputConfig outConfig = namedOutputs.get(nameOutput);
- Configuration conf = new Configuration(baseContext.getConfiguration());
- Job job = new Job(conf);
- job.getConfiguration().set("crunch.namedoutput", nameOutput);
- job.setOutputFormatClass(outConfig.bundle.getFormatClass());
- job.setOutputKeyClass(outConfig.keyClass);
- job.setOutputValueClass(outConfig.valueClass);
- outConfig.bundle.configure(job.getConfiguration());
- taskContext = TaskAttemptContextFactory.create(
- job.getConfiguration(), baseContext.getTaskAttemptID());
-
- taskContextCache.put(nameOutput, taskContext);
- return taskContext;
- }
-
- private synchronized RecordWriter<K, V> getRecordWriter(
- TaskAttemptContext taskContext, String namedOutput)
- throws IOException, InterruptedException {
- // look for record-writer in the cache
- RecordWriter<K, V> writer = recordWriters.get(namedOutput);
-
- // If not in cache, create a new one
- if (writer == null) {
- // get the record writer from context output format
- taskContext.getConfiguration().set(BASE_OUTPUT_NAME, namedOutput);
- try {
- OutputFormat format = ReflectionUtils.newInstance(
- taskContext.getOutputFormatClass(),
- taskContext.getConfiguration());
- writer = format.getRecordWriter(taskContext);
- } catch (ClassNotFoundException e) {
- throw new IOException(e);
- }
- recordWriters.put(namedOutput, writer);
- }
-
- return writer;
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/FileNamingScheme.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/FileNamingScheme.java b/crunch/src/main/java/org/apache/crunch/io/FileNamingScheme.java
deleted file mode 100644
index cf93651..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/FileNamingScheme.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io;
-
-import java.io.IOException;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-
-/**
- * Encapsulates rules for naming output files. It is the responsibility of
- * implementors to avoid file name collisions.
- */
-public interface FileNamingScheme {
-
- /**
- * Get the output file name for a map task. Note that the implementation is
- * responsible for avoiding naming collisions.
- *
- * @param configuration The configuration of the job for which the map output
- * is being written
- * @param outputDirectory The directory where the output will be written
- * @return The filename for the output of the map task
- * @throws IOException if an exception occurs while accessing the output file
- * system
- */
- String getMapOutputName(Configuration configuration, Path outputDirectory) throws IOException;
-
- /**
- * Get the output file name for a reduce task. Note that the implementation is
- * responsible for avoiding naming collisions.
- *
- * @param configuration The configuration of the job for which output is being
- * written
- * @param outputDirectory The directory where the file will be written
- * @param partitionId The partition of the reduce task being output
- * @return The filename for the output of the reduce task
- * @throws IOException if an exception occurs while accessing output file
- * system
- */
- String getReduceOutputName(Configuration configuration, Path outputDirectory, int partitionId) throws IOException;
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/FileReaderFactory.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/FileReaderFactory.java b/crunch/src/main/java/org/apache/crunch/io/FileReaderFactory.java
deleted file mode 100644
index 5cccb7b..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/FileReaderFactory.java
+++ /dev/null
@@ -1,27 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io;
-
-import java.util.Iterator;
-
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-
-public interface FileReaderFactory<T> {
- Iterator<T> read(FileSystem fs, Path path);
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/FormatBundle.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/FormatBundle.java b/crunch/src/main/java/org/apache/crunch/io/FormatBundle.java
deleted file mode 100644
index d969009..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/FormatBundle.java
+++ /dev/null
@@ -1,121 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.ObjectInputStream;
-import java.io.ObjectOutputStream;
-import java.io.Serializable;
-import java.util.Map;
-
-import org.apache.commons.codec.binary.Base64;
-import org.apache.commons.lang.builder.HashCodeBuilder;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.mapreduce.InputFormat;
-import org.apache.hadoop.mapreduce.OutputFormat;
-
-import com.google.common.collect.Maps;
-
-/**
- * A combination of an {@link InputFormat} or {@link OutputFormat} and any extra
- * configuration information that format class needs to run.
- *
- * <p>The {@code FormatBundle} allow us to let different formats act as
- * if they are the only format that exists in a particular MapReduce job, even
- * when we have multiple types of inputs and outputs within a single job.
- */
-public class FormatBundle<K> implements Serializable {
-
- private Class<K> formatClass;
- private Map<String, String> extraConf;
-
- public static <T> FormatBundle<T> fromSerialized(String serialized, Class<T> clazz) {
- ByteArrayInputStream bais = new ByteArrayInputStream(Base64.decodeBase64(serialized));
- try {
- ObjectInputStream ois = new ObjectInputStream(bais);
- FormatBundle<T> bundle = (FormatBundle<T>) ois.readObject();
- ois.close();
- return bundle;
- } catch (IOException e) {
- throw new RuntimeException(e);
- } catch (ClassNotFoundException e) {
- throw new RuntimeException(e);
- }
- }
-
- public static <T extends InputFormat<?, ?>> FormatBundle<T> forInput(Class<T> inputFormatClass) {
- return new FormatBundle<T>(inputFormatClass);
- }
-
- public static <T extends OutputFormat<?, ?>> FormatBundle<T> forOutput(Class<T> inputFormatClass) {
- return new FormatBundle<T>(inputFormatClass);
- }
-
- private FormatBundle(Class<K> formatClass) {
- this.formatClass = formatClass;
- this.extraConf = Maps.newHashMap();
- }
-
- public FormatBundle<K> set(String key, String value) {
- this.extraConf.put(key, value);
- return this;
- }
-
- public Class<K> getFormatClass() {
- return formatClass;
- }
-
- public Configuration configure(Configuration conf) {
- for (Map.Entry<String, String> e : extraConf.entrySet()) {
- conf.set(e.getKey(), e.getValue());
- }
- return conf;
- }
-
- public String serialize() {
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- try {
- ObjectOutputStream oos = new ObjectOutputStream(baos);
- oos.writeObject(this);
- oos.close();
- return Base64.encodeBase64String(baos.toByteArray());
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
-
- public String getName() {
- return formatClass.getSimpleName();
- }
-
- @Override
- public int hashCode() {
- return new HashCodeBuilder().append(formatClass).append(extraConf).toHashCode();
- }
-
- @Override
- public boolean equals(Object other) {
- if (other == null || !(other instanceof FormatBundle)) {
- return false;
- }
- FormatBundle<K> oib = (FormatBundle<K>) other;
- return formatClass.equals(oib.formatClass) && extraConf.equals(oib.extraConf);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/From.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/From.java b/crunch/src/main/java/org/apache/crunch/io/From.java
deleted file mode 100644
index e4cfb6a..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/From.java
+++ /dev/null
@@ -1,324 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io;
-
-import org.apache.avro.specific.SpecificRecord;
-import org.apache.crunch.Source;
-import org.apache.crunch.TableSource;
-import org.apache.crunch.io.avro.AvroFileSource;
-import org.apache.crunch.io.impl.FileTableSourceImpl;
-import org.apache.crunch.io.seq.SeqFileSource;
-import org.apache.crunch.io.seq.SeqFileTableSource;
-import org.apache.crunch.io.text.TextFileSource;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.crunch.types.avro.AvroType;
-import org.apache.crunch.types.avro.Avros;
-import org.apache.crunch.types.writable.Writables;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-
-/**
- * <p>Static factory methods for creating common {@link Source} types.</p>
- *
- * <p>The {@code From} class is intended to provide a literate API for creating
- * Crunch pipelines from common input file types.
- *
- * <code>
- * Pipeline pipeline = new MRPipeline(this.getClass());
- *
- * // Reference the lines of a text file by wrapping the TextInputFormat class.
- * PCollection<String> lines = pipeline.read(From.textFile("/path/to/myfiles"));
- *
- * // Reference entries from a sequence file where the key is a LongWritable and the
- * // value is a custom Writable class.
- * PTable<LongWritable, MyWritable> table = pipeline.read(From.sequenceFile(
- * "/path/to/seqfiles", LongWritable.class, MyWritable.class));
- *
- * // Reference the records from an Avro file, where MyAvroObject implements Avro's
- * // SpecificRecord interface.
- * PCollection<MyAvroObject> myObjects = pipeline.read(From.avroFile("/path/to/avrofiles",
- * MyAvroObject.class));
- *
- * // References the key-value pairs from a custom extension of FileInputFormat:
- * PTable<KeyWritable, ValueWritable> custom = pipeline.read(From.formattedFile(
- * "/custom", MyFileInputFormat.class, KeyWritable.class, ValueWritable.class));
- * </code>
- * </p>
- */
-public class From {
-
- /**
- * Creates a {@code TableSource<K, V>} for reading data from files that have custom
- * {@code FileInputFormat<K, V>} implementations not covered by the provided {@code TableSource}
- * and {@code Source} factory methods.
- *
- * @param pathName The name of the path to the data on the filesystem
- * @param formatClass The {@code FileInputFormat} implementation
- * @param keyClass The {@code Writable} to use for the key
- * @param valueClass The {@code Writable} to use for the value
- * @return A new {@code TableSource<K, V>} instance
- */
- public static <K extends Writable, V extends Writable> TableSource<K, V> formattedFile(
- String pathName, Class<? extends FileInputFormat<K, V>> formatClass,
- Class<K> keyClass, Class<V> valueClass) {
- return formattedFile(new Path(pathName), formatClass, keyClass, valueClass);
- }
-
- /**
- * Creates a {@code TableSource<K, V>} for reading data from files that have custom
- * {@code FileInputFormat<K, V>} implementations not covered by the provided {@code TableSource}
- * and {@code Source} factory methods.
- *
- * @param The {@code Path} to the data
- * @param formatClass The {@code FileInputFormat} implementation
- * @param keyClass The {@code Writable} to use for the key
- * @param valueClass The {@code Writable} to use for the value
- * @return A new {@code TableSource<K, V>} instance
- */
- public static <K extends Writable, V extends Writable> TableSource<K, V> formattedFile(
- Path path, Class<? extends FileInputFormat<K, V>> formatClass,
- Class<K> keyClass, Class<V> valueClass) {
- return formattedFile(path, formatClass, Writables.writables(keyClass),
- Writables.writables(valueClass));
- }
-
- /**
- * Creates a {@code TableSource<K, V>} for reading data from files that have custom
- * {@code FileInputFormat} implementations not covered by the provided {@code TableSource}
- * and {@code Source} factory methods.
- *
- * @param pathName The name of the path to the data on the filesystem
- * @param formatClass The {@code FileInputFormat} implementation
- * @param keyType The {@code PType} to use for the key
- * @param valueType The {@code PType} to use for the value
- * @return A new {@code TableSource<K, V>} instance
- */
- public static <K, V> TableSource<K, V> formattedFile(String pathName,
- Class<? extends FileInputFormat<?, ?>> formatClass,
- PType<K> keyType, PType<V> valueType) {
- return formattedFile(new Path(pathName), formatClass, keyType, valueType);
- }
-
- /**
- * Creates a {@code TableSource<K, V>} for reading data from files that have custom
- * {@code FileInputFormat} implementations not covered by the provided {@code TableSource}
- * and {@code Source} factory methods.
- *
- * @param The {@code Path} to the data
- * @param formatClass The {@code FileInputFormat} implementation
- * @param keyType The {@code PType} to use for the key
- * @param valueType The {@code PType} to use for the value
- * @return A new {@code TableSource<K, V>} instance
- */
- public static <K, V> TableSource<K, V> formattedFile(Path path,
- Class<? extends FileInputFormat<?, ?>> formatClass,
- PType<K> keyType, PType<V> valueType) {
- PTableType<K, V> tableType = keyType.getFamily().tableOf(keyType, valueType);
- return new FileTableSourceImpl<K, V>(path, tableType, formatClass);
- }
-
- /**
- * Creates a {@code Source<T>} instance from the Avro file(s) at the given path name.
- *
- * @param pathName The name of the path to the data on the filesystem
- * @param avroClass The subclass of {@code SpecificRecord} to use for the Avro file
- * @return A new {@code Source<T>} instance
- */
- public static <T extends SpecificRecord> Source<T> avroFile(String pathName, Class<T> avroClass) {
- return avroFile(new Path(pathName), avroClass);
- }
-
- /**
- * Creates a {@code Source<T>} instance from the Avro file(s) at the given {@code Path}.
- *
- * @param path The {@code Path} to the data
- * @param avroClass The subclass of {@code SpecificRecord} to use for the Avro file
- * @return A new {@code Source<T>} instance
- */
- public static <T extends SpecificRecord> Source<T> avroFile(Path path, Class<T> avroClass) {
- return avroFile(path, Avros.specifics(avroClass));
- }
-
- /**
- * Creates a {@code Source<T>} instance from the Avro file(s) at the given path name.
- *
- * @param pathName The name of the path to the data on the filesystem
- * @param avroType The {@code AvroType} for the Avro records
- * @return A new {@code Source<T>} instance
- */
- public static <T> Source<T> avroFile(String pathName, AvroType<T> avroType) {
- return avroFile(new Path(pathName), avroType);
- }
-
- /**
- * Creates a {@code Source<T>} instance from the Avro file(s) at the given {@code Path}.
- *
- * @param path The {@code Path} to the data
- * @param avroType The {@code AvroType} for the Avro records
- * @return A new {@code Source<T>} instance
- */
- public static <T> Source<T> avroFile(Path path, AvroType<T> avroType) {
- return new AvroFileSource<T>(path, avroType);
- }
-
- /**
- * Creates a {@code Source<T>} instance from the SequenceFile(s) at the given path name
- * from the value field of each key-value pair in the SequenceFile(s).
- *
- * @param pathName The name of the path to the data on the filesystem
- * @param valueClass The {@code Writable} type for the value of the SequenceFile entry
- * @return A new {@code Source<T>} instance
- */
- public static <T extends Writable> Source<T> sequenceFile(String pathName, Class<T> valueClass) {
- return sequenceFile(new Path(pathName), valueClass);
- }
-
- /**
- * Creates a {@code Source<T>} instance from the SequenceFile(s) at the given {@code Path}
- * from the value field of each key-value pair in the SequenceFile(s).
- *
- * @param path The {@code Path} to the data
- * @param valueClass The {@code Writable} type for the value of the SequenceFile entry
- * @return A new {@code Source<T>} instance
- */
- public static <T extends Writable> Source<T> sequenceFile(Path path, Class<T> valueClass) {
- return sequenceFile(path, Writables.writables(valueClass));
- }
-
- /**
- * Creates a {@code Source<T>} instance from the SequenceFile(s) at the given path name
- * from the value field of each key-value pair in the SequenceFile(s).
- *
- * @param pathName The name of the path to the data on the filesystem
- * @param ptype The {@code PType} for the value of the SequenceFile entry
- * @return A new {@code Source<T>} instance
- */
- public static <T> Source<T> sequenceFile(String pathName, PType<T> ptype) {
- return sequenceFile(new Path(pathName), ptype);
- }
-
- /**
- * Creates a {@code Source<T>} instance from the SequenceFile(s) at the given {@code Path}
- * from the value field of each key-value pair in the SequenceFile(s).
- *
- * @param path The {@code Path} to the data
- * @param ptype The {@code PType} for the value of the SequenceFile entry
- * @return A new {@code Source<T>} instance
- */
- public static <T> Source<T> sequenceFile(Path path, PType<T> ptype) {
- return new SeqFileSource<T>(path, ptype);
- }
-
- /**
- * Creates a {@code TableSource<K, V>} instance for the SequenceFile(s) at the given path name.
- *
- * @param pathName The name of the path to the data on the filesystem
- * @param keyClass The {@code Writable} subclass for the key of the SequenceFile entry
- * @param valueClass The {@code Writable} subclass for the value of the SequenceFile entry
- * @return A new {@code SourceTable<K, V>} instance
- */
- public static <K extends Writable, V extends Writable> TableSource<K, V> sequenceFile(
- String pathName, Class<K> keyClass, Class<V> valueClass) {
- return sequenceFile(new Path(pathName), keyClass, valueClass);
- }
-
- /**
- * Creates a {@code TableSource<K, V>} instance for the SequenceFile(s) at the given {@code Path}.
- *
- * @param path The {@code Path} to the data
- * @param keyClass The {@code Writable} subclass for the key of the SequenceFile entry
- * @param valueClass The {@code Writable} subclass for the value of the SequenceFile entry
- * @return A new {@code SourceTable<K, V>} instance
- */
- public static <K extends Writable, V extends Writable> TableSource<K, V> sequenceFile(
- Path path, Class<K> keyClass, Class<V> valueClass) {
- return sequenceFile(path, Writables.writables(keyClass), Writables.writables(valueClass));
- }
-
- /**
- * Creates a {@code TableSource<K, V>} instance for the SequenceFile(s) at the given path name.
- *
- * @param pathName The name of the path to the data on the filesystem
- * @param keyType The {@code PType} for the key of the SequenceFile entry
- * @param valueType The {@code PType} for the value of the SequenceFile entry
- * @return A new {@code SourceTable<K, V>} instance
- */
- public static <K, V> TableSource<K, V> sequenceFile(String pathName, PType<K> keyType, PType<V> valueType) {
- return sequenceFile(new Path(pathName), keyType, valueType);
- }
-
- /**
- * Creates a {@code TableSource<K, V>} instance for the SequenceFile(s) at the given {@code Path}.
- *
- * @param path The {@code Path} to the data
- * @param keyType The {@code PType} for the key of the SequenceFile entry
- * @param valueType The {@code PType} for the value of the SequenceFile entry
- * @return A new {@code SourceTable<K, V>} instance
- */
- public static <K, V> TableSource<K, V> sequenceFile(Path path, PType<K> keyType, PType<V> valueType) {
- PTypeFamily ptf = keyType.getFamily();
- return new SeqFileTableSource<K, V>(path, ptf.tableOf(keyType, valueType));
- }
-
- /**
- * Creates a {@code Source<String>} instance for the text file(s) at the given path name.
- *
- * @param pathName The name of the path to the data on the filesystem
- * @return A new {@code Source<String>} instance
- */
- public static Source<String> textFile(String pathName) {
- return textFile(new Path(pathName));
- }
-
- /**
- * Creates a {@code Source<String>} instance for the text file(s) at the given {@code Path}.
- *
- * @param path The {@code Path} to the data
- * @return A new {@code Source<String>} instance
- */
- public static Source<String> textFile(Path path) {
- return textFile(path, Writables.strings());
- }
-
- /**
- * Creates a {@code Source<T>} instance for the text file(s) at the given path name using
- * the provided {@code PType<T>} to convert the input text.
- *
- * @param pathName The name of the path to the data on the filesystem
- * @param ptype The {@code PType<T>} to use to process the input text
- * @return A new {@code Source<T>} instance
- */
- public static <T> Source<T> textFile(String pathName, PType<T> ptype) {
- return textFile(new Path(pathName), ptype);
- }
-
- /**
- * Creates a {@code Source<T>} instance for the text file(s) at the given {@code Path} using
- * the provided {@code PType<T>} to convert the input text.
- *
- * @param path The {@code Path} to the data
- * @param ptype The {@code PType<T>} to use to process the input text
- * @return A new {@code Source<T>} instance
- */
- public static <T> Source<T> textFile(Path path, PType<T> ptype) {
- return new TextFileSource<T>(path, ptype);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/MapReduceTarget.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/MapReduceTarget.java b/crunch/src/main/java/org/apache/crunch/io/MapReduceTarget.java
deleted file mode 100644
index b484103..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/MapReduceTarget.java
+++ /dev/null
@@ -1,27 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io;
-
-import org.apache.crunch.Target;
-import org.apache.crunch.types.PType;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.Job;
-
-public interface MapReduceTarget extends Target {
- void configureForMapReduce(Job job, PType<?> ptype, Path outputPath, String name);
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/OutputHandler.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/OutputHandler.java b/crunch/src/main/java/org/apache/crunch/io/OutputHandler.java
deleted file mode 100644
index 01d7f99..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/OutputHandler.java
+++ /dev/null
@@ -1,25 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io;
-
-import org.apache.crunch.Target;
-import org.apache.crunch.types.PType;
-
-public interface OutputHandler {
- boolean configure(Target target, PType<?> ptype);
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/PathTarget.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/PathTarget.java b/crunch/src/main/java/org/apache/crunch/io/PathTarget.java
deleted file mode 100644
index 7a35209..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/PathTarget.java
+++ /dev/null
@@ -1,36 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io;
-
-import org.apache.hadoop.fs.Path;
-
-/**
- * A target whose output goes to a given path on a file system.
- */
-public interface PathTarget extends MapReduceTarget {
-
- Path getPath();
-
- /**
- * Get the naming scheme to be used for outputs being written to an output
- * path.
- *
- * @return the naming scheme to be used
- */
- FileNamingScheme getFileNamingScheme();
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/PathTargetImpl.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/PathTargetImpl.java b/crunch/src/main/java/org/apache/crunch/io/PathTargetImpl.java
deleted file mode 100644
index 0be3f9a..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/PathTargetImpl.java
+++ /dev/null
@@ -1,64 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io;
-
-import org.apache.crunch.types.PType;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.OutputFormat;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
-
-public abstract class PathTargetImpl implements PathTarget {
-
- private final Path path;
- private final Class<OutputFormat> outputFormatClass;
- private final Class keyClass;
- private final Class valueClass;
-
- public PathTargetImpl(String path, Class<OutputFormat> outputFormatClass, Class keyClass, Class valueClass) {
- this(new Path(path), outputFormatClass, keyClass, valueClass);
- }
-
- public PathTargetImpl(Path path, Class<OutputFormat> outputFormatClass, Class keyClass, Class valueClass) {
- this.path = path;
- this.outputFormatClass = outputFormatClass;
- this.keyClass = keyClass;
- this.valueClass = valueClass;
- }
-
- @Override
- public void configureForMapReduce(Job job, PType<?> ptype, Path outputPath, String name) {
- try {
- FileOutputFormat.setOutputPath(job, path);
- } catch (Exception e) {
- throw new RuntimeException(e);
- }
- if (name == null) {
- job.setOutputFormatClass(outputFormatClass);
- job.setOutputKeyClass(keyClass);
- job.setOutputValueClass(valueClass);
- } else {
- CrunchOutputs.addNamedOutput(job, name, outputFormatClass, keyClass, valueClass);
- }
- }
-
- @Override
- public Path getPath() {
- return path;
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/ReadableSource.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/ReadableSource.java b/crunch/src/main/java/org/apache/crunch/io/ReadableSource.java
deleted file mode 100644
index 0407167..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/ReadableSource.java
+++ /dev/null
@@ -1,41 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io;
-
-import java.io.IOException;
-
-import org.apache.crunch.Source;
-import org.apache.hadoop.conf.Configuration;
-
-/**
- * An extension of the {@code Source} interface that indicates that a
- * {@code Source} instance may be read as a series of records by the client
- * code. This is used to determine whether a {@code PCollection} instance can be
- * materialized.
- */
-public interface ReadableSource<T> extends Source<T> {
-
- /**
- * Returns an {@code Iterable} that contains the contents of this source.
- *
- * @param conf The current {@code Configuration} instance
- * @return the contents of this {@code Source} as an {@code Iterable} instance
- * @throws IOException
- */
- Iterable<T> read(Configuration conf) throws IOException;
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/ReadableSourceTarget.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/ReadableSourceTarget.java b/crunch/src/main/java/org/apache/crunch/io/ReadableSourceTarget.java
deleted file mode 100644
index 95c90aa..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/ReadableSourceTarget.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io;
-
-import org.apache.crunch.SourceTarget;
-
-/**
- * An interface that indicates that a {@code SourceTarget} instance can be read
- * into the local client.
- *
- * @param <T>
- * The type of data read.
- */
-public interface ReadableSourceTarget<T> extends ReadableSource<T>, SourceTarget<T> {
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/SequentialFileNamingScheme.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/SequentialFileNamingScheme.java b/crunch/src/main/java/org/apache/crunch/io/SequentialFileNamingScheme.java
deleted file mode 100644
index bdda8e6..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/SequentialFileNamingScheme.java
+++ /dev/null
@@ -1,51 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io;
-
-import java.io.IOException;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-
-/**
- * Default {@link FileNamingScheme} that uses an incrementing sequence number in
- * order to generate unique file names.
- */
-public class SequentialFileNamingScheme implements FileNamingScheme {
-
- @Override
- public String getMapOutputName(Configuration configuration, Path outputDirectory) throws IOException {
- return getSequentialFileName(configuration, outputDirectory, "m");
- }
-
- @Override
- public String getReduceOutputName(Configuration configuration, Path outputDirectory, int partitionId)
- throws IOException {
- return getSequentialFileName(configuration, outputDirectory, "r");
- }
-
- private String getSequentialFileName(Configuration configuration, Path outputDirectory, String jobTypeName)
- throws IOException {
- FileSystem fileSystem = outputDirectory.getFileSystem(configuration);
- int fileSequenceNumber = fileSystem.listStatus(outputDirectory).length;
-
- return String.format("part-%s-%05d", jobTypeName, fileSequenceNumber);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/SourceTargetHelper.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/SourceTargetHelper.java b/crunch/src/main/java/org/apache/crunch/io/SourceTargetHelper.java
deleted file mode 100644
index f4400de..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/SourceTargetHelper.java
+++ /dev/null
@@ -1,48 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io;
-
-import java.io.IOException;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-
-/**
- * Functions for configuring the inputs/outputs of MapReduce jobs.
- *
- */
-public class SourceTargetHelper {
-
- public static long getPathSize(Configuration conf, Path path) throws IOException {
- return getPathSize(path.getFileSystem(conf), path);
- }
-
- public static long getPathSize(FileSystem fs, Path path) throws IOException {
- FileStatus[] stati = fs.globStatus(path);
- if (stati == null || stati.length == 0) {
- return -1L;
- }
- long size = 0;
- for (FileStatus status : stati) {
- size += fs.getContentSummary(status.getPath()).getLength();
- }
- return size;
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/To.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/To.java b/crunch/src/main/java/org/apache/crunch/io/To.java
deleted file mode 100644
index d62d294..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/To.java
+++ /dev/null
@@ -1,153 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io;
-
-import org.apache.crunch.Target;
-import org.apache.crunch.io.avro.AvroFileTarget;
-import org.apache.crunch.io.impl.FileTargetImpl;
-import org.apache.crunch.io.seq.SeqFileTarget;
-import org.apache.crunch.io.text.TextFileTarget;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
-
-/**
- * <p>Static factory methods for creating common {@link Target} types.</p>
- *
- * <p>The {@code To} class is intended to be used as part of a literate API
- * for writing the output of Crunch pipelines to common file types. We can use
- * the {@code Target} objects created by the factory methods in the {@code To}
- * class with either the {@code write} method on the {@code Pipeline} class or
- * the convenience {@code write} method on {@code PCollection} and {@code PTable}
- * instances.
- *
- * <code>
- * Pipeline pipeline = new MRPipeline(this.getClass());
- * ...
- * // Write a PCollection<String> to a text file:
- * PCollection<String> words = ...;
- * pipeline.write(words, To.textFile("/put/my/words/here"));
- *
- * // Write a PTable<Text, Text> to a sequence file:
- * PTable<Text, Text> textToText = ...;
- * textToText.write(To.sequenceFile("/words/to/words"));
- *
- * // Write a PCollection<MyAvroObject> to an Avro data file:
- * PCollection<MyAvroObject> objects = ...;
- * objects.write(To.avroFile("/my/avro/files"));
- *
- * // Write a PTable to a custom FileOutputFormat:
- * PTable<KeyWritable, ValueWritable> custom = ...;
- * pipeline.write(custom, To.formattedFile("/custom", MyFileFormat.class));
- * </code>
- * </p>
- */
-public class To {
-
- /**
- * Creates a {@code Target} at the given path name that writes data to
- * a custom {@code FileOutputFormat}.
- *
- * @param pathName The name of the path to write the data to on the filesystem
- * @param formatClass The {@code FileOutputFormat<K, V>} to write the data to
- * @return A new {@code Target} instance
- */
- public static <K extends Writable, V extends Writable> Target formattedFile(
- String pathName, Class<? extends FileOutputFormat<K, V>> formatClass) {
- return formattedFile(new Path(pathName), formatClass);
- }
-
- /**
- * Creates a {@code Target} at the given {@code Path} that writes data to
- * a custom {@code FileOutputFormat}.
- *
- * @param path The {@code Path} to write the data to
- * @param formatClass The {@code FileOutputFormat} to write the data to
- * @return A new {@code Target} instance
- */
- public static <K extends Writable, V extends Writable> Target formattedFile(
- Path path, Class<? extends FileOutputFormat<K, V>> formatClass) {
- return new FileTargetImpl(path, formatClass, new SequentialFileNamingScheme());
- }
-
- /**
- * Creates a {@code Target} at the given path name that writes data to
- * Avro files. The {@code PType} for the written data must be for Avro records.
- *
- * @param pathName The name of the path to write the data to on the filesystem
- * @return A new {@code Target} instance
- */
- public static Target avroFile(String pathName) {
- return avroFile(new Path(pathName));
- }
-
- /**
- * Creates a {@code Target} at the given {@code Path} that writes data to
- * Avro files. The {@code PType} for the written data must be for Avro records.
- *
- * @param path The {@code Path} to write the data to
- * @return A new {@code Target} instance
- */
- public static Target avroFile(Path path) {
- return new AvroFileTarget(path);
- }
-
- /**
- * Creates a {@code Target} at the given path name that writes data to
- * SequenceFiles.
- *
- * @param pathName The name of the path to write the data to on the filesystem
- * @return A new {@code Target} instance
- */
- public static Target sequenceFile(String pathName) {
- return sequenceFile(new Path(pathName));
- }
-
- /**
- * Creates a {@code Target} at the given {@code Path} that writes data to
- * SequenceFiles.
- *
- * @param path The {@code Path} to write the data to
- * @return A new {@code Target} instance
- */
- public static Target sequenceFile(Path path) {
- return new SeqFileTarget(path);
- }
-
- /**
- * Creates a {@code Target} at the given path name that writes data to
- * text files.
- *
- * @param pathName The name of the path to write the data to on the filesystem
- * @return A new {@code Target} instance
- */
- public static Target textFile(String pathName) {
- return textFile(new Path(pathName));
- }
-
- /**
- * Creates a {@code Target} at the given {@code Path} that writes data to
- * text files.
- *
- * @param path The {@code Path} to write the data to
- * @return A new {@code Target} instance
- */
- public static Target textFile(Path path) {
- return new TextFileTarget(path);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/avro/AvroFileReaderFactory.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/avro/AvroFileReaderFactory.java b/crunch/src/main/java/org/apache/crunch/io/avro/AvroFileReaderFactory.java
deleted file mode 100644
index c8fe23a..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/avro/AvroFileReaderFactory.java
+++ /dev/null
@@ -1,96 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.avro;
-
-import java.io.IOException;
-import java.util.Iterator;
-
-import org.apache.avro.Schema;
-import org.apache.avro.file.DataFileReader;
-import org.apache.avro.generic.GenericDatumReader;
-import org.apache.avro.io.DatumReader;
-import org.apache.avro.mapred.FsInput;
-import org.apache.avro.reflect.ReflectDatumReader;
-import org.apache.avro.specific.SpecificDatumReader;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.crunch.MapFn;
-import org.apache.crunch.fn.IdentityFn;
-import org.apache.crunch.io.FileReaderFactory;
-import org.apache.crunch.io.impl.AutoClosingIterator;
-import org.apache.crunch.types.avro.AvroType;
-import org.apache.crunch.types.avro.Avros;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-
-import com.google.common.collect.Iterators;
-import com.google.common.collect.UnmodifiableIterator;
-
-public class AvroFileReaderFactory<T> implements FileReaderFactory<T> {
-
- private static final Log LOG = LogFactory.getLog(AvroFileReaderFactory.class);
-
- private final DatumReader<T> recordReader;
- private final MapFn<T, T> mapFn;
-
- public AvroFileReaderFactory(AvroType<T> atype) {
- this.recordReader = createDatumReader(atype);
- this.mapFn = (MapFn<T, T>) atype.getInputMapFn();
- }
-
- public AvroFileReaderFactory(Schema schema) {
- this.recordReader = new GenericDatumReader<T>(schema);
- this.mapFn = IdentityFn.<T>getInstance();
- }
-
- static <T> DatumReader<T> createDatumReader(AvroType<T> avroType) {
- if (avroType.hasReflect()) {
- if (avroType.hasSpecific()) {
- Avros.checkCombiningSpecificAndReflectionSchemas();
- }
- return new ReflectDatumReader<T>(avroType.getSchema());
- } else if (avroType.hasSpecific()) {
- return new SpecificDatumReader<T>(avroType.getSchema());
- } else {
- return new GenericDatumReader<T>(avroType.getSchema());
- }
- }
-
- @Override
- public Iterator<T> read(FileSystem fs, final Path path) {
- this.mapFn.initialize();
- try {
- FsInput fsi = new FsInput(path, fs.getConf());
- final DataFileReader<T> reader = new DataFileReader<T>(fsi, recordReader);
- return new AutoClosingIterator<T>(reader, new UnmodifiableIterator<T>() {
- @Override
- public boolean hasNext() {
- return reader.hasNext();
- }
-
- @Override
- public T next() {
- return mapFn.map(reader.next());
- }
- });
- } catch (IOException e) {
- LOG.info("Could not read avro file at path: " + path, e);
- return Iterators.emptyIterator();
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/avro/AvroFileSource.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/avro/AvroFileSource.java b/crunch/src/main/java/org/apache/crunch/io/avro/AvroFileSource.java
deleted file mode 100644
index 15792bf..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/avro/AvroFileSource.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.avro;
-
-import java.io.IOException;
-
-import org.apache.avro.mapred.AvroJob;
-import org.apache.crunch.io.CompositePathIterable;
-import org.apache.crunch.io.FormatBundle;
-import org.apache.crunch.io.ReadableSource;
-import org.apache.crunch.io.impl.FileSourceImpl;
-import org.apache.crunch.types.avro.AvroInputFormat;
-import org.apache.crunch.types.avro.AvroType;
-import org.apache.crunch.types.avro.Avros;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-
-public class AvroFileSource<T> extends FileSourceImpl<T> implements ReadableSource<T> {
-
- private static <S> FormatBundle getBundle(AvroType<S> ptype) {
- FormatBundle bundle = FormatBundle.forInput(AvroInputFormat.class)
- .set(AvroJob.INPUT_IS_REFLECT, String.valueOf(ptype.hasReflect()))
- .set(AvroJob.INPUT_SCHEMA, ptype.getSchema().toString())
- .set(Avros.REFLECT_DATA_FACTORY_CLASS, Avros.REFLECT_DATA_FACTORY.getClass().getName());
- return bundle;
- }
-
- public AvroFileSource(Path path, AvroType<T> ptype) {
- super(path, ptype, getBundle(ptype));
- }
-
- @Override
- public String toString() {
- return "Avro(" + path.toString() + ")";
- }
-
- @Override
- public Iterable<T> read(Configuration conf) throws IOException {
- FileSystem fs = path.getFileSystem(conf);
- return CompositePathIterable.create(fs, path, new AvroFileReaderFactory<T>((AvroType<T>) ptype));
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/avro/AvroFileSourceTarget.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/avro/AvroFileSourceTarget.java b/crunch/src/main/java/org/apache/crunch/io/avro/AvroFileSourceTarget.java
deleted file mode 100644
index 76103e5..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/avro/AvroFileSourceTarget.java
+++ /dev/null
@@ -1,39 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.avro;
-
-import org.apache.crunch.io.FileNamingScheme;
-import org.apache.crunch.io.SequentialFileNamingScheme;
-import org.apache.crunch.io.impl.ReadableSourcePathTargetImpl;
-import org.apache.crunch.types.avro.AvroType;
-import org.apache.hadoop.fs.Path;
-
-public class AvroFileSourceTarget<T> extends ReadableSourcePathTargetImpl<T> {
- public AvroFileSourceTarget(Path path, AvroType<T> atype) {
- this(path, atype, new SequentialFileNamingScheme());
- }
-
- public AvroFileSourceTarget(Path path, AvroType<T> atype, FileNamingScheme fileNamingScheme) {
- super(new AvroFileSource<T>(path, atype), new AvroFileTarget(path), fileNamingScheme);
- }
-
- @Override
- public String toString() {
- return target.toString();
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/avro/AvroFileTarget.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/avro/AvroFileTarget.java b/crunch/src/main/java/org/apache/crunch/io/avro/AvroFileTarget.java
deleted file mode 100644
index 3a9e42c..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/avro/AvroFileTarget.java
+++ /dev/null
@@ -1,91 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.avro;
-
-import org.apache.avro.mapred.AvroWrapper;
-import org.apache.crunch.SourceTarget;
-import org.apache.crunch.io.FileNamingScheme;
-import org.apache.crunch.io.OutputHandler;
-import org.apache.crunch.io.SequentialFileNamingScheme;
-import org.apache.crunch.io.impl.FileTargetImpl;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.avro.AvroOutputFormat;
-import org.apache.crunch.types.avro.AvroType;
-import org.apache.crunch.types.avro.Avros;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.mapreduce.Job;
-
-public class AvroFileTarget extends FileTargetImpl {
-
- public AvroFileTarget(String path) {
- this(new Path(path));
- }
-
- public AvroFileTarget(Path path) {
- this(path, new SequentialFileNamingScheme());
- }
-
- public AvroFileTarget(Path path, FileNamingScheme fileNamingScheme) {
- super(path, AvroOutputFormat.class, fileNamingScheme);
- }
-
- @Override
- public String toString() {
- return "Avro(" + path.toString() + ")";
- }
-
- @Override
- public boolean accept(OutputHandler handler, PType<?> ptype) {
- if (!(ptype instanceof AvroType)) {
- return false;
- }
- handler.configure(this, ptype);
- return true;
- }
-
- @Override
- public void configureForMapReduce(Job job, PType<?> ptype, Path outputPath, String name) {
- AvroType<?> atype = (AvroType<?>) ptype;
- Configuration conf = job.getConfiguration();
- String schemaParam = null;
- if (name == null) {
- schemaParam = "avro.output.schema";
- } else {
- schemaParam = "avro.output.schema." + name;
- }
- String outputSchema = conf.get(schemaParam);
- if (outputSchema == null) {
- conf.set(schemaParam, atype.getSchema().toString());
- } else if (!outputSchema.equals(atype.getSchema().toString())) {
- throw new IllegalStateException("Avro targets must use the same output schema");
- }
- Avros.configureReflectDataFactory(conf);
- configureForMapReduce(job, AvroWrapper.class, NullWritable.class, AvroOutputFormat.class,
- outputPath, name);
- }
-
- @Override
- public <T> SourceTarget<T> asSourceTarget(PType<T> ptype) {
- if (ptype instanceof AvroType) {
- return new AvroFileSourceTarget<T>(path, (AvroType<T>) ptype);
- }
- return null;
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/impl/AutoClosingIterator.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/impl/AutoClosingIterator.java b/crunch/src/main/java/org/apache/crunch/io/impl/AutoClosingIterator.java
deleted file mode 100644
index 3bd802e..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/impl/AutoClosingIterator.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.impl;
-
-import java.io.Closeable;
-import java.io.IOException;
-import java.util.Iterator;
-
-import com.google.common.collect.UnmodifiableIterator;
-import com.google.common.io.Closeables;
-
-/**
- * Closes the wrapped {@code Closeable} when {@link #hasNext()} returns false. As long a client loops through to
- * completion (doesn't abort early due to an exception, short circuit, etc.) resources will be closed automatically.
- */
-public class AutoClosingIterator<T> extends UnmodifiableIterator<T> implements Closeable {
- private final Iterator<T> iter;
- private Closeable closeable;
-
- public AutoClosingIterator(Closeable closeable, Iterator<T> iter) {
- this.closeable = closeable;
- this.iter = iter;
- }
-
- @Override
- public boolean hasNext() {
- if (!iter.hasNext()) {
- Closeables.closeQuietly(this);
- return false;
- } else {
- return true;
- }
- }
-
- @Override
- public T next() {
- return iter.next();
- }
-
- @Override
- public void close() throws IOException {
- if (closeable != null) {
- closeable.close();
- closeable = null;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/impl/FileSourceImpl.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/impl/FileSourceImpl.java b/crunch/src/main/java/org/apache/crunch/io/impl/FileSourceImpl.java
deleted file mode 100644
index 688c801..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/impl/FileSourceImpl.java
+++ /dev/null
@@ -1,104 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.impl;
-
-import java.io.IOException;
-
-import org.apache.commons.lang.builder.HashCodeBuilder;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.crunch.Source;
-import org.apache.crunch.io.CrunchInputs;
-import org.apache.crunch.io.FormatBundle;
-import org.apache.crunch.io.SourceTargetHelper;
-import org.apache.crunch.types.PType;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.InputFormat;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-
-public class FileSourceImpl<T> implements Source<T> {
-
- private static final Log LOG = LogFactory.getLog(FileSourceImpl.class);
-
- protected final Path path;
- protected final PType<T> ptype;
- protected final FormatBundle<? extends InputFormat> inputBundle;
-
- public FileSourceImpl(Path path, PType<T> ptype, Class<? extends InputFormat> inputFormatClass) {
- this.path = path;
- this.ptype = ptype;
- this.inputBundle = FormatBundle.forInput(inputFormatClass);
- }
-
- public FileSourceImpl(Path path, PType<T> ptype, FormatBundle<? extends InputFormat> inputBundle) {
- this.path = path;
- this.ptype = ptype;
- this.inputBundle = inputBundle;
- }
-
- public Path getPath() {
- return path;
- }
-
- @Override
- public void configureSource(Job job, int inputId) throws IOException {
- if (inputId == -1) {
- FileInputFormat.addInputPath(job, path);
- job.setInputFormatClass(inputBundle.getFormatClass());
- inputBundle.configure(job.getConfiguration());
- } else {
- CrunchInputs.addInputPath(job, path, inputBundle, inputId);
- }
- }
-
- @Override
- public PType<T> getType() {
- return ptype;
- }
-
- @Override
- public long getSize(Configuration configuration) {
- try {
- return SourceTargetHelper.getPathSize(configuration, path);
- } catch (IOException e) {
- LOG.warn(String.format("Exception thrown looking up size of: %s", path), e);
- throw new IllegalStateException("Failed to get the file size of:" + path, e);
- }
- }
-
- @Override
- public boolean equals(Object other) {
- if (other == null || !getClass().equals(other.getClass())) {
- return false;
- }
- FileSourceImpl o = (FileSourceImpl) other;
- return ptype.equals(o.ptype) && path.equals(o.path) && inputBundle.equals(o.inputBundle);
- }
-
- @Override
- public int hashCode() {
- return new HashCodeBuilder().append(ptype).append(path).append(inputBundle).toHashCode();
- }
-
- @Override
- public String toString() {
- return new StringBuilder().append(inputBundle.getName()).append("(").append(path).append(")").toString();
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/impl/FileTableSourceImpl.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/impl/FileTableSourceImpl.java b/crunch/src/main/java/org/apache/crunch/io/impl/FileTableSourceImpl.java
deleted file mode 100644
index 295edb5..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/impl/FileTableSourceImpl.java
+++ /dev/null
@@ -1,41 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.impl;
-
-import org.apache.crunch.Pair;
-import org.apache.crunch.TableSource;
-import org.apache.crunch.io.FormatBundle;
-import org.apache.crunch.types.PTableType;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-
-public class FileTableSourceImpl<K, V> extends FileSourceImpl<Pair<K, V>> implements TableSource<K, V> {
-
- public FileTableSourceImpl(Path path, PTableType<K, V> tableType, Class<? extends FileInputFormat> formatClass) {
- super(path, tableType, formatClass);
- }
-
- public FileTableSourceImpl(Path path, PTableType<K, V> tableType, FormatBundle bundle) {
- super(path, tableType, bundle);
- }
-
- @Override
- public PTableType<K, V> getTableType() {
- return (PTableType<K, V>) getType();
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/impl/FileTargetImpl.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/impl/FileTargetImpl.java b/crunch/src/main/java/org/apache/crunch/io/impl/FileTargetImpl.java
deleted file mode 100644
index c1c29e4..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/impl/FileTargetImpl.java
+++ /dev/null
@@ -1,162 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.impl;
-
-import java.io.IOException;
-
-import org.apache.commons.lang.builder.HashCodeBuilder;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.crunch.CrunchRuntimeException;
-import org.apache.crunch.SourceTarget;
-import org.apache.crunch.io.CrunchOutputs;
-import org.apache.crunch.io.FileNamingScheme;
-import org.apache.crunch.io.OutputHandler;
-import org.apache.crunch.io.PathTarget;
-import org.apache.crunch.types.Converter;
-import org.apache.crunch.types.PType;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
-
-public class FileTargetImpl implements PathTarget {
-
- private static final Log LOG = LogFactory.getLog(FileTargetImpl.class);
-
- protected final Path path;
- private final Class<? extends FileOutputFormat> outputFormatClass;
- private final FileNamingScheme fileNamingScheme;
-
- public FileTargetImpl(Path path, Class<? extends FileOutputFormat> outputFormatClass,
- FileNamingScheme fileNamingScheme) {
- this.path = path;
- this.outputFormatClass = outputFormatClass;
- this.fileNamingScheme = fileNamingScheme;
- }
-
- @Override
- public void configureForMapReduce(Job job, PType<?> ptype, Path outputPath, String name) {
- Converter converter = ptype.getConverter();
- Class keyClass = converter.getKeyClass();
- Class valueClass = converter.getValueClass();
- configureForMapReduce(job, keyClass, valueClass, outputFormatClass, outputPath, name);
- }
-
- protected void configureForMapReduce(Job job, Class keyClass, Class valueClass,
- Class outputFormatClass, Path outputPath, String name) {
- try {
- FileOutputFormat.setOutputPath(job, outputPath);
- } catch (Exception e) {
- throw new RuntimeException(e);
- }
- if (name == null) {
- job.setOutputFormatClass(outputFormatClass);
- job.setOutputKeyClass(keyClass);
- job.setOutputValueClass(valueClass);
- } else {
- CrunchOutputs.addNamedOutput(job, name, outputFormatClass, keyClass, valueClass);
- }
- }
-
- @Override
- public boolean accept(OutputHandler handler, PType<?> ptype) {
- handler.configure(this, ptype);
- return true;
- }
-
- @Override
- public Path getPath() {
- return path;
- }
-
- @Override
- public FileNamingScheme getFileNamingScheme() {
- return fileNamingScheme;
- }
-
- @Override
- public boolean equals(Object other) {
- if (other == null || !getClass().equals(other.getClass())) {
- return false;
- }
- FileTargetImpl o = (FileTargetImpl) other;
- return path.equals(o.path);
- }
-
- @Override
- public int hashCode() {
- return new HashCodeBuilder().append(path).toHashCode();
- }
-
- @Override
- public String toString() {
- return new StringBuilder().append(outputFormatClass.getSimpleName()).append("(").append(path).append(")")
- .toString();
- }
-
- @Override
- public <T> SourceTarget<T> asSourceTarget(PType<T> ptype) {
- // By default, assume that we cannot do this.
- return null;
- }
-
- @Override
- public void handleExisting(WriteMode strategy, Configuration conf) {
- FileSystem fs = null;
- try {
- fs = FileSystem.get(conf);
- } catch (IOException e) {
- LOG.error("Could not retrieve FileSystem object to check for existing path", e);
- throw new CrunchRuntimeException(e);
- }
-
- boolean exists = false;
- try {
- exists = fs.exists(path);
- } catch (IOException e) {
- LOG.error("Exception checking existence of path: " + path, e);
- throw new CrunchRuntimeException(e);
- }
-
- if (exists) {
- switch (strategy) {
- case DEFAULT:
- LOG.error("Path " + path + " already exists!");
- throw new CrunchRuntimeException("Path already exists: " + path);
- case OVERWRITE:
- LOG.info("Removing data at existing path: " + path);
- try {
- fs.delete(path, true);
- } catch (IOException e) {
- LOG.error("Exception thrown removing data at path: " + path, e);
- }
- break;
- case APPEND:
- LOG.info("Adding output files to existing path: " + path);
- break;
- default:
- throw new CrunchRuntimeException("Unknown WriteMode: " + strategy);
- }
- } else {
- LOG.info("Will write output files to new path: " + path);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/impl/ReadableSourcePathTargetImpl.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/impl/ReadableSourcePathTargetImpl.java b/crunch/src/main/java/org/apache/crunch/io/impl/ReadableSourcePathTargetImpl.java
deleted file mode 100644
index 6506816..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/impl/ReadableSourcePathTargetImpl.java
+++ /dev/null
@@ -1,39 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.impl;
-
-import java.io.IOException;
-
-import org.apache.crunch.io.FileNamingScheme;
-import org.apache.crunch.io.PathTarget;
-import org.apache.crunch.io.ReadableSource;
-import org.apache.crunch.io.ReadableSourceTarget;
-import org.apache.hadoop.conf.Configuration;
-
-public class ReadableSourcePathTargetImpl<T> extends SourcePathTargetImpl<T> implements ReadableSourceTarget<T> {
-
- public ReadableSourcePathTargetImpl(ReadableSource<T> source, PathTarget target, FileNamingScheme fileNamingScheme) {
- super(source, target, fileNamingScheme);
- }
-
- @Override
- public Iterable<T> read(Configuration conf) throws IOException {
- return ((ReadableSource<T>) source).read(conf);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/impl/ReadableSourceTargetImpl.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/impl/ReadableSourceTargetImpl.java b/crunch/src/main/java/org/apache/crunch/io/impl/ReadableSourceTargetImpl.java
deleted file mode 100644
index f435b3b..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/impl/ReadableSourceTargetImpl.java
+++ /dev/null
@@ -1,37 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.impl;
-
-import java.io.IOException;
-
-import org.apache.crunch.Target;
-import org.apache.crunch.io.ReadableSource;
-import org.apache.crunch.io.ReadableSourceTarget;
-import org.apache.hadoop.conf.Configuration;
-
-public class ReadableSourceTargetImpl<T> extends SourceTargetImpl<T> implements ReadableSourceTarget<T> {
-
- public ReadableSourceTargetImpl(ReadableSource<T> source, Target target) {
- super(source, target);
- }
-
- @Override
- public Iterable<T> read(Configuration conf) throws IOException {
- return ((ReadableSource<T>) source).read(conf);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/impl/SourcePathTargetImpl.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/impl/SourcePathTargetImpl.java b/crunch/src/main/java/org/apache/crunch/io/impl/SourcePathTargetImpl.java
deleted file mode 100644
index c0d7ce0..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/impl/SourcePathTargetImpl.java
+++ /dev/null
@@ -1,50 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.impl;
-
-import org.apache.crunch.Source;
-import org.apache.crunch.io.FileNamingScheme;
-import org.apache.crunch.io.PathTarget;
-import org.apache.crunch.types.PType;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.Job;
-
-public class SourcePathTargetImpl<T> extends SourceTargetImpl<T> implements PathTarget {
-
- private final FileNamingScheme fileNamingScheme;
-
- public SourcePathTargetImpl(Source<T> source, PathTarget target, FileNamingScheme fileNamingScheme) {
- super(source, target);
- this.fileNamingScheme = fileNamingScheme;
- }
-
- @Override
- public void configureForMapReduce(Job job, PType<?> ptype, Path outputPath, String name) {
- ((PathTarget) target).configureForMapReduce(job, ptype, outputPath, name);
- }
-
- @Override
- public Path getPath() {
- return ((PathTarget) target).getPath();
- }
-
- @Override
- public FileNamingScheme getFileNamingScheme() {
- return fileNamingScheme;
- }
-}
[19/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/lib/join/MultiAvroSchemaJoinIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/lib/join/MultiAvroSchemaJoinIT.java b/crunch/src/it/java/org/apache/crunch/lib/join/MultiAvroSchemaJoinIT.java
deleted file mode 100644
index f1ca770..0000000
--- a/crunch/src/it/java/org/apache/crunch/lib/join/MultiAvroSchemaJoinIT.java
+++ /dev/null
@@ -1,121 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib.join;
-
-import static org.apache.crunch.types.avro.Avros.records;
-import static org.apache.crunch.types.avro.Avros.strings;
-import static org.junit.Assert.assertEquals;
-
-import java.io.File;
-import java.util.List;
-
-import org.apache.avro.Schema;
-import org.apache.avro.file.DataFileWriter;
-import org.apache.avro.io.DatumWriter;
-import org.apache.avro.specific.SpecificDatumWriter;
-import org.apache.avro.specific.SpecificRecord;
-import org.apache.crunch.MapFn;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.Pair;
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.io.From;
-import org.apache.crunch.test.Employee;
-import org.apache.crunch.test.Person;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Lists;
-
-public class MultiAvroSchemaJoinIT {
-
- private File personFile;
- private File employeeFile;
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
-
- @Before
- public void setUp() throws Exception {
- this.personFile = File.createTempFile("person", ".avro");
- this.employeeFile = File.createTempFile("employee", ".avro");
-
- DatumWriter<Person> pdw = new SpecificDatumWriter<Person>();
- DataFileWriter<Person> pfw = new DataFileWriter<Person>(pdw);
- pfw.create(Person.SCHEMA$, personFile);
- Person p1 = new Person();
- p1.name = "Josh";
- p1.age = 19;
- p1.siblingnames = ImmutableList.<CharSequence> of("Kate", "Mike");
- pfw.append(p1);
- Person p2 = new Person();
- p2.name = "Kate";
- p2.age = 17;;
- p2.siblingnames = ImmutableList.<CharSequence> of("Josh", "Mike");
- pfw.append(p2);
- Person p3 = new Person();
- p3.name = "Mike";
- p3.age = 12;
- p3.siblingnames = ImmutableList.<CharSequence> of("Josh", "Kate");
- pfw.append(p3);
- pfw.close();
-
- DatumWriter<Employee> edw = new SpecificDatumWriter<Employee>();
- DataFileWriter<Employee> efw = new DataFileWriter<Employee>(edw);
- efw.create(Employee.SCHEMA$, employeeFile);
- Employee e1 = new Employee();
- e1.name = "Kate";
- e1.salary = 100000;
- e1.department = "Marketing";
- efw.append(e1);
- efw.close();
- }
-
- @After
- public void tearDown() throws Exception {
- personFile.delete();
- employeeFile.delete();
- }
-
- public static class NameFn<K extends SpecificRecord> extends MapFn<K, String> {
- @Override
- public String map(K input) {
- Schema s = input.getSchema();
- Schema.Field f = s.getField("name");
- return input.get(f.pos()).toString();
- }
- }
-
- @Test
- public void testJoin() throws Exception {
- Pipeline p = new MRPipeline(MultiAvroSchemaJoinIT.class, tmpDir.getDefaultConfiguration());
- PCollection<Person> people = p.read(From.avroFile(personFile.getAbsolutePath(), records(Person.class)));
- PCollection<Employee> employees = p.read(From.avroFile(employeeFile.getAbsolutePath(), records(Employee.class)));
-
- Iterable<Pair<Person, Employee>> result = people.by(new NameFn<Person>(), strings())
- .join(employees.by(new NameFn<Employee>(), strings())).values().materialize();
- List<Pair<Person, Employee>> v = Lists.newArrayList(result);
- assertEquals(1, v.size());
- assertEquals("Kate", v.get(0).first().name.toString());
- assertEquals("Kate", v.get(0).second().name.toString());
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/lib/join/RightOuterJoinIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/lib/join/RightOuterJoinIT.java b/crunch/src/it/java/org/apache/crunch/lib/join/RightOuterJoinIT.java
deleted file mode 100644
index d889b61..0000000
--- a/crunch/src/it/java/org/apache/crunch/lib/join/RightOuterJoinIT.java
+++ /dev/null
@@ -1,51 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib.join;
-
-import static org.junit.Assert.assertTrue;
-
-import org.apache.crunch.Pair;
-import org.apache.crunch.types.PTypeFamily;
-
-public class RightOuterJoinIT extends JoinTester {
- @Override
- public void assertPassed(Iterable<Pair<String, Long>> lines) {
- boolean passed1 = false;
- boolean passed2 = true;
- boolean passed3 = false;
- for (Pair<String, Long> line : lines) {
- if ("wretched".equals(line.first()) && 24 == line.second()) {
- passed1 = true;
- }
- if ("againe".equals(line.first())) {
- passed2 = false;
- }
- if ("Montparnasse.".equals(line.first()) && 2 == line.second()) {
- passed3 = true;
- }
- }
- assertTrue(passed1);
- assertTrue(passed2);
- assertTrue(passed3);
- }
-
- @Override
- protected JoinFn<String, Long, Long> getJoinFn(PTypeFamily typeFamily) {
- return new RightOuterJoinFn<String, Long, Long>(typeFamily.strings(), typeFamily.longs());
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/test/TemporaryPaths.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/test/TemporaryPaths.java b/crunch/src/it/java/org/apache/crunch/test/TemporaryPaths.java
deleted file mode 100644
index 97cf0de..0000000
--- a/crunch/src/it/java/org/apache/crunch/test/TemporaryPaths.java
+++ /dev/null
@@ -1,40 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.test;
-
-import org.apache.crunch.impl.mr.run.RuntimeParameters;
-import org.apache.hadoop.conf.Configuration;
-
-
-/**
- * Utilities for working with {@link TemporaryPath}.
- */
-public final class TemporaryPaths {
-
- /**
- * Static factory returning a {@link TemporaryPath} with adjusted
- * {@link Configuration} properties.
- */
- public static TemporaryPath create() {
- return new TemporaryPath(RuntimeParameters.TMP_DIR, "hadoop.tmp.dir");
- }
-
- private TemporaryPaths() {
- // nothing
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/test/Tests.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/test/Tests.java b/crunch/src/it/java/org/apache/crunch/test/Tests.java
deleted file mode 100644
index e381c1a..0000000
--- a/crunch/src/it/java/org/apache/crunch/test/Tests.java
+++ /dev/null
@@ -1,124 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.test;
-
-import static com.google.common.base.Preconditions.checkArgument;
-import static com.google.common.base.Preconditions.checkNotNull;
-
-import java.io.IOException;
-import java.util.Collection;
-
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.impl.mem.MemPipeline;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.hadoop.io.Writable;
-import org.junit.runners.Parameterized.Parameters;
-
-import com.google.common.collect.ImmutableList;
-import com.google.common.io.ByteArrayDataOutput;
-import com.google.common.io.ByteStreams;
-import com.google.common.io.Resources;
-
-
-/**
- * Utilities for integration tests.
- */
-public final class Tests {
-
- private Tests() {
- // nothing
- }
-
- /**
- * Get the path to and integration test resource file, as per naming convention.
- *
- * @param testCase The executing test case instance
- * @param resourceName The file name of the resource
- * @return The path to the resource (never null)
- * @throws IllegalArgumentException Thrown if the resource doesn't exist
- */
- public static String pathTo(Object testCase, String resourceName) {
- String qualifiedName = resource(testCase, resourceName);
- return Resources.getResource(qualifiedName).getFile();
- }
-
- /**
- * This doesn't check whether the resource exists!
- *
- * @param testCase
- * @param resourceName
- * @return The path to the resource (never null)
- */
- public static String resource(Object testCase, String resourceName) {
- checkNotNull(testCase);
- checkNotNull(resourceName);
-
- // Note: We append "Data" because otherwise Eclipse would complain about the
- // the case's class name clashing with the resource directory's name.
- return testCase.getClass().getName().replaceAll("\\.", "/") + "Data/" + resourceName;
- }
-
- /**
- * Return our two types of {@link Pipeline}s for a JUnit Parameterized test.
- *
- * @param testCase The executing test case's class
- * @return The collection to return from a {@link Parameters} provider method
- */
- public static Collection<Object[]> pipelinesParams(Class<?> testCase) {
- return ImmutableList.copyOf(
- new Object[][] { { MemPipeline.getInstance() }, { new MRPipeline(testCase) }
- });
- }
-
- /**
- * Serialize the given Writable into a byte array.
- *
- * @param value The instance to serialize
- * @return The serialized data
- */
- public static byte[] serialize(Writable value) {
- checkNotNull(value);
- try {
- ByteArrayDataOutput out = ByteStreams.newDataOutput();
- value.write(out);
- return out.toByteArray();
- } catch (IOException e) {
- throw new IllegalStateException("cannot serialize", e);
- }
- }
-
- /**
- * Serialize the src Writable into a byte array, then deserialize it into dest.
- * @param src The instance to serialize
- * @param dest The instance to deserialize into
- * @return dest, for convenience
- */
- public static <T extends Writable> T roundtrip(Writable src, T dest) {
- checkNotNull(src);
- checkNotNull(dest);
- checkArgument(src != dest, "src and dest may not be the same instance");
-
- try {
- byte[] data = serialize(src);
- dest.readFields(ByteStreams.newDataInput(data));
- } catch (IOException e) {
- throw new IllegalStateException("cannot deserialize", e);
- }
- return dest;
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/resources/customers.txt
----------------------------------------------------------------------
diff --git a/crunch/src/it/resources/customers.txt b/crunch/src/it/resources/customers.txt
deleted file mode 100644
index 98f3f3d..0000000
--- a/crunch/src/it/resources/customers.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-111|John Doe
-222|Jane Doe
-333|Someone Else
-444|Has No Orders
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/resources/docs.txt
----------------------------------------------------------------------
diff --git a/crunch/src/it/resources/docs.txt b/crunch/src/it/resources/docs.txt
deleted file mode 100644
index 90a3f65..0000000
--- a/crunch/src/it/resources/docs.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-A this doc has this text
-A and this text as well
-A but also this
-B this doc has some text
-B but not as much as the last
-B doc
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/resources/emptyTextFile.txt
----------------------------------------------------------------------
diff --git a/crunch/src/it/resources/emptyTextFile.txt b/crunch/src/it/resources/emptyTextFile.txt
deleted file mode 100644
index e69de29..0000000
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/resources/letters.txt
----------------------------------------------------------------------
diff --git a/crunch/src/it/resources/letters.txt b/crunch/src/it/resources/letters.txt
deleted file mode 100644
index 916bfc9..0000000
--- a/crunch/src/it/resources/letters.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-a
-bb
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/crunch/src/it/resources/log4j.properties b/crunch/src/it/resources/log4j.properties
deleted file mode 100644
index 5d144a0..0000000
--- a/crunch/src/it/resources/log4j.properties
+++ /dev/null
@@ -1,29 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# ***** Set root logger level to INFO and its only appender to A.
-log4j.logger.org.apache.crunch=info, A
-
-# Log warnings on Hadoop for the local runner when testing
-log4j.logger.org.apache.hadoop=warn, A
-# Except for Configuration, which is chatty.
-log4j.logger.org.apache.hadoop.conf.Configuration=error, A
-
-# ***** A is set to be a ConsoleAppender.
-log4j.appender.A=org.apache.log4j.ConsoleAppender
-# ***** A uses PatternLayout.
-log4j.appender.A.layout=org.apache.log4j.PatternLayout
-log4j.appender.A.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
[26/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/avro/Avros.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/avro/Avros.java b/crunch-core/src/main/java/org/apache/crunch/types/avro/Avros.java
new file mode 100644
index 0000000..fc30eaf
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/avro/Avros.java
@@ -0,0 +1,709 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.avro;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+import java.util.UUID;
+
+import org.apache.avro.Schema;
+import org.apache.avro.Schema.Type;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.avro.reflect.ReflectData;
+import org.apache.avro.specific.SpecificRecord;
+import org.apache.avro.util.Utf8;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Tuple;
+import org.apache.crunch.Tuple3;
+import org.apache.crunch.Tuple4;
+import org.apache.crunch.TupleN;
+import org.apache.crunch.fn.CompositeMapFn;
+import org.apache.crunch.fn.IdentityFn;
+import org.apache.crunch.types.CollectionDeepCopier;
+import org.apache.crunch.types.DeepCopier;
+import org.apache.crunch.types.MapDeepCopier;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.PTypes;
+import org.apache.crunch.types.TupleDeepCopier;
+import org.apache.crunch.types.TupleFactory;
+import org.apache.crunch.types.writable.WritableDeepCopier;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.TaskInputOutputContext;
+import org.apache.hadoop.util.ReflectionUtils;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
+/**
+ * Defines static methods that are analogous to the methods defined in
+ * {@link AvroTypeFamily} for convenient static importing.
+ *
+ */
+public class Avros {
+
+ /**
+ * Older versions of Avro (i.e., before 1.7.0) do not support schemas that are
+ * composed of a mix of specific and reflection-based schemas. This flag
+ * controls whether or not we allow Crunch jobs to be created that involve
+ * mixing specific and reflection-based schemas. It is computed once in the
+ * static initializer below and, being final, cannot be reassigned by clients.
+ */
+ public static final boolean CAN_COMBINE_SPECIFIC_AND_REFLECT_SCHEMAS;
+
+ static {
+ CAN_COMBINE_SPECIFIC_AND_REFLECT_SCHEMAS = AvroCapabilities.canDecodeSpecificSchemaWithReflectDatumReader();
+ }
+
+ /**
+ * The instance we use for generating reflected schemas. May be modified by
+ * clients (e.g., Scrunch.)
+ */
+ public static ReflectDataFactory REFLECT_DATA_FACTORY = new ReflectDataFactory();
+
+ /**
+ * The name of the configuration parameter that tracks which reflection
+ * factory to use.
+ */
+ public static final String REFLECT_DATA_FACTORY_CLASS = "crunch.reflectdatafactory";
+
+ public static void configureReflectDataFactory(Configuration conf) {
+ conf.setClass(REFLECT_DATA_FACTORY_CLASS, REFLECT_DATA_FACTORY.getClass(), ReflectDataFactory.class);
+ }
+
+ public static ReflectDataFactory getReflectDataFactory(Configuration conf) {
+ return (ReflectDataFactory) ReflectionUtils.newInstance(
+ conf.getClass(REFLECT_DATA_FACTORY_CLASS, ReflectDataFactory.class), conf);
+ }
+
+ public static void checkCombiningSpecificAndReflectionSchemas() { // fails fast when specific and reflect schemas cannot be mixed
+ if (!CAN_COMBINE_SPECIFIC_AND_REFLECT_SCHEMAS) { // flag is final and fixed at class-load time
+ throw new IllegalStateException("Crunch does not support running jobs that"
+ + " contain a mixture of reflection-based and avro-generated data types."
+ + " Please consider turning your reflection-based type into an avro-generated"
+ + " type and using that generated type instead."
+ + " Alternatively, upgrade to Avro 1.7.0 or greater, which supports"
+ + " combining specific and reflection-based schemas."); // old text asked callers to set a final field
+ }
+ }
+
+ public static MapFn<CharSequence, String> UTF8_TO_STRING = new MapFn<CharSequence, String>() {
+ @Override
+ public String map(CharSequence input) {
+ return input.toString();
+ }
+ };
+
+ public static MapFn<String, Utf8> STRING_TO_UTF8 = new MapFn<String, Utf8>() {
+ @Override
+ public Utf8 map(String input) {
+ return new Utf8(input);
+ }
+ };
+
+ public static MapFn<Object, ByteBuffer> BYTES_IN = new MapFn<Object, ByteBuffer>() {
+ @Override
+ public ByteBuffer map(Object input) {
+ if (input instanceof ByteBuffer) {
+ return (ByteBuffer) input;
+ }
+ return ByteBuffer.wrap((byte[]) input);
+ }
+ };
+
+ private static final AvroType<String> strings = new AvroType<String>(String.class, Schema.create(Schema.Type.STRING),
+ UTF8_TO_STRING, STRING_TO_UTF8, new DeepCopier.NoOpDeepCopier<String>());
+ private static final AvroType<Void> nulls = create(Void.class, Schema.Type.NULL);
+ private static final AvroType<Long> longs = create(Long.class, Schema.Type.LONG);
+ private static final AvroType<Integer> ints = create(Integer.class, Schema.Type.INT);
+ private static final AvroType<Float> floats = create(Float.class, Schema.Type.FLOAT);
+ private static final AvroType<Double> doubles = create(Double.class, Schema.Type.DOUBLE);
+ private static final AvroType<Boolean> booleans = create(Boolean.class, Schema.Type.BOOLEAN);
+ private static final AvroType<ByteBuffer> bytes = new AvroType<ByteBuffer>(ByteBuffer.class,
+ Schema.create(Schema.Type.BYTES), BYTES_IN, IdentityFn.getInstance(), new DeepCopier.NoOpDeepCopier<ByteBuffer>());
+
+ private static final Map<Class<?>, PType<?>> PRIMITIVES = ImmutableMap.<Class<?>, PType<?>> builder()
+ .put(String.class, strings).put(Long.class, longs).put(Integer.class, ints).put(Float.class, floats)
+ .put(Double.class, doubles).put(Boolean.class, booleans).put(ByteBuffer.class, bytes).build();
+
+ private static final Map<Class<?>, AvroType<?>> EXTENSIONS = Maps.newHashMap();
+
+ public static <T> void register(Class<T> clazz, AvroType<T> ptype) {
+ EXTENSIONS.put(clazz, ptype);
+ }
+
+ public static <T> PType<T> getPrimitiveType(Class<T> clazz) {
+ return (PType<T>) PRIMITIVES.get(clazz);
+ }
+
+ static <T> boolean isPrimitive(AvroType<T> avroType) {
+ return avroType.getTypeClass().isPrimitive() || PRIMITIVES.containsKey(avroType.getTypeClass());
+ }
+
+ private static <T> AvroType<T> create(Class<T> clazz, Schema.Type schemaType) {
+ return new AvroType<T>(clazz, Schema.create(schemaType), new DeepCopier.NoOpDeepCopier<T>());
+ }
+
+ public static final AvroType<Void> nulls() {
+ return nulls;
+ }
+
+ public static final AvroType<String> strings() {
+ return strings;
+ }
+
+ public static final AvroType<Long> longs() {
+ return longs;
+ }
+
+ public static final AvroType<Integer> ints() {
+ return ints;
+ }
+
+ public static final AvroType<Float> floats() {
+ return floats;
+ }
+
+ public static final AvroType<Double> doubles() {
+ return doubles;
+ }
+
+ public static final AvroType<Boolean> booleans() {
+ return booleans;
+ }
+
+ public static final AvroType<ByteBuffer> bytes() {
+ return bytes;
+ }
+
+ public static final <T> AvroType<T> records(Class<T> clazz) {
+ if (EXTENSIONS.containsKey(clazz)) {
+ return (AvroType<T>) EXTENSIONS.get(clazz);
+ }
+ return containers(clazz);
+ }
+
+ public static final AvroType<GenericData.Record> generics(Schema schema) {
+ return new AvroType<GenericData.Record>(GenericData.Record.class, schema, new AvroDeepCopier.AvroGenericDeepCopier(
+ schema));
+ }
+
+ public static final <T> AvroType<T> containers(Class<T> clazz) {
+ if (SpecificRecord.class.isAssignableFrom(clazz)) {
+ return (AvroType<T>) specifics((Class<SpecificRecord>) clazz);
+ }
+ return reflects(clazz);
+ }
+
+ public static final <T extends SpecificRecord> AvroType<T> specifics(Class<T> clazz) {
+ T t = ReflectionUtils.newInstance(clazz, null);
+ Schema schema = t.getSchema();
+ return new AvroType<T>(clazz, schema, new AvroDeepCopier.AvroSpecificDeepCopier<T>(clazz, schema));
+ }
+
+ public static final <T> AvroType<T> reflects(Class<T> clazz) {
+ Schema schema = REFLECT_DATA_FACTORY.getReflectData().getSchema(clazz);
+ return new AvroType<T>(clazz, schema, new AvroDeepCopier.AvroReflectDeepCopier<T>(clazz, schema));
+ }
+
+ private static class BytesToWritableMapFn<T extends Writable> extends MapFn<Object, T> {
+ private static final Log LOG = LogFactory.getLog(BytesToWritableMapFn.class);
+
+ private final Class<T> writableClazz;
+
+ public BytesToWritableMapFn(Class<T> writableClazz) {
+ this.writableClazz = writableClazz;
+ }
+
+ @Override
+ public T map(Object input) { // decodes a Writable of type T from a ByteBuffer or byte[] payload
+ ByteBuffer byteBuffer = BYTES_IN.map(input);
+ T instance = ReflectionUtils.newInstance(writableClazz, null);
+ try { // read only the buffer's readable window: start at arrayOffset + position, length remaining()
+ instance.readFields(new DataInputStream(new ByteArrayInputStream(byteBuffer.array(),
+ byteBuffer.arrayOffset() + byteBuffer.position(), byteBuffer.remaining())));
+ } catch (IOException e) { // best-effort: log and return the possibly partially-populated instance
+ LOG.error("Exception thrown reading instance of: " + writableClazz, e);
+ }
+ return instance;
+ }
+ }
+
+ private static class WritableToBytesMapFn<T extends Writable> extends MapFn<T, ByteBuffer> {
+ private static final Log LOG = LogFactory.getLog(WritableToBytesMapFn.class);
+
+ @Override
+ public ByteBuffer map(T input) {
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ DataOutputStream das = new DataOutputStream(baos);
+ try {
+ input.write(das);
+ } catch (IOException e) {
+ LOG.error("Exception thrown converting Writable to bytes", e);
+ }
+ return ByteBuffer.wrap(baos.toByteArray());
+ }
+ }
+
+ public static final <T extends Writable> AvroType<T> writables(Class<T> clazz) {
+ return new AvroType<T>(clazz, Schema.create(Schema.Type.BYTES), new BytesToWritableMapFn<T>(clazz),
+ new WritableToBytesMapFn<T>(), new WritableDeepCopier<T>(clazz));
+ }
+
+ private static class GenericDataArrayToCollection<T> extends MapFn<Object, Collection<T>> {
+
+ private final MapFn<Object, T> mapFn;
+
+ public GenericDataArrayToCollection(MapFn<Object, T> mapFn) {
+ this.mapFn = mapFn;
+ }
+
+ @Override
+ public void configure(Configuration conf) {
+ mapFn.configure(conf);
+ }
+
+ @Override
+ public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
+ mapFn.setContext(context);
+ }
+
+ @Override
+ public void initialize() {
+ mapFn.initialize();
+ }
+
+ @Override
+ public Collection<T> map(Object input) {
+ Collection<T> ret = Lists.newArrayList();
+ if (input instanceof Collection) {
+ for (Object in : (Collection<Object>) input) {
+ ret.add(mapFn.map(in));
+ }
+ } else {
+ // Assume it is an array
+ Object[] arr = (Object[]) input;
+ for (Object in : arr) {
+ ret.add(mapFn.map(in));
+ }
+ }
+ return ret;
+ }
+ }
+
+ private static class CollectionToGenericDataArray extends MapFn<Collection<?>, GenericData.Array<?>> {
+
+ private final MapFn mapFn;
+ private final String jsonSchema;
+ private transient Schema schema;
+
+ public CollectionToGenericDataArray(Schema schema, MapFn mapFn) {
+ this.mapFn = mapFn;
+ this.jsonSchema = schema.toString();
+ }
+
+ @Override
+ public void configure(Configuration conf) {
+ mapFn.configure(conf);
+ }
+
+ @Override
+ public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
+ mapFn.setContext(context);
+ }
+
+ @Override
+ public void initialize() {
+ mapFn.initialize();
+ }
+
+ @Override
+ public GenericData.Array<?> map(Collection<?> input) {
+ if (schema == null) {
+ schema = new Schema.Parser().parse(jsonSchema);
+ }
+ GenericData.Array array = new GenericData.Array(input.size(), schema);
+ for (Object in : input) {
+ array.add(mapFn.map(in));
+ }
+ return array;
+ }
+ }
+
+ public static final <T> AvroType<Collection<T>> collections(PType<T> ptype) {
+ AvroType<T> avroType = (AvroType<T>) ptype;
+ Schema collectionSchema = Schema.createArray(allowNulls(avroType.getSchema()));
+ GenericDataArrayToCollection<T> input = new GenericDataArrayToCollection<T>(avroType.getInputMapFn());
+ CollectionToGenericDataArray output = new CollectionToGenericDataArray(collectionSchema, avroType.getOutputMapFn());
+ return new AvroType(Collection.class, collectionSchema, input, output, new CollectionDeepCopier<T>(ptype), ptype);
+ }
+
+ private static class AvroMapToMap<T> extends MapFn<Map<CharSequence, Object>, Map<String, T>> {
+ private final MapFn<Object, T> mapFn;
+
+ public AvroMapToMap(MapFn<Object, T> mapFn) {
+ this.mapFn = mapFn;
+ }
+
+ @Override
+ public void configure(Configuration conf) {
+ mapFn.configure(conf);
+ }
+
+ @Override
+ public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
+ mapFn.setContext(context);
+ }
+
+ @Override
+ public void initialize() {
+ mapFn.initialize();
+ }
+
+ @Override
+ public Map<String, T> map(Map<CharSequence, Object> input) {
+ Map<String, T> out = Maps.newHashMap();
+ for (Map.Entry<CharSequence, Object> e : input.entrySet()) {
+ out.put(e.getKey().toString(), mapFn.map(e.getValue()));
+ }
+ return out;
+ }
+ }
+
+ private static class MapToAvroMap<T> extends MapFn<Map<String, T>, Map<Utf8, Object>> {
+ private final MapFn<T, Object> mapFn;
+
+ public MapToAvroMap(MapFn<T, Object> mapFn) {
+ this.mapFn = mapFn;
+ }
+
+ @Override
+ public void configure(Configuration conf) {
+ mapFn.configure(conf);
+ }
+
+ @Override
+ public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
+ mapFn.setContext(context);
+ }
+
+ @Override
+ public void initialize() {
+ this.mapFn.initialize();
+ }
+
+ @Override
+ public Map<Utf8, Object> map(Map<String, T> input) {
+ Map<Utf8, Object> out = Maps.newHashMap();
+ for (Map.Entry<String, T> e : input.entrySet()) {
+ out.put(new Utf8(e.getKey()), mapFn.map(e.getValue()));
+ }
+ return out;
+ }
+ }
+
+ public static final <T> AvroType<Map<String, T>> maps(PType<T> ptype) {
+ AvroType<T> avroType = (AvroType<T>) ptype;
+ Schema mapSchema = Schema.createMap(allowNulls(avroType.getSchema()));
+ AvroMapToMap<T> inputFn = new AvroMapToMap<T>(avroType.getInputMapFn());
+ MapToAvroMap<T> outputFn = new MapToAvroMap<T>(avroType.getOutputMapFn());
+ return new AvroType(Map.class, mapSchema, inputFn, outputFn, new MapDeepCopier<T>(ptype), ptype);
+ }
+
+ private static class GenericRecordToTuple extends MapFn<GenericRecord, Tuple> {
+ private final TupleFactory<?> tupleFactory;
+ private final List<MapFn> fns;
+
+ private transient Object[] values;
+
+ public GenericRecordToTuple(TupleFactory<?> tupleFactory, PType<?>... ptypes) {
+ this.tupleFactory = tupleFactory;
+ this.fns = Lists.newArrayList();
+ for (PType<?> ptype : ptypes) {
+ AvroType atype = (AvroType) ptype;
+ fns.add(atype.getInputMapFn());
+ }
+ }
+
+ @Override
+ public void configure(Configuration conf) {
+ for (MapFn fn : fns) {
+ fn.configure(conf);
+ }
+ }
+
+ @Override
+ public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
+ for (MapFn fn : fns) {
+ fn.setContext(context);
+ }
+ }
+
+ @Override
+ public void initialize() {
+ for (MapFn fn : fns) {
+ fn.initialize();
+ }
+ this.values = new Object[fns.size()];
+ tupleFactory.initialize();
+ }
+
+ @Override
+ public Tuple map(GenericRecord input) {
+ for (int i = 0; i < values.length; i++) {
+ Object v = input.get(i);
+ if (v == null) {
+ values[i] = null;
+ } else {
+ values[i] = fns.get(i).map(v);
+ }
+ }
+ return tupleFactory.makeTuple(values);
+ }
+ }
+
+ private static class TupleToGenericRecord extends MapFn<Tuple, GenericRecord> {
+ private final List<MapFn> fns;
+ private final List<AvroType> avroTypes;
+ private final String jsonSchema;
+ private final boolean isReflect;
+ private transient Schema schema;
+
+ public TupleToGenericRecord(Schema schema, PType<?>... ptypes) {
+ this.fns = Lists.newArrayList();
+ this.avroTypes = Lists.newArrayList();
+ this.jsonSchema = schema.toString();
+ boolean reflectFound = false;
+ boolean specificFound = false;
+ for (PType ptype : ptypes) {
+ AvroType atype = (AvroType) ptype;
+ fns.add(atype.getOutputMapFn());
+ avroTypes.add(atype);
+ if (atype.hasReflect()) {
+ reflectFound = true;
+ }
+ if (atype.hasSpecific()) {
+ specificFound = true;
+ }
+ }
+ if (specificFound && reflectFound) {
+ checkCombiningSpecificAndReflectionSchemas();
+ }
+ this.isReflect = reflectFound;
+ }
+
+ @Override
+ public void configure(Configuration conf) {
+ for (MapFn fn : fns) {
+ fn.configure(conf);
+ }
+ }
+
+ @Override
+ public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) { // hands the task context to every per-field output MapFn
+ for (MapFn fn : fns) {
+ fn.setContext(context); // forward the supplied context, as the sibling wrappers do (was getContext())
+ }
+ }
+
+ @Override
+ public void initialize() {
+ this.schema = new Schema.Parser().parse(jsonSchema);
+ for (MapFn fn : fns) {
+ fn.initialize();
+ }
+ }
+
+ private GenericRecord createRecord() {
+ if (isReflect) {
+ return new ReflectGenericRecord(schema);
+ } else {
+ return new GenericData.Record(schema);
+ }
+ }
+
+ @Override
+ public GenericRecord map(Tuple input) {
+ GenericRecord record = createRecord();
+ for (int i = 0; i < input.size(); i++) {
+ Object v = input.get(i);
+ if (v == null) {
+ record.put(i, null);
+ } else {
+ record.put(i, fns.get(i).map(v));
+ }
+ }
+ return record;
+ }
+ }
+
+ public static final <V1, V2> AvroType<Pair<V1, V2>> pairs(PType<V1> p1, PType<V2> p2) {
+ Schema schema = createTupleSchema(p1, p2);
+ GenericRecordToTuple input = new GenericRecordToTuple(TupleFactory.PAIR, p1, p2);
+ TupleToGenericRecord output = new TupleToGenericRecord(schema, p1, p2);
+ return new AvroType(Pair.class, schema, input, output, new TupleDeepCopier(Pair.class, p1, p2), p1, p2);
+ }
+
+ public static final <V1, V2, V3> AvroType<Tuple3<V1, V2, V3>> triples(PType<V1> p1, PType<V2> p2, PType<V3> p3) {
+ Schema schema = createTupleSchema(p1, p2, p3);
+ return new AvroType(Tuple3.class, schema, new GenericRecordToTuple(TupleFactory.TUPLE3, p1, p2, p3),
+ new TupleToGenericRecord(schema, p1, p2, p3), new TupleDeepCopier(Tuple3.class, p1, p2, p3), p1, p2, p3);
+ }
+
+ public static final <V1, V2, V3, V4> AvroType<Tuple4<V1, V2, V3, V4>> quads(PType<V1> p1, PType<V2> p2, PType<V3> p3,
+ PType<V4> p4) {
+ Schema schema = createTupleSchema(p1, p2, p3, p4);
+ return new AvroType(Tuple4.class, schema, new GenericRecordToTuple(TupleFactory.TUPLE4, p1, p2, p3, p4),
+ new TupleToGenericRecord(schema, p1, p2, p3, p4), new TupleDeepCopier(Tuple4.class, p1, p2, p3, p4), p1, p2,
+ p3, p4);
+ }
+
+ public static final AvroType<TupleN> tuples(PType... ptypes) {
+ Schema schema = createTupleSchema(ptypes);
+ return new AvroType(TupleN.class, schema, new GenericRecordToTuple(TupleFactory.TUPLEN, ptypes),
+ new TupleToGenericRecord(schema, ptypes), new TupleDeepCopier(TupleN.class, ptypes), ptypes);
+ }
+
+ public static <T extends Tuple> AvroType<T> tuples(Class<T> clazz, PType... ptypes) {
+ Schema schema = createTupleSchema(ptypes);
+ Class[] typeArgs = new Class[ptypes.length];
+ for (int i = 0; i < typeArgs.length; i++) {
+ typeArgs[i] = ptypes[i].getTypeClass();
+ }
+ TupleFactory<T> factory = TupleFactory.create(clazz, typeArgs);
+ return new AvroType<T>(clazz, schema, new GenericRecordToTuple(factory, ptypes), new TupleToGenericRecord(schema,
+ ptypes), new TupleDeepCopier(clazz, ptypes), ptypes);
+ }
+
+ private static Schema createTupleSchema(PType<?>... ptypes) {
+ // Guarantee each tuple schema has a globally unique name
+ String tupleName = "tuple" + UUID.randomUUID().toString().replace('-', 'x');
+ Schema schema = Schema.createRecord(tupleName, "", "crunch", false);
+ List<Schema.Field> fields = Lists.newArrayList();
+ for (int i = 0; i < ptypes.length; i++) {
+ AvroType atype = (AvroType) ptypes[i];
+ Schema fieldSchema = allowNulls(atype.getSchema());
+ fields.add(new Schema.Field("v" + i, fieldSchema, "", null));
+ }
+ schema.setFields(fields);
+ return schema;
+ }
+
+ public static final <S, T> AvroType<T> derived(Class<T> clazz, MapFn<S, T> inputFn, MapFn<T, S> outputFn,
+ PType<S> base) {
+ AvroType<S> abase = (AvroType<S>) base;
+ return new AvroType<T>(clazz, abase.getSchema(), new CompositeMapFn(abase.getInputMapFn(), inputFn),
+ new CompositeMapFn(outputFn, abase.getOutputMapFn()), new DeepCopier.NoOpDeepCopier<T>(), base.getSubTypes()
+ .toArray(new PType[0]));
+ }
+
+ public static <T> PType<T> jsons(Class<T> clazz) {
+ return PTypes.jsonString(clazz, AvroTypeFamily.getInstance());
+ }
+
+ public static final <K, V> AvroTableType<K, V> tableOf(PType<K> key, PType<V> value) {
+ if (key instanceof PTableType) {
+ PTableType ptt = (PTableType) key;
+ key = Avros.pairs(ptt.getKeyType(), ptt.getValueType());
+ }
+ if (value instanceof PTableType) {
+ PTableType ptt = (PTableType) value;
+ value = Avros.pairs(ptt.getKeyType(), ptt.getValueType());
+ }
+ AvroType<K> avroKey = (AvroType<K>) key;
+ AvroType<V> avroValue = (AvroType<V>) value;
+ return new AvroTableType(avroKey, avroValue, Pair.class);
+ }
+
+ private static final Schema NULL_SCHEMA = Schema.create(Type.NULL);
+
+ private static Schema allowNulls(Schema base) {
+ if (NULL_SCHEMA.equals(base)) {
+ return base;
+ }
+ return Schema.createUnion(ImmutableList.of(base, NULL_SCHEMA));
+ }
+
+ private static class ReflectGenericRecord extends GenericData.Record {
+
+ public ReflectGenericRecord(Schema schema) {
+ super(schema);
+ }
+
+ @Override
+ public int hashCode() {
+ return reflectAwareHashCode(this, getSchema());
+ }
+ }
+
+ /*
+ * TODO: Remove this once we no longer have to support 1.5.4.
+ */
+ private static int reflectAwareHashCode(Object o, Schema s) {
+ if (o == null)
+ return 0; // incomplete datum
+ int hashCode = 1;
+ switch (s.getType()) {
+ case RECORD:
+ for (Schema.Field f : s.getFields()) {
+ if (f.order() == Schema.Field.Order.IGNORE)
+ continue;
+ hashCode = hashCodeAdd(hashCode, ReflectData.get().getField(o, f.name(), f.pos()), f.schema());
+ }
+ return hashCode;
+ case ARRAY:
+ Collection<?> a = (Collection<?>) o;
+ Schema elementType = s.getElementType();
+ for (Object e : a)
+ hashCode = hashCodeAdd(hashCode, e, elementType);
+ return hashCode;
+ case UNION:
+ return reflectAwareHashCode(o, s.getTypes().get(ReflectData.get().resolveUnion(s, o)));
+ case ENUM:
+ return s.getEnumOrdinal(o.toString());
+ case NULL:
+ return 0;
+ case STRING:
+ return (o instanceof Utf8 ? o : new Utf8(o.toString())).hashCode();
+ default:
+ return o.hashCode();
+ }
+ }
+
+ /** Add the hash code for an object into an accumulated hash code. */
+ private static int hashCodeAdd(int hashCode, Object o, Schema s) {
+ return 31 * hashCode + reflectAwareHashCode(o, s);
+ }
+
+ private Avros() {
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/avro/ReflectDataFactory.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/avro/ReflectDataFactory.java b/crunch-core/src/main/java/org/apache/crunch/types/avro/ReflectDataFactory.java
new file mode 100644
index 0000000..e973cca
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/avro/ReflectDataFactory.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.avro;
+
+import org.apache.avro.Schema;
+import org.apache.avro.reflect.ReflectData;
+import org.apache.avro.reflect.ReflectDatumReader;
+import org.apache.avro.reflect.ReflectDatumWriter;
+
+/**
+ * A Factory class for constructing Avro reflection-related objects: the
+ * {@link ReflectData} instance and the reflect-based datum reader and writer
+ * for a given schema.
+ */
+public class ReflectDataFactory {
+
+  /** Returns the {@link ReflectData.AllowNull} instance. */
+  public ReflectData getReflectData() {
+    final ReflectData allowNull = ReflectData.AllowNull.get();
+    return allowNull;
+  }
+
+  /** Creates a new {@link ReflectDatumReader} for {@code schema}. */
+  public <T> ReflectDatumReader<T> getReader(Schema schema) {
+    final ReflectDatumReader<T> reader = new ReflectDatumReader<T>(schema);
+    return reader;
+  }
+
+  /** Creates a new {@link ReflectDatumWriter} for {@code schema}. */
+  public <T> ReflectDatumWriter<T> getWriter(Schema schema) {
+    final ReflectDatumWriter<T> writer = new ReflectDatumWriter<T>(schema);
+    return writer;
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/avro/SafeAvroSerialization.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/avro/SafeAvroSerialization.java b/crunch-core/src/main/java/org/apache/crunch/types/avro/SafeAvroSerialization.java
new file mode 100644
index 0000000..8bd18b0
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/avro/SafeAvroSerialization.java
@@ -0,0 +1,145 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.avro;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+import org.apache.avro.Schema;
+import org.apache.avro.io.BinaryDecoder;
+import org.apache.avro.io.BinaryEncoder;
+import org.apache.avro.io.DatumReader;
+import org.apache.avro.io.DatumWriter;
+import org.apache.avro.io.DecoderFactory;
+import org.apache.avro.io.EncoderFactory;
+import org.apache.avro.mapred.AvroJob;
+import org.apache.avro.mapred.AvroKey;
+import org.apache.avro.mapred.AvroValue;
+import org.apache.avro.mapred.AvroWrapper;
+import org.apache.avro.mapred.Pair;
+import org.apache.avro.reflect.ReflectDatumWriter;
+import org.apache.avro.specific.SpecificDatumReader;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.serializer.Deserializer;
+import org.apache.hadoop.io.serializer.Serialization;
+import org.apache.hadoop.io.serializer.Serializer;
+import org.apache.hadoop.util.ReflectionUtils;
+
+/** The {@link Serialization} used by jobs configured with {@link AvroJob}. */
+class SafeAvroSerialization<T> extends Configured implements Serialization<AvroWrapper<T>> {
+
+ public boolean accept(Class<?> c) {
+ return AvroWrapper.class.isAssignableFrom(c);
+ }
+
+ /**
+ * Returns the specified map output deserializer. Defaults to the final output
+ * deserializer if no map output schema was specified.
+ */
+ public Deserializer<AvroWrapper<T>> getDeserializer(Class<AvroWrapper<T>> c) {
+ boolean isKey = AvroKey.class.isAssignableFrom(c);
+ Configuration conf = getConf();
+ Schema schema = isKey ? Pair.getKeySchema(AvroJob.getMapOutputSchema(conf)) : Pair.getValueSchema(AvroJob
+ .getMapOutputSchema(conf));
+
+ DatumReader<T> datumReader = null;
+ if (conf.getBoolean(AvroJob.MAP_OUTPUT_IS_REFLECT, false)) {
+ ReflectDataFactory factory = (ReflectDataFactory) ReflectionUtils.newInstance(
+ conf.getClass("crunch.reflectdatafactory", ReflectDataFactory.class), conf);
+ datumReader = factory.getReader(schema);
+ } else {
+ datumReader = new SpecificDatumReader<T>(schema);
+ }
+ return new AvroWrapperDeserializer(datumReader, isKey);
+ }
+
+ private static final DecoderFactory FACTORY = DecoderFactory.get();
+
+ private class AvroWrapperDeserializer implements Deserializer<AvroWrapper<T>> {
+
+ private DatumReader<T> reader;
+ private BinaryDecoder decoder;
+ private boolean isKey;
+
+ public AvroWrapperDeserializer(DatumReader<T> reader, boolean isKey) {
+ this.reader = reader;
+ this.isKey = isKey;
+ }
+
+ public void open(InputStream in) {
+ this.decoder = FACTORY.directBinaryDecoder(in, decoder);
+ }
+
+ public AvroWrapper<T> deserialize(AvroWrapper<T> wrapper) throws IOException {
+ T datum = reader.read(wrapper == null ? null : wrapper.datum(), decoder);
+ if (wrapper == null) {
+ wrapper = isKey ? new AvroKey<T>(datum) : new AvroValue<T>(datum);
+ } else {
+ wrapper.datum(datum);
+ }
+ return wrapper;
+ }
+
+ public void close() throws IOException {
+ decoder.inputStream().close();
+ }
+ }
+
+ /** Returns the specified output serializer. */
+ public Serializer<AvroWrapper<T>> getSerializer(Class<AvroWrapper<T>> c) {
+ // AvroWrapper used for final output, AvroKey or AvroValue for map output
+ boolean isFinalOutput = c.equals(AvroWrapper.class);
+ Configuration conf = getConf();
+ Schema schema = isFinalOutput ? AvroJob.getOutputSchema(conf) : (AvroKey.class.isAssignableFrom(c) ? Pair
+ .getKeySchema(AvroJob.getMapOutputSchema(conf)) : Pair.getValueSchema(AvroJob.getMapOutputSchema(conf)));
+
+ ReflectDataFactory factory = Avros.getReflectDataFactory(conf);
+ ReflectDatumWriter<T> writer = factory.getWriter(schema);
+ return new AvroWrapperSerializer(writer);
+ }
+
+ private class AvroWrapperSerializer implements Serializer<AvroWrapper<T>> {
+ private DatumWriter<T> writer;
+ private OutputStream out;
+ private BinaryEncoder encoder;
+
+ public AvroWrapperSerializer(DatumWriter<T> writer) {
+ this.writer = writer;
+ }
+
+ public void open(OutputStream out) {
+ this.out = out;
+ this.encoder = new EncoderFactory().configureBlockSize(512).binaryEncoder(out, null);
+ }
+
+ public void serialize(AvroWrapper<T> wrapper) throws IOException {
+ writer.write(wrapper.datum(), encoder);
+ // would be a lot faster if the Serializer interface had a flush()
+ // method and the Hadoop framework called it when needed rather
+ // than for every record.
+ encoder.flush();
+ }
+
+ public void close() throws IOException {
+ out.close();
+ }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/avro/package-info.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/avro/package-info.java b/crunch-core/src/main/java/org/apache/crunch/types/avro/package-info.java
new file mode 100644
index 0000000..abaf60f
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/avro/package-info.java
@@ -0,0 +1,22 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Business object serialization using Apache Avro.
+ */
+package org.apache.crunch.types.avro;
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/package-info.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/package-info.java b/crunch-core/src/main/java/org/apache/crunch/types/package-info.java
new file mode 100644
index 0000000..b420b03
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/package-info.java
@@ -0,0 +1,22 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Common functionality for business object serialization.
+ */
+package org.apache.crunch.types;
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/writable/GenericArrayWritable.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/writable/GenericArrayWritable.java b/crunch-core/src/main/java/org/apache/crunch/types/writable/GenericArrayWritable.java
new file mode 100644
index 0000000..8b54008
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/writable/GenericArrayWritable.java
@@ -0,0 +1,135 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.writable;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.commons.lang.builder.HashCodeBuilder;
+import org.apache.crunch.CrunchRuntimeException;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableFactories;
+import org.apache.hadoop.io.WritableUtils;
+
+/**
+ * A {@link Writable} for marshalling/unmarshalling Collections. Note that
+ * element order is <em>undefined</em>!
+ *
+ * <p>Serialized form: the array length as a vint; if the length is non-zero,
+ * the number of null elements as a vint; if at least one element is non-null,
+ * the element class name followed by each non-null element. Null positions
+ * are not recorded: after {@link #readFields(DataInput)} the non-null
+ * elements occupy the leading slots and the remaining slots are null.
+ *
+ * @param <T> The value type
+ */
+class GenericArrayWritable<T> implements Writable {
+  private Writable[] values;
+  private Class<? extends Writable> valueClass;
+
+  /** Creates an instance whose elements are expected to be of {@code valueClass}. */
+  public GenericArrayWritable(Class<? extends Writable> valueClass) {
+    this.valueClass = valueClass;
+  }
+
+  public GenericArrayWritable() {
+    // for deserialization
+  }
+
+  /** Replaces the backing array; the array is stored as-is, not copied. */
+  public void set(Writable[] values) {
+    this.values = values;
+  }
+
+  /** Returns the backing array itself, not a copy. */
+  public Writable[] get() {
+    return values;
+  }
+
+  public void readFields(DataInput in) throws IOException {
+    final int length = WritableUtils.readVInt(in);
+    values = new Writable[length]; // construct values
+    if (length == 0) {
+      return;
+    }
+    final int nullCount = WritableUtils.readVInt(in);
+    if (nullCount == length) {
+      // Nothing but nulls: no class name and no elements follow.
+      return;
+    }
+    setValueType(Text.readString(in));
+    final int present = length - nullCount;
+    for (int slot = 0; slot < present; slot++) {
+      final Writable element = WritableFactories.newInstance(valueClass);
+      element.readFields(in); // read a value
+      values[slot] = element; // store it in values
+    }
+  }
+
+  /**
+   * Resolves the element class from its name when none is set yet; otherwise
+   * verifies that the incoming name matches the class already in use.
+   *
+   * @throws CrunchRuntimeException if the named class cannot be loaded
+   * @throws IllegalStateException if the name differs from the current class
+   */
+  protected void setValueType(String valueType) {
+    if (valueClass != null) {
+      if (!valueType.equals(valueClass.getName())) {
+        throw new IllegalStateException("Incoming " + valueType + " is not " + valueClass);
+      }
+      return;
+    }
+    try {
+      valueClass = Class.forName(valueType).asSubclass(Writable.class);
+    } catch (ClassNotFoundException e) {
+      throw new CrunchRuntimeException(e);
+    }
+  }
+
+  public void write(DataOutput out) throws IOException {
+    final int length = values.length;
+    WritableUtils.writeVInt(out, length);
+    if (length == 0) {
+      return;
+    }
+    int nullCount = 0;
+    for (Writable element : values) {
+      if (element == null) {
+        nullCount++;
+      }
+    }
+    WritableUtils.writeVInt(out, nullCount);
+    if (nullCount == length) {
+      return;
+    }
+    if (valueClass == null) {
+      throw new IllegalStateException("Value class not set by constructor or read");
+    }
+    Text.writeString(out, valueClass.getName());
+    for (Writable element : values) {
+      if (element != null) {
+        element.write(out);
+      }
+    }
+  }
+
+  @Override
+  public int hashCode() {
+    return new HashCodeBuilder().append(values).toHashCode();
+  }
+
+  /** Two instances are equal when they are of the same class and their arrays are element-wise equal. */
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (obj == null || getClass() != obj.getClass()) {
+      return false;
+    }
+    final GenericArrayWritable<?> that = (GenericArrayWritable<?>) obj;
+    return Arrays.equals(values, that.values);
+  }
+
+  @Override
+  public String toString() {
+    return Arrays.toString(values);
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/writable/TextMapWritable.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/writable/TextMapWritable.java b/crunch-core/src/main/java/org/apache/crunch/types/writable/TextMapWritable.java
new file mode 100644
index 0000000..1ab51df
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/writable/TextMapWritable.java
@@ -0,0 +1,88 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.writable;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableUtils;
+
+import com.google.common.collect.Maps;
+
+/**
+ * A {@link Writable} map from {@link Text} keys to {@link Writable} values
+ * that all share one class.
+ *
+ * <p>Serialized form: the value class name, the entry count as a vint, then
+ * each key followed by its value.
+ *
+ * @param <T> the value type
+ */
+class TextMapWritable<T extends Writable> implements Writable {
+
+  private Class<T> valueClazz;
+  private final Map<Text, T> instance;
+
+  /** Creates an empty map with no value class; one is set by {@link #readFields(DataInput)}. */
+  public TextMapWritable() {
+    this.instance = Maps.newHashMap();
+  }
+
+  /** Creates an empty map whose values are of {@code valueClazz}. */
+  public TextMapWritable(Class<T> valueClazz) {
+    this.valueClazz = valueClazz;
+    this.instance = Maps.newHashMap();
+  }
+
+  /** Adds or replaces the entry for {@code txt}. */
+  public void put(Text txt, T value) {
+    instance.put(txt, value);
+  }
+
+  /** Returns the live entry set of the backing map. */
+  public Set<Map.Entry<Text, T>> entrySet() {
+    return instance.entrySet();
+  }
+
+  /**
+   * Clears the map and repopulates it from {@code in}. Values are created
+   * through the no-argument constructor of the class named in the stream.
+   *
+   * @throws IOException if the value class cannot be loaded or instantiated,
+   *           or if reading from {@code in} fails
+   */
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    instance.clear();
+    try {
+      // The stream carries only a class name, so the cast cannot be checked here.
+      @SuppressWarnings("unchecked")
+      Class<T> clazz = (Class<T>) Class.forName(Text.readString(in));
+      this.valueClazz = clazz;
+    } catch (ClassNotFoundException e) {
+      throw new IOException("Failed map init", e);
+    }
+    int entries = WritableUtils.readVInt(in);
+    try {
+      for (int i = 0; i < entries; i++) {
+        Text txt = new Text();
+        txt.readFields(in);
+        T value = valueClazz.newInstance();
+        value.readFields(in);
+        instance.put(txt, value);
+      }
+    } catch (IllegalAccessException e) {
+      throw new IOException("Failed map init", e);
+    } catch (InstantiationException e) {
+      throw new IOException("Failed map init", e);
+    }
+  }
+
+  /**
+   * Writes the value class name, the entry count and every entry.
+   *
+   * @throws IllegalStateException if no value class was supplied to the
+   *           constructor or learned from an earlier read; the class name is a
+   *           mandatory part of the serialized form
+   */
+  @Override
+  public void write(DataOutput out) throws IOException {
+    if (valueClazz == null) {
+      // Fail with a diagnosable message instead of a bare NullPointerException.
+      throw new IllegalStateException("Value class not set by constructor or read");
+    }
+    Text.writeString(out, valueClazz.getName());
+    WritableUtils.writeVInt(out, instance.size());
+    for (Map.Entry<Text, T> e : instance.entrySet()) {
+      e.getKey().write(out);
+      e.getValue().write(out);
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/writable/TupleWritable.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/writable/TupleWritable.java b/crunch-core/src/main/java/org/apache/crunch/types/writable/TupleWritable.java
new file mode 100644
index 0000000..1c3536b
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/writable/TupleWritable.java
@@ -0,0 +1,224 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.writable;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.commons.lang.builder.HashCodeBuilder;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.io.WritableUtils;
+
+/**
+ * A straight copy of the TupleWritable implementation in the join package,
+ * added here because of its package visibility restrictions.
+ *
+ * <p>Which positions hold a value is tracked in a 64-bit mask, so only
+ * positions 0 to 63 can be told apart.
+ */
+public class TupleWritable implements WritableComparable<TupleWritable> {
+
+  private long written;
+  private Writable[] values;
+
+  /**
+   * Create an empty tuple with no allocated storage for writables.
+   */
+  public TupleWritable() {
+  }
+
+  /**
+   * Initialize tuple with storage; unknown whether any of them contain
+   * "written" values.
+   */
+  public TupleWritable(Writable[] vals) {
+    written = 0L;
+    values = vals;
+  }
+
+  /**
+   * Return true if tuple has an element at the position provided.
+   */
+  public boolean has(int i) {
+    // 1L, not 1: an int shift wraps at 32 bits and sign-extends at bit 31.
+    return 0L != ((1L << i) & written);
+  }
+
+  /**
+   * Get ith Writable from Tuple.
+   */
+  public Writable get(int i) {
+    return values[i];
+  }
+
+  /**
+   * The number of children in this Tuple.
+   */
+  public int size() {
+    return values.length;
+  }
+
+  /**
+   * Two tuples are equal when they have the same size, the same set of written
+   * positions, and equal values at every written position. Values stored at
+   * positions that are not marked as written are ignored.
+   */
+  @Override
+  public boolean equals(Object other) {
+    if (other instanceof TupleWritable) {
+      TupleWritable that = (TupleWritable) other;
+      if (this.size() != that.size() || this.written != that.written) {
+        return false;
+      }
+      for (int i = 0; i < values.length; ++i) {
+        if (!has(i))
+          continue;
+        if (!values[i].equals(that.get(i))) {
+          return false;
+        }
+      }
+      return true;
+    }
+    return false;
+  }
+
+  /**
+   * Hashes the written mask and the values at written positions. Positions
+   * that are not written contribute as null, so stale objects left in those
+   * slots cannot give equal tuples different hash codes.
+   */
+  @Override
+  public int hashCode() {
+    HashCodeBuilder builder = new HashCodeBuilder();
+    builder.append(written);
+    for (int i = 0; i < values.length; ++i) {
+      builder.append(has(i) ? values[i] : null);
+    }
+    return builder.toHashCode();
+  }
+
+  /**
+   * Convert Tuple to String as in the following.
+   * {@code [<child1>,<child2>,...,<childn>]}
+   * Positions that are not written are rendered as empty strings.
+   */
+  @Override
+  public String toString() {
+    StringBuilder buf = new StringBuilder("[");
+    for (int i = 0; i < values.length; ++i) {
+      buf.append(has(i) ? values[i].toString() : "");
+      buf.append(",");
+    }
+    if (values.length != 0)
+      buf.setCharAt(buf.length() - 1, ']');
+    else
+      buf.append(']');
+    return buf.toString();
+  }
+
+  /**
+   * Writes each Writable to <code>out</code>. TupleWritable format:
+   * {@code
+   * <count><written><type1><type2>...<typen><obj1><obj2>...<objn>
+   * }
+   * where types and objects are emitted only for written positions.
+   */
+  public void write(DataOutput out) throws IOException {
+    WritableUtils.writeVInt(out, values.length);
+    WritableUtils.writeVLong(out, written);
+    for (int i = 0; i < values.length; ++i) {
+      if (has(i)) {
+        Text.writeString(out, values[i].getClass().getName());
+      }
+    }
+    for (int i = 0; i < values.length; ++i) {
+      if (has(i)) {
+        values[i].write(out);
+      }
+    }
+  }
+
+  /**
+   * Replaces this tuple's contents with the tuple read from {@code in}.
+   * Positions that were not written are left null.
+   *
+   * @throws IOException if an element class cannot be loaded or instantiated,
+   *           or if reading from {@code in} fails
+   */
+  @SuppressWarnings("unchecked")
+  // No static typeinfo on Tuples
+  public void readFields(DataInput in) throws IOException {
+    int card = WritableUtils.readVInt(in);
+    values = new Writable[card];
+    written = WritableUtils.readVLong(in);
+    Class<? extends Writable>[] cls = new Class[card];
+    try {
+      for (int i = 0; i < card; ++i) {
+        if (has(i)) {
+          cls[i] = Class.forName(Text.readString(in)).asSubclass(Writable.class);
+        }
+      }
+      for (int i = 0; i < card; ++i) {
+        if (has(i)) {
+          values[i] = cls[i].newInstance();
+          values[i].readFields(in);
+        }
+      }
+    } catch (ClassNotFoundException e) {
+      throw new IOException("Failed tuple init", e);
+    } catch (IllegalAccessException e) {
+      throw new IOException("Failed tuple init", e);
+    } catch (InstantiationException e) {
+      throw new IOException("Failed tuple init", e);
+    }
+  }
+
+  /**
+   * Record that the tuple contains an element at the position provided.
+   */
+  public void setWritten(int i) {
+    written |= 1L << i;
+  }
+
+  /**
+   * Record that the tuple does not contain an element at the position provided.
+   */
+  public void clearWritten(int i) {
+    written &= ~(1L << i);
+  }
+
+  /**
+   * Clear any record of which writables have been written to, without releasing
+   * storage.
+   */
+  public void clearWritten() {
+    written = 0L;
+  }
+
+  /**
+   * Orders tuples position by position. A written position sorts after a
+   * missing one; positions missing on both sides are skipped, as in
+   * {@link #equals(Object)}. Written values are compared with
+   * {@code compareTo} when both are {@link WritableComparable}, otherwise by
+   * hash code. If all shared positions tie, the shorter tuple sorts first.
+   */
+  @Override
+  @SuppressWarnings({ "unchecked", "rawtypes" })
+  public int compareTo(TupleWritable o) {
+    // Only walk positions that exist in both tuples.
+    final int shared = Math.min(values.length, o.values.length);
+    for (int i = 0; i < shared; ++i) {
+      final boolean mine = has(i);
+      final boolean theirs = o.has(i);
+      if (mine != theirs) {
+        return mine ? 1 : -1;
+      }
+      if (!mine) {
+        continue;
+      }
+      Writable v1 = values[i];
+      Writable v2 = o.values[i];
+      if (v1 == v2) {
+        continue;
+      }
+      // Defensive: a position marked written should not hold null.
+      if (v1 == null) {
+        return -1;
+      }
+      if (v2 == null) {
+        return 1;
+      }
+      if (v1.equals(v2)) {
+        continue;
+      }
+      int cmp;
+      if (v1 instanceof WritableComparable && v2 instanceof WritableComparable) {
+        cmp = ((WritableComparable) v1).compareTo((WritableComparable) v2);
+      } else {
+        // Compare rather than subtract: the difference of two ints can overflow.
+        final int h1 = v1.hashCode();
+        final int h2 = v2.hashCode();
+        cmp = (h1 < h2) ? -1 : ((h1 > h2) ? 1 : 0);
+      }
+      if (cmp != 0) {
+        return cmp;
+      }
+    }
+    return values.length - o.values.length;
+  }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/writable/WritableDeepCopier.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/writable/WritableDeepCopier.java b/crunch-core/src/main/java/org/apache/crunch/types/writable/WritableDeepCopier.java
new file mode 100644
index 0000000..7b6e11b
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/writable/WritableDeepCopier.java
@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.writable;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+
+import org.apache.crunch.CrunchRuntimeException;
+import org.apache.crunch.types.DeepCopier;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Writable;
+
+/**
+ * Performs deep copies of Writable values by serializing the source into an
+ * in-memory buffer and reading that buffer back into a new instance.
+ *
+ * @param <T> The type of Writable that can be copied
+ */
+public class WritableDeepCopier<T extends Writable> implements DeepCopier<T> {
+
+  private Class<T> writableClass;
+
+  /**
+   * @param writableClass class instantiated, via its no-argument constructor,
+   *          for every copy
+   */
+  public WritableDeepCopier(Class<T> writableClass) {
+    this.writableClass = writableClass;
+  }
+
+  @Override
+  public void initialize(Configuration conf) {
+  }
+
+  /**
+   * Returns an independent copy of {@code source}, or null when {@code source}
+   * is null.
+   *
+   * @throws CrunchRuntimeException if writing, instantiating or reading fails
+   */
+  @Override
+  public T deepCopy(T source) {
+    if (source == null) {
+      return null;
+    }
+
+    final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+    final DataOutputStream sink = new DataOutputStream(buffer);
+    try {
+      source.write(sink);
+      sink.flush();
+      final byte[] serialized = buffer.toByteArray();
+      final DataInput replay = new DataInputStream(new ByteArrayInputStream(serialized));
+      final T duplicate = writableClass.newInstance();
+      duplicate.readFields(replay);
+      return duplicate;
+    } catch (Exception e) {
+      throw new CrunchRuntimeException("Error while deep copying " + source, e);
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/writable/WritableGroupedTableType.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/writable/WritableGroupedTableType.java b/crunch-core/src/main/java/org/apache/crunch/types/writable/WritableGroupedTableType.java
new file mode 100644
index 0000000..84318d3
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/writable/WritableGroupedTableType.java
@@ -0,0 +1,85 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.writable;
+
+import org.apache.crunch.GroupingOptions;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.Pair;
+import org.apache.crunch.lib.PTables;
+import org.apache.crunch.types.Converter;
+import org.apache.crunch.types.PGroupedTableType;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.Job;
+
+/**
+ * The grouped form of a {@link WritableTableType}: wires up the map functions
+ * and the {@link WritablePairConverter} built from the table's key and value
+ * {@link WritableType}s.
+ */
+class WritableGroupedTableType<K, V> extends PGroupedTableType<K, V> {
+
+  private final MapFn inputFn;
+  private final MapFn outputFn;
+  private final Converter converter;
+
+  /**
+   * @param tableType the table type being grouped; its key and value types
+   *          are cast to {@link WritableType}
+   */
+  public WritableGroupedTableType(WritableTableType<K, V> tableType) {
+    super(tableType);
+    final WritableType keyWritableType = (WritableType) tableType.getKeyType();
+    final WritableType valueWritableType = (WritableType) tableType.getValueType();
+    final MapFn keyInputFn = keyWritableType.getInputMapFn();
+    final MapFn valueInputFn = valueWritableType.getInputMapFn();
+    this.inputFn = new PairIterableMapFn(keyInputFn, valueInputFn);
+    this.outputFn = tableType.getOutputMapFn();
+    this.converter = new WritablePairConverter(keyWritableType.getSerializationClass(),
+        valueWritableType.getSerializationClass());
+  }
+
+  /** Returns the runtime class of a {@link Pair}, obtained from a throwaway instance. */
+  @Override
+  public Class<Pair<K, Iterable<V>>> getTypeClass() {
+    return (Class<Pair<K, Iterable<V>>>) Pair.of(null, null).getClass();
+  }
+
+  @Override
+  public Converter getGroupingConverter() {
+    return converter;
+  }
+
+  @Override
+  public MapFn getInputMapFn() {
+    return inputFn;
+  }
+
+  @Override
+  public MapFn getOutputMapFn() {
+    return outputFn;
+  }
+
+  /** Delegates initialization to the underlying table type. */
+  @Override
+  public void initialize(Configuration conf) {
+    tableType.initialize(conf);
+  }
+
+  @Override
+  public Pair<K, Iterable<V>> getDetachedValue(Pair<K, Iterable<V>> value) {
+    return PTables.getGroupedDetachedValue(this, value);
+  }
+
+  /**
+   * Applies the grouping options, when given, and then registers the key and
+   * value serialization classes as the job's map output classes.
+   */
+  @Override
+  public void configureShuffle(Job job, GroupingOptions options) {
+    if (options != null) {
+      options.configure(job);
+    }
+    final WritableType keyWritableType = (WritableType) tableType.getKeyType();
+    final WritableType valueWritableType = (WritableType) tableType.getValueType();
+    job.setMapOutputKeyClass(keyWritableType.getSerializationClass());
+    job.setMapOutputValueClass(valueWritableType.getSerializationClass());
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/writable/WritablePairConverter.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/writable/WritablePairConverter.java b/crunch-core/src/main/java/org/apache/crunch/types/writable/WritablePairConverter.java
new file mode 100644
index 0000000..2db0238
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/writable/WritablePairConverter.java
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.writable;
+
+import org.apache.crunch.Pair;
+import org.apache.crunch.types.Converter;
+
+class WritablePairConverter<K, V> implements Converter<K, V, Pair<K, V>, Pair<K, Iterable<V>>> {
+
+ private final Class<K> keyClass;
+ private final Class<V> valueClass;
+
+ public WritablePairConverter(Class<K> keyClass, Class<V> valueClass) {
+ this.keyClass = keyClass;
+ this.valueClass = valueClass;
+ }
+
+ @Override
+ public Pair<K, V> convertInput(K key, V value) {
+ return Pair.of(key, value);
+ }
+
+ @Override
+ public K outputKey(Pair<K, V> value) {
+ return value.first();
+ }
+
+ @Override
+ public V outputValue(Pair<K, V> value) {
+ return value.second();
+ }
+
+ @Override
+ public Class<K> getKeyClass() {
+ return keyClass;
+ }
+
+ @Override
+ public Class<V> getValueClass() {
+ return valueClass;
+ }
+
+ @Override
+ public Pair<K, Iterable<V>> convertIterableInput(K key, Iterable<V> value) {
+ return Pair.of(key, value);
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/writable/WritableTableType.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/writable/WritableTableType.java b/crunch-core/src/main/java/org/apache/crunch/types/writable/WritableTableType.java
new file mode 100644
index 0000000..93e0fd6
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/writable/WritableTableType.java
@@ -0,0 +1,130 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.writable;
+
+import java.util.List;
+
+import org.apache.commons.lang.builder.HashCodeBuilder;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.Pair;
+import org.apache.crunch.fn.PairMapFn;
+import org.apache.crunch.io.ReadableSourceTarget;
+import org.apache.crunch.io.seq.SeqFileTableSourceTarget;
+import org.apache.crunch.lib.PTables;
+import org.apache.crunch.types.Converter;
+import org.apache.crunch.types.PGroupedTableType;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Writable;
+
+import com.google.common.collect.ImmutableList;
+
+class WritableTableType<K, V> implements PTableType<K, V> {
+
+ private final WritableType<K, Writable> keyType;
+ private final WritableType<V, Writable> valueType;
+ private final MapFn inputFn;
+ private final MapFn outputFn;
+ private final Converter converter;
+
+ public WritableTableType(WritableType<K, Writable> keyType, WritableType<V, Writable> valueType) {
+ this.keyType = keyType;
+ this.valueType = valueType;
+ this.inputFn = new PairMapFn(keyType.getInputMapFn(), valueType.getInputMapFn());
+ this.outputFn = new PairMapFn(keyType.getOutputMapFn(), valueType.getOutputMapFn());
+ this.converter = new WritablePairConverter(keyType.getSerializationClass(),
+ valueType.getSerializationClass());
+ }
+
+ @Override
+ public Class<Pair<K, V>> getTypeClass() {
+ return (Class<Pair<K, V>>) Pair.of(null, null).getClass();
+ }
+
+ @Override
+ public List<PType> getSubTypes() {
+ return ImmutableList.<PType> of(keyType, valueType);
+ }
+
+ @Override
+ public MapFn getInputMapFn() {
+ return inputFn;
+ }
+
+ @Override
+ public MapFn getOutputMapFn() {
+ return outputFn;
+ }
+
+ @Override
+ public Converter getConverter() {
+ return converter;
+ }
+
+ @Override
+ public PTypeFamily getFamily() {
+ return WritableTypeFamily.getInstance();
+ }
+
+ public PType<K> getKeyType() {
+ return keyType;
+ }
+
+ public PType<V> getValueType() {
+ return valueType;
+ }
+
+ @Override
+ public PGroupedTableType<K, V> getGroupedTableType() {
+ return new WritableGroupedTableType<K, V>(this);
+ }
+
+ @Override
+ public ReadableSourceTarget<Pair<K, V>> getDefaultFileSource(Path path) {
+ return new SeqFileTableSourceTarget<K, V>(path, this);
+ }
+
+ @Override
+ public void initialize(Configuration conf) {
+ keyType.initialize(conf);
+ valueType.initialize(conf);
+ }
+
+ @Override
+ public Pair<K, V> getDetachedValue(Pair<K, V> value) {
+ return PTables.getDetachedValue(this, value);
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (obj == null || !(obj instanceof WritableTableType)) {
+ return false;
+ }
+ WritableTableType that = (WritableTableType) obj;
+ return keyType.equals(that.keyType) && valueType.equals(that.valueType);
+ }
+
+ @Override
+ public int hashCode() {
+ HashCodeBuilder hcb = new HashCodeBuilder();
+ return hcb.append(keyType).append(valueType).toHashCode();
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/writable/WritableType.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/writable/WritableType.java b/crunch-core/src/main/java/org/apache/crunch/types/writable/WritableType.java
new file mode 100644
index 0000000..734946c
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/writable/WritableType.java
@@ -0,0 +1,133 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.writable;
+
+import java.util.List;
+
+import org.apache.commons.lang.builder.HashCodeBuilder;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.io.ReadableSourceTarget;
+import org.apache.crunch.io.seq.SeqFileSourceTarget;
+import org.apache.crunch.types.Converter;
+import org.apache.crunch.types.DeepCopier;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Writable;
+
+import com.google.common.collect.ImmutableList;
+
+public class WritableType<T, W extends Writable> implements PType<T> {
+
+ private final Class<T> typeClass;
+ private final Class<W> writableClass;
+ private final Converter converter;
+ private final MapFn<W, T> inputFn;
+ private final MapFn<T, W> outputFn;
+ private final DeepCopier<W> deepCopier;
+ private final List<PType> subTypes;
+ private boolean initialized = false;
+
+ public WritableType(Class<T> typeClass, Class<W> writableClass, MapFn<W, T> inputDoFn,
+ MapFn<T, W> outputDoFn, PType... subTypes) {
+ this.typeClass = typeClass;
+ this.writableClass = writableClass;
+ this.inputFn = inputDoFn;
+ this.outputFn = outputDoFn;
+ this.converter = new WritableValueConverter(writableClass);
+ this.deepCopier = new WritableDeepCopier<W>(writableClass);
+ this.subTypes = ImmutableList.<PType> builder().add(subTypes).build();
+ }
+
+ @Override
+ public PTypeFamily getFamily() {
+ return WritableTypeFamily.getInstance();
+ }
+
+ @Override
+ public Class<T> getTypeClass() {
+ return typeClass;
+ }
+
+ @Override
+ public Converter getConverter() {
+ return converter;
+ }
+
+ @Override
+ public MapFn getInputMapFn() {
+ return inputFn;
+ }
+
+ @Override
+ public MapFn getOutputMapFn() {
+ return outputFn;
+ }
+
+ @Override
+ public List<PType> getSubTypes() {
+ return subTypes;
+ }
+
+ public Class<W> getSerializationClass() {
+ return writableClass;
+ }
+
+ @Override
+ public ReadableSourceTarget<T> getDefaultFileSource(Path path) {
+ return new SeqFileSourceTarget<T>(path, this);
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (obj == null || !(obj instanceof WritableType)) {
+ return false;
+ }
+ WritableType wt = (WritableType) obj;
+ return (typeClass.equals(wt.typeClass) && writableClass.equals(wt.writableClass) && subTypes
+ .equals(wt.subTypes));
+ }
+
+ @Override
+ public void initialize(Configuration conf) {
+ this.inputFn.initialize();
+ this.outputFn.initialize();
+ for (PType subType : subTypes) {
+ subType.initialize(conf);
+ }
+ this.initialized = true;
+ }
+
+ @Override
+ public T getDetachedValue(T value) {
+ if (!initialized) {
+ throw new IllegalStateException("Cannot call getDetachedValue on an uninitialized PType");
+ }
+ W writableValue = outputFn.map(value);
+ W deepCopy = this.deepCopier.deepCopy(writableValue);
+ return inputFn.map(deepCopy);
+ }
+
+ @Override
+ public int hashCode() {
+ HashCodeBuilder hcb = new HashCodeBuilder();
+ hcb.append(typeClass).append(writableClass).append(subTypes);
+ return hcb.toHashCode();
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/writable/WritableTypeFamily.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/writable/WritableTypeFamily.java b/crunch-core/src/main/java/org/apache/crunch/types/writable/WritableTypeFamily.java
new file mode 100644
index 0000000..a94db96
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/writable/WritableTypeFamily.java
@@ -0,0 +1,147 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.writable;
+
+import java.nio.ByteBuffer;
+import java.util.Collection;
+import java.util.Map;
+
+import org.apache.crunch.MapFn;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Tuple;
+import org.apache.crunch.Tuple3;
+import org.apache.crunch.Tuple4;
+import org.apache.crunch.TupleN;
+import org.apache.crunch.types.PGroupedTableType;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.crunch.types.PTypeUtils;
+import org.apache.hadoop.io.Writable;
+
+/**
+ * The {@link Writable}-based implementation of the
+ * {@link org.apache.crunch.types.PTypeFamily} interface.
+ */
+public class WritableTypeFamily implements PTypeFamily {
+
+ private static final WritableTypeFamily INSTANCE = new WritableTypeFamily();
+
+ public static WritableTypeFamily getInstance() {
+ return INSTANCE;
+ }
+
+ // Disallow construction
+ private WritableTypeFamily() {
+ }
+
+ public PType<Void> nulls() {
+ return Writables.nulls();
+ }
+
+ public PType<String> strings() {
+ return Writables.strings();
+ }
+
+ public PType<Long> longs() {
+ return Writables.longs();
+ }
+
+ public PType<Integer> ints() {
+ return Writables.ints();
+ }
+
+ public PType<Float> floats() {
+ return Writables.floats();
+ }
+
+ public PType<Double> doubles() {
+ return Writables.doubles();
+ }
+
+ public PType<Boolean> booleans() {
+ return Writables.booleans();
+ }
+
+ public PType<ByteBuffer> bytes() {
+ return Writables.bytes();
+ }
+
+ public <T> PType<T> records(Class<T> clazz) {
+ return Writables.records(clazz);
+ }
+
+ public <W extends Writable> PType<W> writables(Class<W> clazz) {
+ return Writables.writables(clazz);
+ }
+
+ public <K, V> PTableType<K, V> tableOf(PType<K> key, PType<V> value) {
+ return Writables.tableOf(key, value);
+ }
+
+ public <V1, V2> PType<Pair<V1, V2>> pairs(PType<V1> p1, PType<V2> p2) {
+ return Writables.pairs(p1, p2);
+ }
+
+ public <V1, V2, V3> PType<Tuple3<V1, V2, V3>> triples(PType<V1> p1, PType<V2> p2, PType<V3> p3) {
+ return Writables.triples(p1, p2, p3);
+ }
+
+ public <V1, V2, V3, V4> PType<Tuple4<V1, V2, V3, V4>> quads(PType<V1> p1, PType<V2> p2, PType<V3> p3, PType<V4> p4) {
+ return Writables.quads(p1, p2, p3, p4);
+ }
+
+ public PType<TupleN> tuples(PType<?>... ptypes) {
+ return Writables.tuples(ptypes);
+ }
+
+ public <T> PType<Collection<T>> collections(PType<T> ptype) {
+ return Writables.collections(ptype);
+ }
+
+ public <T> PType<Map<String, T>> maps(PType<T> ptype) {
+ return Writables.maps(ptype);
+ }
+
+ @Override
+ public <T> PType<T> as(PType<T> ptype) {
+ if (ptype instanceof WritableType || ptype instanceof WritableTableType
+ || ptype instanceof WritableGroupedTableType) {
+ return ptype;
+ }
+ if (ptype instanceof PGroupedTableType) {
+ PTableType ptt = ((PGroupedTableType) ptype).getTableType();
+ return new WritableGroupedTableType((WritableTableType) as(ptt));
+ }
+ PType<T> prim = Writables.getPrimitiveType(ptype.getTypeClass());
+ if (prim != null) {
+ return prim;
+ }
+ return PTypeUtils.convert(ptype, this);
+ }
+
+ @Override
+ public <T extends Tuple> PType<T> tuples(Class<T> clazz, PType<?>... ptypes) {
+ return Writables.tuples(clazz, ptypes);
+ }
+
+ @Override
+ public <S, T> PType<T> derived(Class<T> clazz, MapFn<S, T> inputFn, MapFn<T, S> outputFn, PType<S> base) {
+ return Writables.derived(clazz, inputFn, outputFn, base);
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/writable/WritableValueConverter.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/writable/WritableValueConverter.java b/crunch-core/src/main/java/org/apache/crunch/types/writable/WritableValueConverter.java
new file mode 100644
index 0000000..3670b90
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/writable/WritableValueConverter.java
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.writable;
+
+import org.apache.crunch.types.Converter;
+import org.apache.hadoop.io.NullWritable;
+
+class WritableValueConverter<W> implements Converter<Object, W, W, Iterable<W>> {
+
+ private final Class<W> serializationClass;
+
+ public WritableValueConverter(Class<W> serializationClass) {
+ this.serializationClass = serializationClass;
+ }
+
+ @Override
+ public W convertInput(Object key, W value) {
+ return value;
+ }
+
+ @Override
+ public Object outputKey(W value) {
+ return NullWritable.get();
+ }
+
+ @Override
+ public W outputValue(W value) {
+ return value;
+ }
+
+ @Override
+ public Class<Object> getKeyClass() {
+ return (Class<Object>) (Class<?>) NullWritable.class;
+ }
+
+ @Override
+ public Class<W> getValueClass() {
+ return serializationClass;
+ }
+
+ @Override
+ public Iterable<W> convertIterableInput(Object key, Iterable<W> value) {
+ return value;
+ }
+}
\ No newline at end of file
[21/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/PObjectsIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/PObjectsIT.java b/crunch/src/it/java/org/apache/crunch/PObjectsIT.java
deleted file mode 100644
index 6ee849f..0000000
--- a/crunch/src/it/java/org/apache/crunch/PObjectsIT.java
+++ /dev/null
@@ -1,99 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.IOException;
-import java.lang.Integer;
-import java.lang.Iterable;
-import java.lang.String;
-import java.util.Iterator;
-
-import org.apache.crunch.PCollection;
-import org.apache.crunch.impl.mem.MemPipeline;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.materialize.pobject.PObjectImpl;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.junit.Rule;
-import org.junit.Test;
-
-@SuppressWarnings("serial")
-public class PObjectsIT {
-
- private static final Integer LINES_IN_SHAKES = 3667;
-
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
-
- /**
- * A mock PObject that should map PCollections of strings to an integer count of the number of
- * elements in the underlying PCollection.
- */
- public static class MockPObjectImpl extends PObjectImpl<String, Integer> {
- private int numProcessCalls;
-
- public MockPObjectImpl(PCollection<String> collect) {
- super(collect);
- numProcessCalls = 0;
- }
-
- @Override
- public Integer process(Iterable<String> input) {
- numProcessCalls++;
- int i = 0;
- Iterator<String> itr = input.iterator();
- while (itr.hasNext()) {
- i++;
- itr.next();
- }
- return i;
- }
-
- public int getNumProcessCalls() {
- return numProcessCalls;
- }
- }
-
- @Test
- public void testMRPipeline() throws IOException {
- run(new MRPipeline(PObjectsIT.class, tmpDir.getDefaultConfiguration()));
- }
-
- @Test
- public void testInMemoryPipeline() throws IOException {
- run(MemPipeline.getInstance());
- }
-
- public void run(Pipeline pipeline) throws IOException {
- String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
- PCollection<String> shakespeare = pipeline.readTextFile(shakesInputPath);
- MockPObjectImpl lineCount = new MockPObjectImpl(shakespeare);
- // Get the line count once and verify it's correctness.
- assertEquals("Incorrect number of lines counted from PCollection.", LINES_IN_SHAKES,
- lineCount.getValue());
- // And do it again.
- assertEquals("Incorrect number of lines counted from PCollection.", LINES_IN_SHAKES,
- lineCount.getValue());
- // Make sure process was called only once because the PObject's value was cached after the
- // first call.
- assertEquals("Process on PObject not called exactly 1 times.", 1,
- lineCount.getNumProcessCalls());
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/PTableKeyValueIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/PTableKeyValueIT.java b/crunch/src/it/java/org/apache/crunch/PTableKeyValueIT.java
deleted file mode 100644
index d56e122..0000000
--- a/crunch/src/it/java/org/apache/crunch/PTableKeyValueIT.java
+++ /dev/null
@@ -1,103 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import java.io.IOException;
-import java.io.Serializable;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-
-import junit.framework.Assert;
-
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.io.At;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.crunch.types.avro.AvroTypeFamily;
-import org.apache.crunch.types.writable.WritableTypeFamily;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
-import org.junit.runners.Parameterized.Parameters;
-
-import com.google.common.collect.Lists;
-
-@RunWith(value = Parameterized.class)
-public class PTableKeyValueIT implements Serializable {
-
- private static final long serialVersionUID = 4374227704751746689L;
-
- private transient PTypeFamily typeFamily;
- private transient MRPipeline pipeline;
- private transient String inputFile;
- @Rule
- public transient TemporaryPath tmpDir = TemporaryPaths.create();
-
- @Before
- public void setUp() throws IOException {
- pipeline = new MRPipeline(PTableKeyValueIT.class, tmpDir.getDefaultConfiguration());
- inputFile = tmpDir.copyResourceFileName("set1.txt");
- }
-
- @After
- public void tearDown() {
- pipeline.done();
- }
-
- public PTableKeyValueIT(PTypeFamily typeFamily) {
- this.typeFamily = typeFamily;
- }
-
- @Parameters
- public static Collection<Object[]> data() {
- Object[][] data = new Object[][] { { WritableTypeFamily.getInstance() }, { AvroTypeFamily.getInstance() } };
- return Arrays.asList(data);
- }
-
- @Test
- public void testKeysAndValues() throws Exception {
-
- PCollection<String> collection = pipeline.read(At.textFile(inputFile, typeFamily.strings()));
-
- PTable<String, String> table = collection.parallelDo(new DoFn<String, Pair<String, String>>() {
-
- @Override
- public void process(String input, Emitter<Pair<String, String>> emitter) {
- emitter.emit(Pair.of(input.toUpperCase(), input));
-
- }
- }, typeFamily.tableOf(typeFamily.strings(), typeFamily.strings()));
-
- PCollection<String> keys = table.keys();
- PCollection<String> values = table.values();
-
- ArrayList<String> keyList = Lists.newArrayList(keys.materialize().iterator());
- ArrayList<String> valueList = Lists.newArrayList(values.materialize().iterator());
-
- Assert.assertEquals(keyList.size(), valueList.size());
- for (int i = 0; i < keyList.size(); i++) {
- Assert.assertEquals(keyList.get(i), valueList.get(i).toUpperCase());
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/PageRankIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/PageRankIT.java b/crunch/src/it/java/org/apache/crunch/PageRankIT.java
deleted file mode 100644
index 6291ef8..0000000
--- a/crunch/src/it/java/org/apache/crunch/PageRankIT.java
+++ /dev/null
@@ -1,168 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.junit.Assert.assertEquals;
-
-import java.util.Collection;
-import java.util.List;
-
-import org.apache.crunch.impl.mem.MemPipeline;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.lib.Aggregate;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.crunch.types.PTypes;
-import org.apache.crunch.types.avro.AvroTypeFamily;
-import org.apache.crunch.types.avro.Avros;
-import org.apache.crunch.types.writable.WritableTypeFamily;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
-
-public class PageRankIT {
-
- public static class PageRankData {
- public float score;
- public float lastScore;
- public List<String> urls;
-
- public PageRankData() {
- }
-
- public PageRankData(float score, float lastScore, Iterable<String> urls) {
- this.score = score;
- this.lastScore = lastScore;
- this.urls = Lists.newArrayList(urls);
- }
-
- public PageRankData next(float newScore) {
- return new PageRankData(newScore, score, urls);
- }
-
- public float propagatedScore() {
- return score / urls.size();
- }
-
- @Override
- public String toString() {
- return score + " " + lastScore + " " + urls;
- }
- }
-
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
-
- @Test
- public void testAvroReflect() throws Exception {
- PTypeFamily tf = AvroTypeFamily.getInstance();
- PType<PageRankData> prType = Avros.reflects(PageRankData.class);
- String urlInput = tmpDir.copyResourceFileName("urls.txt");
- run(new MRPipeline(PageRankIT.class, tmpDir.getDefaultConfiguration()),
- urlInput, prType, tf);
- }
-
- @Test
- public void testAvroMReflectInMemory() throws Exception {
- PTypeFamily tf = AvroTypeFamily.getInstance();
- PType<PageRankData> prType = Avros.reflects(PageRankData.class);
- String urlInput = tmpDir.copyResourceFileName("urls.txt");
- run(MemPipeline.getInstance(), urlInput, prType, tf);
- }
-
- @Test
- public void testAvroJSON() throws Exception {
- PTypeFamily tf = AvroTypeFamily.getInstance();
- PType<PageRankData> prType = PTypes.jsonString(PageRankData.class, tf);
- String urlInput = tmpDir.copyResourceFileName("urls.txt");
- run(new MRPipeline(PageRankIT.class, tmpDir.getDefaultConfiguration()),
- urlInput, prType, tf);
- }
-
- @Test
- public void testWritablesJSON() throws Exception {
- PTypeFamily tf = WritableTypeFamily.getInstance();
- PType<PageRankData> prType = PTypes.jsonString(PageRankData.class, tf);
- String urlInput = tmpDir.copyResourceFileName("urls.txt");
- run(new MRPipeline(PageRankIT.class, tmpDir.getDefaultConfiguration()),
- urlInput, prType, tf);
- }
-
- public static PTable<String, PageRankData> pageRank(PTable<String, PageRankData> input, final float d) {
- PTypeFamily ptf = input.getTypeFamily();
- PTable<String, Float> outbound = input.parallelDo(new DoFn<Pair<String, PageRankData>, Pair<String, Float>>() {
- @Override
- public void process(Pair<String, PageRankData> input, Emitter<Pair<String, Float>> emitter) {
- PageRankData prd = input.second();
- for (String link : prd.urls) {
- emitter.emit(Pair.of(link, prd.propagatedScore()));
- }
- }
- }, ptf.tableOf(ptf.strings(), ptf.floats()));
-
- return input.cogroup(outbound).parallelDo(
- new MapFn<Pair<String, Pair<Collection<PageRankData>, Collection<Float>>>, Pair<String, PageRankData>>() {
- @Override
- public Pair<String, PageRankData> map(Pair<String, Pair<Collection<PageRankData>, Collection<Float>>> input) {
- PageRankData prd = Iterables.getOnlyElement(input.second().first());
- Collection<Float> propagatedScores = input.second().second();
- float sum = 0.0f;
- for (Float s : propagatedScores) {
- sum += s;
- }
- return Pair.of(input.first(), prd.next(d + (1.0f - d) * sum));
- }
- }, input.getPTableType());
- }
-
- public static void run(Pipeline pipeline, String urlInput,
- PType<PageRankData> prType, PTypeFamily ptf) throws Exception {
- PTable<String, PageRankData> scores = pipeline.readTextFile(urlInput)
- .parallelDo(new MapFn<String, Pair<String, String>>() {
- @Override
- public Pair<String, String> map(String input) {
- String[] urls = input.split("\\t");
- return Pair.of(urls[0], urls[1]);
- }
- }, ptf.tableOf(ptf.strings(), ptf.strings())).groupByKey()
- .parallelDo(new MapFn<Pair<String, Iterable<String>>, Pair<String, PageRankData>>() {
- @Override
- public Pair<String, PageRankData> map(Pair<String, Iterable<String>> input) {
- return Pair.of(input.first(), new PageRankData(1.0f, 0.0f, input.second()));
- }
- }, ptf.tableOf(ptf.strings(), prType));
-
- Float delta = 1.0f;
- while (delta > 0.01) {
- scores = pageRank(scores, 0.5f);
- scores.materialize().iterator(); // force the write
- delta = Aggregate.max(scores.parallelDo(new MapFn<Pair<String, PageRankData>, Float>() {
- @Override
- public Float map(Pair<String, PageRankData> input) {
- PageRankData prd = input.second();
- return Math.abs(prd.score - prd.lastScore);
- }
- }, ptf.floats())).getValue();
- }
- assertEquals(0.0048, delta, 0.001);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/StageResultsCountersIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/StageResultsCountersIT.java b/crunch/src/it/java/org/apache/crunch/StageResultsCountersIT.java
deleted file mode 100644
index 19fc302..0000000
--- a/crunch/src/it/java/org/apache/crunch/StageResultsCountersIT.java
+++ /dev/null
@@ -1,135 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static junit.framework.Assert.assertEquals;
-import static junit.framework.Assert.assertTrue;
-
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.commons.lang.StringUtils;
-import org.apache.crunch.PipelineResult.StageResult;
-import org.apache.crunch.impl.mem.MemPipeline;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.io.From;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.crunch.types.avro.AvroTypeFamily;
-import org.apache.crunch.types.writable.WritableTypeFamily;
-import org.apache.hadoop.mapreduce.Counter;
-import org.junit.After;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.Maps;
-import com.google.common.collect.Sets;
-
-/**
- * Integration test checking that counters incremented inside a {@link DoFn} can be read back
- * from the {@link StageResult}s of a finished pipeline, for both the MapReduce and the in-memory
- * pipeline and for both the Writable and the Avro type family.
- */
-public class StageResultsCountersIT {
-
-  @Rule
-  public TemporaryPath tmpDir = TemporaryPaths.create();
-
-  /** Upper-cased tokens that are counted. */
-  public static HashSet<String> SPECIAL_KEYWORDS = Sets.newHashSet("AND", "OR", "NOT");
-
-  /** Counter group under which one counter per keyword is incremented. */
-  public static String KEYWORDS_COUNTER_GROUP = "KEYWORDS_COUNTER_GROUP";
-
-  @After
-  public void after() {
-    MemPipeline.clearCounters();
-  }
-
-  @Test
-  public void testStageResultsCountersMRWritables() throws Exception {
-    testSpecialKeywordCount(new MRPipeline(StageResultsCountersIT.class, tmpDir.getDefaultConfiguration()),
-        WritableTypeFamily.getInstance());
-  }
-
-  @Test
-  public void testStageResultsCountersMRAvro() throws Exception {
-    testSpecialKeywordCount(new MRPipeline(StageResultsCountersIT.class, tmpDir.getDefaultConfiguration()),
-        AvroTypeFamily.getInstance());
-  }
-
-  @Test
-  public void testStageResultsCountersMemWritables() throws Exception {
-    testSpecialKeywordCount(MemPipeline.getInstance(), WritableTypeFamily.getInstance());
-  }
-
-  @Test
-  public void testStageResultsCountersMemAvro() throws Exception {
-    testSpecialKeywordCount(MemPipeline.getInstance(), AvroTypeFamily.getInstance());
-  }
-
-  /**
-   * Counts the special keywords in {@code shakes.txt} with the given pipeline and verifies the
-   * counter values reported by the stage results.
-   *
-   * @param pipeline pipeline implementation under test
-   * @param tf       type family used for the (empty) output of the counting function
-   * @throws Exception propagated from copying the resource or running the pipeline
-   */
-  public void testSpecialKeywordCount(Pipeline pipeline, PTypeFamily tf) throws Exception {
-
-    String rowsInputPath = tmpDir.copyResourceFileName("shakes.txt");
-
-    PipelineResult result = countSpecialKeywords(pipeline, rowsInputPath, tf);
-
-    assertTrue(result.succeeded());
-
-    Map<String, Long> keywordsMap = countersToMap(result.getStageResults(), KEYWORDS_COUNTER_GROUP);
-
-    // Compare map contents rather than HashMap.toString(): the iteration order of a HashMap is
-    // unspecified, so a string comparison could fail even when every counter is correct.
-    Map<String, Long> expected = Maps.newHashMap();
-    expected.put("NOT", 157L);
-    expected.put("AND", 596L);
-    expected.put("OR", 81L);
-
-    assertEquals(expected, keywordsMap);
-  }
-
-  /**
-   * Reads the text file and bumps one counter per occurrence of a special keyword. Nothing is
-   * emitted; the counters are the only output of interest.
-   *
-   * @return the result of {@code pipeline.done()}
-   */
-  private static PipelineResult countSpecialKeywords(Pipeline pipeline, String inputFileName, PTypeFamily tf) {
-
-    pipeline.read(From.textFile(inputFileName)).parallelDo(new DoFn<String, Void>() {
-
-      @Override
-      public void process(String text, Emitter<Void> emitter) {
-
-        if (!StringUtils.isBlank(text)) {
-
-          String[] tokens = text.toUpperCase().split("\\s");
-
-          for (String token : tokens) {
-            if (SPECIAL_KEYWORDS.contains(token)) {
-              getCounter(KEYWORDS_COUNTER_GROUP, token).increment(1);
-            }
-          }
-        }
-      }
-    }, tf.nulls()).materialize(); // TODO can we avoid the materialize ?
-
-    return pipeline.done();
-  }
-
-  /**
-   * Flattens the counters of one group, across all stages, into a display-name to value map.
-   * If several stages report a counter with the same display name, the value from the last
-   * stage visited wins.
-   */
-  private static Map<String, Long> countersToMap(List<StageResult> stages, String counterGroupName) {
-
-    Map<String, Long> countersMap = Maps.newHashMap();
-
-    for (StageResult sr : stages) {
-      for (Counter counter : sr.getCounters().getGroup(counterGroupName)) {
-        countersMap.put(counter.getDisplayName(), counter.getValue());
-      }
-    }
-
-    return countersMap;
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/TermFrequencyIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/TermFrequencyIT.java b/crunch/src/it/java/org/apache/crunch/TermFrequencyIT.java
deleted file mode 100644
index ca66aa8..0000000
--- a/crunch/src/it/java/org/apache/crunch/TermFrequencyIT.java
+++ /dev/null
@@ -1,135 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.Serializable;
-
-import org.apache.crunch.impl.mem.MemPipeline;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.io.At;
-import org.apache.crunch.io.ReadableSourceTarget;
-import org.apache.crunch.lib.Aggregate;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.crunch.types.writable.WritableTypeFamily;
-import org.junit.Rule;
-import org.junit.Test;
-
-/**
- * Integration test that counts how often each word occurs in each document of {@code docs.txt},
- * optionally re-keying the counts by word, on both the MapReduce and the in-memory pipeline.
- */
-@SuppressWarnings("serial")
-public class TermFrequencyIT implements Serializable {
-  @Rule
-  public transient TemporaryPath tmpDir = TemporaryPaths.create();
-
-  @Test
-  public void testTermFrequencyWithNoTransform() throws IOException {
-    run(new MRPipeline(TermFrequencyIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance(), false);
-  }
-
-  @Test
-  public void testTermFrequencyWithTransform() throws IOException {
-    run(new MRPipeline(TermFrequencyIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance(), true);
-  }
-
-  @Test
-  public void testTermFrequencyNoTransformInMemory() throws IOException {
-    run(MemPipeline.getInstance(), WritableTypeFamily.getInstance(), false);
-  }
-
-  @Test
-  public void testTermFrequencyWithTransformInMemory() throws IOException {
-    run(MemPipeline.getInstance(), WritableTypeFamily.getInstance(), true);
-  }
-
-  /**
-   * Builds the term-frequency table, writes it out and checks that "well" is counted exactly
-   * once for document A.
-   *
-   * @param pipeline    pipeline implementation under test
-   * @param typeFamily  not used by this method; the type family of the collection that was read
-   *                    is used instead
-   * @param transformTF when true, the counts are additionally re-keyed by word and written to a
-   *                    second text output
-   * @throws IOException propagated from copying the resource or reading the output
-   */
-  public void run(Pipeline pipeline, PTypeFamily typeFamily, boolean transformTF) throws IOException {
-    String input = tmpDir.copyResourceFileName("docs.txt");
-    File transformedOutput = tmpDir.getFile("transformed-output");
-    File tfOutput = tmpDir.getFile("tf-output");
-
-    PCollection<String> docs = pipeline.readTextFile(input);
-    PTypeFamily ptf = docs.getTypeFamily();
-
-    // "<title>\t<text>" line  ->  one (lower-cased word, title) pair per non-empty word.
-    PCollection<Pair<String, String>> wordTitlePairs = docs.parallelDo("term document frequency",
-        new DoFn<String, Pair<String, String>>() {
-          @Override
-          public void process(String doc, Emitter<Pair<String, String>> emitter) {
-            String[] titleAndText = doc.split("\t");
-            String title = titleAndText[0];
-            for (String token : titleAndText[1].split("\\W+")) {
-              if (token.length() == 0) {
-                continue;
-              }
-              emitter.emit(Pair.of(token.toLowerCase(), title));
-            }
-          }
-        }, ptf.pairs(ptf.strings(), ptf.strings()));
-
-    // (word, title)  ->  number of times the word occurs in that title.
-    PTable<Pair<String, String>, Long> tf = Aggregate.count(wordTitlePairs);
-
-    if (transformTF) {
-      // ((word, title), count)  ->  (word, (title, count))
-      PTable<String, Pair<String, Long>> wordDocumentCountPair = tf.parallelDo("transform wordDocumentPairCount",
-          new MapFn<Pair<Pair<String, String>, Long>, Pair<String, Pair<String, Long>>>() {
-            @Override
-            public Pair<String, Pair<String, Long>> map(Pair<Pair<String, String>, Long> counted) {
-              String word = counted.first().first();
-              String title = counted.first().second();
-              return Pair.of(word, Pair.of(title, counted.second()));
-            }
-          }, ptf.tableOf(ptf.strings(), ptf.pairs(ptf.strings(), ptf.longs())));
-
-      pipeline.writeTextFile(wordDocumentCountPair, transformedOutput.getAbsolutePath());
-    }
-
-    SourceTarget<String> st = At.textFile(tfOutput.getAbsolutePath());
-    pipeline.write(tf, st);
-
-    pipeline.run();
-
-    // A count of 0 for (well, A) is an immediate failure; a count of 1 is the line we need.
-    boolean sawExpectedCount = false;
-    for (String line : ((ReadableSourceTarget<String>) st).read(pipeline.getConfiguration())) {
-      if ("[well,A]\t0".equals(line)) {
-        fail("Found " + line + " but well is in Document A 1 time");
-      } else if ("[well,A]\t1".equals(line)) {
-        sawExpectedCount = true;
-      }
-    }
-    assertTrue(sawExpectedCount);
-    pipeline.done();
-  }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/TextPairIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/TextPairIT.java b/crunch/src/it/java/org/apache/crunch/TextPairIT.java
deleted file mode 100644
index 55d9af9..0000000
--- a/crunch/src/it/java/org/apache/crunch/TextPairIT.java
+++ /dev/null
@@ -1,72 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.junit.Assert.assertTrue;
-
-import java.io.IOException;
-
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.io.From;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.writable.Writables;
-import org.junit.Rule;
-import org.junit.Test;
-
-/**
- * Integration test that pairs every word of {@code shakes.txt} with a fixed marker string and
- * checks that the marker is still present after the pairs are materialized.
- */
-public class TextPairIT {
-  /** Marker used as the first element of every emitted pair. */
-  private static final String CANARY = "Writables.STRING_TO_TEXT";
-
-  @Rule
-  public TemporaryPath tmpDir = TemporaryPaths.create();
-
-  @Test
-  public void testWritables() throws IOException {
-    run(new MRPipeline(TextPairIT.class, tmpDir.getDefaultConfiguration()));
-  }
-
-  /**
-   * Splits each line on non-word characters and emits a (CANARY, word) pair for every non-empty
-   * word.
-   *
-   * @param words lines of text to split
-   * @return the collection of (CANARY, word) pairs, typed with Writable string pairs
-   */
-  public static PCollection<Pair<String, String>> wordDuplicate(PCollection<String> words) {
-    DoFn<String, Pair<String, String>> duplicator = new DoFn<String, Pair<String, String>>() {
-      @Override
-      public void process(String line, Emitter<Pair<String, String>> emitter) {
-        for (String token : line.split("\\W+")) {
-          if (token.length() == 0) {
-            continue;
-          }
-          emitter.emit(Pair.of(CANARY, token));
-        }
-      }
-    };
-    return words.parallelDo("my word duplicator", duplicator,
-        Writables.pairs(Writables.strings(), Writables.strings()));
-  }
-
-  /**
-   * Materializes the duplicated words and passes as soon as one pair whose first element
-   * contains the marker is seen.
-   *
-   * @param pipeline pipeline implementation under test
-   * @throws IOException propagated from copying the resource file
-   */
-  public void run(Pipeline pipeline) throws IOException {
-    String input = tmpDir.copyResourceFileName("shakes.txt");
-    PCollection<String> shakespeare = pipeline.read(From.textFile(input));
-
-    boolean canarySeen = false;
-    for (Pair<String, String> entry : pipeline.materialize(wordDuplicate(shakespeare))) {
-      if (entry.first().contains(CANARY)) {
-        canarySeen = true;
-        break;
-      }
-    }
-
-    pipeline.done();
-    assertTrue(canarySeen);
-  }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/TfIdfIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/TfIdfIT.java b/crunch/src/it/java/org/apache/crunch/TfIdfIT.java
deleted file mode 100644
index 218f538..0000000
--- a/crunch/src/it/java/org/apache/crunch/TfIdfIT.java
+++ /dev/null
@@ -1,224 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.junit.Assert.assertTrue;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.Serializable;
-import java.nio.charset.Charset;
-import java.util.Collection;
-import java.util.List;
-
-import org.apache.crunch.fn.MapKeysFn;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.io.seq.SeqFileSourceTarget;
-import org.apache.crunch.lib.Aggregate;
-import org.apache.crunch.lib.Join;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.crunch.types.writable.WritableTypeFamily;
-import org.apache.hadoop.fs.Path;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-import com.google.common.io.Files;
-
-/**
- * Integration test that computes TF-IDF scores for the words of {@code docs.txt} and checks the
- * score of "the" in document B, both for the original and for an upper-cased copy of the result.
- */
-@SuppressWarnings("serial")
-public class TfIdfIT implements Serializable {
-  @Rule
-  public transient TemporaryPath tmpDir = TemporaryPaths.create();
-
-  // total number of documents, should calculate
-  protected static final double N = 2;
-
-  @Test
-  public void testWritablesSingleRun() throws IOException {
-    run(new MRPipeline(TfIdfIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance(), true);
-  }
-
-  @Test
-  public void testWritablesMultiRun() throws IOException {
-    run(new MRPipeline(TfIdfIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance(), false);
-  }
-
-  /**
-   * This method should generate a TF-IDF score for the input.
-   *
-   * @param docs         lines of the form {@code <title>\t<text>}
-   * @param termFreqPath location the intermediate term-frequency table is written to as a
-   *                     sequence file
-   * @param ptf          type family used for every intermediate and final type
-   * @return a table from word to the collection of (title, tf-idf) pairs for that word
-   * @throws IOException declared for callers; not thrown directly by the visible code
-   */
-  public PTable<String, Collection<Pair<String, Double>>> generateTFIDF(PCollection<String> docs, Path termFreqPath,
-      PTypeFamily ptf) throws IOException {
-
-    /*
-     * Input: String Input title text
-     *
-     * Output: PTable<Pair<String, String>, Long> Pair<Pair<word, title>, count
-     * in title>
-     */
-    PTable<Pair<String, String>, Long> tf = Aggregate.count(docs.parallelDo("term document frequency",
-        new DoFn<String, Pair<String, String>>() {
-          @Override
-          public void process(String doc, Emitter<Pair<String, String>> emitter) {
-            String[] kv = doc.split("\t");
-            String title = kv[0];
-            String text = kv[1];
-            for (String word : text.split("\\W+")) {
-              if (word.length() > 0) {
-                Pair<String, String> pair = Pair.of(word.toLowerCase(), title);
-                emitter.emit(pair);
-              }
-            }
-          }
-        }, ptf.pairs(ptf.strings(), ptf.strings())));
-
-    tf.write(new SeqFileSourceTarget<Pair<Pair<String, String>, Long>>(termFreqPath, tf.getPType()));
-
-    /*
-     * Input: Pair<Pair<String, String>, Long> Pair<Pair<word, title>, count in
-     * title>
-     *
-     * Output: PTable<String, Long> PTable<word, # of docs containing word>
-     */
-    PTable<String, Long> n = Aggregate.count(tf.parallelDo("little n (# of docs contain word)",
-        new DoFn<Pair<Pair<String, String>, Long>, String>() {
-          @Override
-          public void process(Pair<Pair<String, String>, Long> input, Emitter<String> emitter) {
-            emitter.emit(input.first().first());
-          }
-        }, ptf.strings()));
-
-    /*
-     * Input: Pair<Pair<String, String>, Long> Pair<Pair<word, title>, count in
-     * title>
-     *
-     * Output: PTable<String, Collection<Pair<String, Long>>> PTable<word,
-     * Collection<Pair<title, count in title>>>
-     */
-    PTable<String, Collection<Pair<String, Long>>> wordDocumentCountPair = tf.parallelDo(
-        "transform wordDocumentPairCount",
-        new DoFn<Pair<Pair<String, String>, Long>, Pair<String, Collection<Pair<String, Long>>>>() {
-          // Entries collected for the word currently being processed; null when nothing is pending.
-          Collection<Pair<String, Long>> buffer;
-          // Word the buffer belongs to.
-          String key;
-
-          @Override
-          public void process(Pair<Pair<String, String>, Long> input,
-              Emitter<Pair<String, Collection<Pair<String, Long>>>> emitter) {
-            Pair<String, String> wordDocumentPair = input.first();
-            // A change of word closes the previous buffer. NOTE(review): this presumably relies
-            // on records for the same word arriving consecutively - confirm against the
-            // ordering of the count output.
-            if (!wordDocumentPair.first().equals(key)) {
-              flush(emitter);
-              key = wordDocumentPair.first();
-              buffer = Lists.newArrayList();
-            }
-            buffer.add(Pair.of(wordDocumentPair.second(), input.second()));
-          }
-
-          protected void flush(Emitter<Pair<String, Collection<Pair<String, Long>>>> emitter) {
-            if (buffer != null) {
-              emitter.emit(Pair.of(key, buffer));
-              buffer = null;
-            }
-          }
-
-          @Override
-          public void cleanup(Emitter<Pair<String, Collection<Pair<String, Long>>>> emitter) {
-            // Emit whatever is still buffered for the last word seen.
-            flush(emitter);
-          }
-        }, ptf.tableOf(ptf.strings(), ptf.collections(ptf.pairs(ptf.strings(), ptf.longs()))));
-
-    PTable<String, Pair<Long, Collection<Pair<String, Long>>>> joinedResults = Join.join(n, wordDocumentCountPair);
-
-    /*
-     * Input: Pair<String, Pair<Long, Collection<Pair<String, Long>>> Pair<word,
-     * Pair<# of docs containing word, Collection<Pair<title, term frequency>>>
-     *
-     * Output: Pair<String, Collection<Pair<String, Double>>> Pair<word,
-     * Collection<Pair<title, tfidf>>>
-     */
-    return joinedResults
-        .parallelDo(
-            "calculate tfidf",
-            new MapFn<Pair<String, Pair<Long, Collection<Pair<String, Long>>>>, Pair<String, Collection<Pair<String, Double>>>>() {
-              @Override
-              public Pair<String, Collection<Pair<String, Double>>> map(
-                  Pair<String, Pair<Long, Collection<Pair<String, Long>>>> input) {
-                Collection<Pair<String, Double>> tfidfs = Lists.newArrayList();
-                String word = input.first();
-                // Locals are named so they no longer shadow the enclosing method's n and tf tables.
-                double docsWithWord = input.second().first();
-                double idf = Math.log(N / docsWithWord);
-                for (Pair<String, Long> titleCount : input.second().second()) {
-                  double tfidf = titleCount.second() * idf;
-                  tfidfs.add(Pair.of(titleCount.first(), tfidf));
-                }
-                return Pair.of(word, tfidfs);
-              }
-
-            }, ptf.tableOf(ptf.strings(), ptf.collections(ptf.pairs(ptf.strings(), ptf.doubles()))));
-  }
-
-  /**
-   * Writes the TF-IDF table and an upper-cased-key copy of it as text, then verifies both.
-   *
-   * @param pipeline   pipeline implementation under test
-   * @param typeFamily type family handed to {@link #generateTFIDF}
-   * @param singleRun  when false, the pipeline is run once before the upper-cased copy is
-   *                   defined; when true everything executes in the final {@code done()}
-   * @throws IOException propagated from copying the resource or reading the output files
-   */
-  public void run(Pipeline pipeline, PTypeFamily typeFamily, boolean singleRun) throws IOException {
-    String inputFile = tmpDir.copyResourceFileName("docs.txt");
-    String outputPath1 = tmpDir.getFileName("output1");
-    String outputPath2 = tmpDir.getFileName("output2");
-
-    Path tfPath = tmpDir.getPath("termfreq");
-
-    PCollection<String> docs = pipeline.readTextFile(inputFile);
-
-    PTable<String, Collection<Pair<String, Double>>> results = generateTFIDF(docs, tfPath, typeFamily);
-    pipeline.writeTextFile(results, outputPath1);
-    if (!singleRun) {
-      pipeline.run();
-    }
-
-    PTable<String, Collection<Pair<String, Double>>> uppercased = results.parallelDo(
-        new MapKeysFn<String, String, Collection<Pair<String, Double>>>() {
-          @Override
-          public String map(String k1) {
-            return k1.toUpperCase();
-          }
-        }, results.getPTableType());
-    pipeline.writeTextFile(uppercased, outputPath2);
-    pipeline.done();
-
-    // Check the lowercase version...
-    assertOutputContains(outputPath1, "[the", "B,0.6931471805599453");
-
-    // ...and the uppercase version
-    assertOutputContains(outputPath2, "[THE", "B,0.6931471805599453");
-  }
-
-  /**
-   * Asserts that {@code part-r-00000} under the given directory has at least one line that
-   * starts with {@code linePrefix} and contains {@code fragment}.
-   */
-  private static void assertOutputContains(String outputDir, String linePrefix, String fragment)
-      throws IOException {
-    File outputFile = new File(outputDir, "part-r-00000");
-    List<String> lines = Files.readLines(outputFile, Charset.defaultCharset());
-    boolean found = false;
-    for (String line : lines) {
-      if (line.startsWith(linePrefix) && line.contains(fragment)) {
-        found = true;
-        break;
-      }
-    }
-    assertTrue("No line starting with '" + linePrefix + "' and containing '" + fragment + "' in " + outputFile,
-        found);
-  }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/TupleNClassCastBugIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/TupleNClassCastBugIT.java b/crunch/src/it/java/org/apache/crunch/TupleNClassCastBugIT.java
deleted file mode 100644
index e49f4d5..0000000
--- a/crunch/src/it/java/org/apache/crunch/TupleNClassCastBugIT.java
+++ /dev/null
@@ -1,95 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.File;
-import java.io.IOException;
-import java.nio.charset.Charset;
-import java.util.List;
-
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.crunch.types.avro.AvroTypeFamily;
-import org.apache.crunch.types.writable.WritableTypeFamily;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.io.Files;
-
-
-/**
- * Regression test for a ClassCastException raised during the reduce phase when {@link TupleN}
- * values pass through a group-by. The output is only checked for its line count.
- */
-public class TupleNClassCastBugIT {
-  @Rule
-  public TemporaryPath tmpDir = TemporaryPaths.create();
-
-  /**
-   * Keys every {@code <docId>\t<docLine>} input line by its doc id with a (docId, docLine)
-   * tuple as value, groups by key and emits every tuple of every group unchanged.
-   *
-   * @param lines tab-separated input lines
-   * @param ptf   type family used for the table and tuple types
-   * @return all tuples after the group-by round trip
-   */
-  public static PCollection<TupleN> mapGroupDo(PCollection<String> lines, PTypeFamily ptf) {
-    PTable<String, TupleN> mapped = lines.parallelDo(new MapFn<String, Pair<String, TupleN>>() {
-
-      @Override
-      public Pair<String, TupleN> map(String line) {
-        String[] columns = line.split("\\t");
-        String docId = columns[0];
-        String docLine = columns[1];
-        return Pair.of(docId, new TupleN(docId, docLine));
-      }
-    }, ptf.tableOf(ptf.strings(), ptf.tuples(ptf.strings(), ptf.strings())));
-    return mapped.groupByKey().parallelDo(new DoFn<Pair<String, Iterable<TupleN>>, TupleN>() {
-      @Override
-      public void process(Pair<String, Iterable<TupleN>> input, Emitter<TupleN> tupleNEmitter) {
-        for (TupleN tuple : input.second()) {
-          tupleNEmitter.emit(tuple);
-        }
-      }
-    }, ptf.tuples(ptf.strings(), ptf.strings()));
-  }
-
-  @Test
-  public void testWritables() throws IOException {
-    run(new MRPipeline(TupleNClassCastBugIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance());
-  }
-
-  @Test
-  public void testAvro() throws IOException {
-    run(new MRPipeline(TupleNClassCastBugIT.class, tmpDir.getDefaultConfiguration()), AvroTypeFamily.getInstance());
-  }
-
-  /**
-   * Runs {@link #mapGroupDo} over {@code docs.txt} and checks the size of the text output.
-   *
-   * @param pipeline   pipeline implementation under test
-   * @param typeFamily type family handed to {@link #mapGroupDo}
-   * @throws IOException propagated from copying the resource or reading the output file
-   */
-  public void run(Pipeline pipeline, PTypeFamily typeFamily) throws IOException {
-    String inputPath = tmpDir.copyResourceFileName("docs.txt");
-    String outputPath = tmpDir.getFileName("output");
-
-    PCollection<String> docLines = pipeline.readTextFile(inputPath);
-    pipeline.writeTextFile(mapGroupDo(docLines, typeFamily), outputPath);
-    pipeline.done();
-
-    // *** We are not directly testing the output, we are looking for a
-    // ClassCastException
-    // *** which is thrown in a different thread during the reduce phase. If all
-    // is well
-    // *** the file will exist and have six lines. Otherwise the bug is present.
-    File outputFile = new File(outputPath, "part-r-00000");
-    List<String> lines = Files.readLines(outputFile, Charset.defaultCharset());
-    // The list already knows its size; no need to count by iterating with an unused variable.
-    assertEquals(6, lines.size());
-  }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/UnionFromSameSourceIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/UnionFromSameSourceIT.java b/crunch/src/it/java/org/apache/crunch/UnionFromSameSourceIT.java
deleted file mode 100644
index 501a944..0000000
--- a/crunch/src/it/java/org/apache/crunch/UnionFromSameSourceIT.java
+++ /dev/null
@@ -1,132 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.IOException;
-
-import org.apache.crunch.fn.IdentityFn;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.writable.Writables;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-
-/**
- * Tests that build unions in which the same PCollection (or the same input file) takes part more
- * than once, and verify the number of elements that come out.
- */
-public class UnionFromSameSourceIT {
-
-  /** Number of elements each test expects a single read of {@code set1.txt} to contribute. */
-  private static final int NUM_ELEMENTS = 4;
-
-  @Rule
-  public transient TemporaryPath tmpDir = TemporaryPaths.create();
-
-  private Pipeline pipeline;
-  private PType<String> elementType = Writables.strings();
-  private PTableType<String, String> tableType = Writables.tableOf(Writables.strings(),
-      Writables.strings());
-
-  @Before
-  public void setUp() {
-    pipeline = new MRPipeline(UnionFromSameSourceIT.class, tmpDir.getDefaultConfiguration());
-  }
-
-  @Test
-  public void testUnion_SingleRead() throws IOException {
-    PCollection<String> strings = readSet1();
-    PCollection<String> copy = strings.parallelDo(IdentityFn.<String> getInstance(), strings.getPType());
-
-    assertEquals(NUM_ELEMENTS * 2, getCount(strings.union(copy)));
-  }
-
-  @Test
-  public void testUnion_TwoReads() throws IOException {
-    PCollection<String> stringsA = readSet1();
-    PCollection<String> stringsB = readSet1();
-
-    assertEquals(NUM_ELEMENTS * 2, getCount(stringsA.union(stringsB)));
-  }
-
-  @Test
-  public void testDoubleUnion_EndingWithGBK() throws IOException {
-    runDoubleUnionPipeline(true);
-  }
-
-  @Test
-  public void testDoubleUnion_EndingWithoutGBK() throws IOException {
-    runDoubleUnionPipeline(false);
-  }
-
-  /** Copies {@code set1.txt} into the temporary directory and reads it as a text collection. */
-  private PCollection<String> readSet1() throws IOException {
-    return pipeline.readTextFile(tmpDir.copyResourceFileName("set1.txt"));
-  }
-
-  /**
-   * Unions two tables derived from the same input, groups and ungroups them, unions the result
-   * with the input once more and expects three times the input size.
-   *
-   * @param endWithGBK when true, the final table goes through one more groupByKey/ungroup
-   */
-  private void runDoubleUnionPipeline(boolean endWithGBK) throws IOException {
-    PCollection<String> strings = readSet1();
-    PTable<String, String> tableA = strings.parallelDo("to table A", new ToTableFn(), tableType);
-    PTable<String, String> tableB = strings.parallelDo("to table B", new ToTableFn(), tableType);
-
-    PGroupedTable<String, String> groupedTable = tableA.union(tableB).groupByKey();
-    PCollection<String> flattened = groupedTable.parallelDo("ungroup before union",
-        new FromGroupedTableFn(), elementType);
-    PCollection<String> passthrough = strings.parallelDo("fake id", IdentityFn.<String> getInstance(),
-        elementType);
-    PCollection<String> ungrouped = flattened.union(passthrough);
-
-    PTable<String, String> table = ungrouped.parallelDo("union back to table", new ToTableFn(),
-        tableType);
-
-    PTable<String, String> result = endWithGBK ? table.groupByKey().ungroup() : table;
-
-    assertEquals(3 * NUM_ELEMENTS, getCount(result));
-  }
-
-  /** Materializes the collection and returns how many elements it yields. */
-  private int getCount(PCollection<?> pcollection) {
-    int total = 0;
-    for (Object ignored : pcollection.materialize()) {
-      total += 1;
-    }
-    return total;
-  }
-
-  /** Maps a string to a pair that uses it as both key and value. */
-  private static class ToTableFn extends MapFn<String, Pair<String, String>> {
-
-    @Override
-    public Pair<String, String> map(String input) {
-      return Pair.of(input, input);
-    }
-
-  }
-
-  /** Emits every value of a grouped entry and drops the key. */
-  private static class FromGroupedTableFn extends DoFn<Pair<String, Iterable<String>>, String> {
-
-    @Override
-    public void process(Pair<String, Iterable<String>> input, Emitter<String> emitter) {
-      for (String value : input.second()) {
-        emitter.emit(value);
-      }
-    }
-
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/UnionIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/UnionIT.java b/crunch/src/it/java/org/apache/crunch/UnionIT.java
deleted file mode 100644
index 1c60a1b..0000000
--- a/crunch/src/it/java/org/apache/crunch/UnionIT.java
+++ /dev/null
@@ -1,136 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.hamcrest.Matchers.is;
-import static org.junit.Assert.assertThat;
-
-import java.io.IOException;
-import java.util.Map;
-
-import org.apache.crunch.fn.Aggregators;
-import org.apache.crunch.fn.IdentityFn;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.test.Tests;
-import org.apache.crunch.types.avro.Avros;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.ImmutableMap;
-import com.google.common.collect.ImmutableMultiset;
-
-
-public class UnionIT {
-
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
- private MRPipeline pipeline;
- private PCollection<String> words1;
- private PCollection<String> words2;
-
- @Before
- public void setUp() throws IOException {
- pipeline = new MRPipeline(UnionIT.class, tmpDir.getDefaultConfiguration());
- words1 = pipeline.readTextFile(tmpDir.copyResourceFileName(Tests.resource(this, "src1.txt")));
- words2 = pipeline.readTextFile(tmpDir.copyResourceFileName(Tests.resource(this, "src2.txt")));
- }
-
- @After
- public void tearDown() {
- pipeline.done();
- }
-
- @Test
- public void testUnion() throws Exception {
- IdentityFn<String> identity = IdentityFn.getInstance();
- words1 = words1.parallelDo(identity, Avros.strings());
- words2 = words2.parallelDo(identity, Avros.strings());
-
- PCollection<String> union = words1.union(words2);
-
- ImmutableMultiset<String> actual = ImmutableMultiset.copyOf(union.materialize());
- assertThat(actual.elementSet().size(), is(3));
- assertThat(actual.count("a1"), is(4));
- assertThat(actual.count("b2"), is(2));
- assertThat(actual.count("c3"), is(2));
- }
-
- @Test
- public void testTableUnion() throws IOException {
- PTable<String, String> words1ByFirstLetter = byFirstLetter(words1);
- PTable<String, String> words2ByFirstLetter = byFirstLetter(words2);
-
- PTable<String, String> union = words1ByFirstLetter.union(words2ByFirstLetter);
-
- ImmutableMultiset<Pair<String, String>> actual = ImmutableMultiset.copyOf(union.materialize());
-
- assertThat(actual.elementSet().size(), is(3));
- assertThat(actual.count(Pair.of("a", "1")), is(4));
- assertThat(actual.count(Pair.of("b", "2")), is(2));
- assertThat(actual.count(Pair.of("c", "3")), is(2));
- }
-
- @Test
- public void testUnionThenGroupByKey() throws IOException {
- PCollection<String> union = words1.union(words2);
-
- PGroupedTable<String, String> grouped = byFirstLetter(union).groupByKey();
-
- Map<String, String> actual = grouped.combineValues(Aggregators.STRING_CONCAT("", true))
- .materializeToMap();
-
- Map<String, String> expected = ImmutableMap.of("a", "1111", "b", "22", "c", "33");
- assertThat(actual, is(expected));
- }
-
- @Test
- public void testTableUnionThenGroupByKey() throws IOException {
- PTable<String, String> words1ByFirstLetter = byFirstLetter(words1);
- PTable<String, String> words2ByFirstLetter = byFirstLetter(words2);
-
- PTable<String, String> union = words1ByFirstLetter.union(words2ByFirstLetter);
-
- PGroupedTable<String, String> grouped = union.groupByKey();
-
- Map<String, String> actual = grouped.combineValues(Aggregators.STRING_CONCAT("", true))
- .materializeToMap();
-
- Map<String, String> expected = ImmutableMap.of("a", "1111", "b", "22", "c", "33");
- assertThat(actual, is(expected));
- }
-
-
- private static PTable<String, String> byFirstLetter(PCollection<String> values) {
- return values.parallelDo("byFirstLetter", new FirstLetterKeyFn(),
- Avros.tableOf(Avros.strings(), Avros.strings()));
- }
-
- private static class FirstLetterKeyFn extends DoFn<String, Pair<String, String>> {
- @Override
- public void process(String input, Emitter<Pair<String, String>> emitter) {
- if (input.length() > 1) {
- emitter.emit(Pair.of(input.substring(0, 1), input.substring(1)));
- }
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/UnionResultsIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/UnionResultsIT.java b/crunch/src/it/java/org/apache/crunch/UnionResultsIT.java
deleted file mode 100644
index df0511a..0000000
--- a/crunch/src/it/java/org/apache/crunch/UnionResultsIT.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.IOException;
-import java.io.Serializable;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.io.At;
-import org.apache.crunch.test.CrunchTestSupport;
-import org.apache.crunch.types.writable.Writables;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-import com.google.common.collect.Sets;
-
-public class UnionResultsIT extends CrunchTestSupport implements Serializable {
-
- static class StringLengthMapFn extends MapFn<String, Pair<String, Long>> {
-
- @Override
- public Pair<String, Long> map(String input) {
- return new Pair<String, Long>(input, 10L);
- }
- }
-
-
- /**
- * Tests combining a GBK output with a map-only job output into a single
- * unioned collection.
- */
- @Test
- public void testUnionOfGroupedOutputAndNonGroupedOutput() throws IOException {
- String inputPath = tempDir.copyResourceFileName("set1.txt");
- String inputPath2 = tempDir.copyResourceFileName("set2.txt");
-
- Pipeline pipeline = new MRPipeline(UnionResultsIT.class);
-
- PCollection<String> set1Lines = pipeline.read(At.textFile(inputPath, Writables.strings()));
- PCollection<Pair<String, Long>> set1Lengths = set1Lines.parallelDo(new StringLengthMapFn(),
- Writables.pairs(Writables.strings(), Writables.longs()));
- PCollection<Pair<String, Long>> set2Counts = pipeline.read(At.textFile(inputPath2, Writables.strings())).count();
-
- PCollection<Pair<String, Long>> union = set1Lengths.union(set2Counts);
-
- List<Pair<String, Long>> unionValues = Lists.newArrayList(union.materialize());
- assertEquals(7, unionValues.size());
-
- Set<Pair<String, Long>> expectedPairs = Sets.newHashSet();
- expectedPairs.add(Pair.of("b", 10L));
- expectedPairs.add(Pair.of("c", 10L));
- expectedPairs.add(Pair.of("a", 10L));
- expectedPairs.add(Pair.of("e", 10L));
- expectedPairs.add(Pair.of("a", 1L));
- expectedPairs.add(Pair.of("c", 1L));
- expectedPairs.add(Pair.of("d", 1L));
-
- assertEquals(expectedPairs, Sets.newHashSet(unionValues));
-
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/WordCountIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/WordCountIT.java b/crunch/src/it/java/org/apache/crunch/WordCountIT.java
deleted file mode 100644
index c646663..0000000
--- a/crunch/src/it/java/org/apache/crunch/WordCountIT.java
+++ /dev/null
@@ -1,171 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import java.io.File;
-import java.io.IOException;
-import java.nio.charset.Charset;
-import java.util.List;
-
-import org.apache.crunch.fn.Aggregators;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.io.At;
-import org.apache.crunch.io.To;
-import org.apache.crunch.lib.Aggregate;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.crunch.types.avro.AvroTypeFamily;
-import org.apache.crunch.types.writable.WritableTypeFamily;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Lists;
-import com.google.common.io.Files;
-
-public class WordCountIT {
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
-
- enum WordCountStats {
- ANDS
- };
-
- public static PTable<String, Long> wordCount(PCollection<String> words, PTypeFamily typeFamily) {
- return Aggregate.count(words.parallelDo(new DoFn<String, String>() {
-
- @Override
- public void process(String line, Emitter<String> emitter) {
- for (String word : line.split("\\s+")) {
- emitter.emit(word);
- if ("and".equals(word)) {
- increment(WordCountStats.ANDS);
- }
- }
- }
- }, typeFamily.strings()));
- }
-
- public static PTable<String, Long> substr(PTable<String, Long> ptable) {
- return ptable.parallelDo(new DoFn<Pair<String, Long>, Pair<String, Long>>() {
-
- public void process(Pair<String, Long> input, Emitter<Pair<String, Long>> emitter) {
- if (input.first().length() > 0) {
- emitter.emit(Pair.of(input.first().substring(0, 1), input.second()));
- }
- }
- }, ptable.getPTableType());
- }
-
- private boolean runSecond = false;
- private boolean useToOutput = false;
-
- @Test
- public void testWritables() throws IOException {
- run(new MRPipeline(WordCountIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance());
- }
-
- @Test
- public void testWritablesWithSecond() throws IOException {
- runSecond = true;
- run(new MRPipeline(WordCountIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance());
- }
-
- @Test
- public void testWritablesWithSecondUseToOutput() throws IOException {
- runSecond = true;
- useToOutput = true;
- run(new MRPipeline(WordCountIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance());
- }
-
- @Test
- public void testAvro() throws IOException {
- run(new MRPipeline(WordCountIT.class, tmpDir.getDefaultConfiguration()), AvroTypeFamily.getInstance());
- }
-
- @Test
- public void testAvroWithSecond() throws IOException {
- runSecond = true;
- run(new MRPipeline(WordCountIT.class, tmpDir.getDefaultConfiguration()), AvroTypeFamily.getInstance());
- }
-
- @Test
- public void testWithTopWritable() throws IOException {
- runWithTop(WritableTypeFamily.getInstance());
- }
-
- @Test
- public void testWithTopAvro() throws IOException {
- runWithTop(AvroTypeFamily.getInstance());
- }
-
- public void runWithTop(PTypeFamily tf) throws IOException {
- Pipeline pipeline = new MRPipeline(WordCountIT.class, tmpDir.getDefaultConfiguration());
- String inputPath = tmpDir.copyResourceFileName("shakes.txt");
-
- PCollection<String> shakespeare = pipeline.read(At.textFile(inputPath, tf.strings()));
- PTable<String, Long> wordCount = wordCount(shakespeare, tf);
- List<Pair<String, Long>> top5 = Lists.newArrayList(Aggregate.top(wordCount, 5, true).materialize());
- assertEquals(
- ImmutableList.of(Pair.of("", 1470L), Pair.of("the", 620L), Pair.of("and", 427L), Pair.of("of", 396L),
- Pair.of("to", 367L)), top5);
- }
-
- public void run(Pipeline pipeline, PTypeFamily typeFamily) throws IOException {
- String inputPath = tmpDir.copyResourceFileName("shakes.txt");
- String outputPath = tmpDir.getFileName("output");
-
- PCollection<String> shakespeare = pipeline.read(At.textFile(inputPath, typeFamily.strings()));
- PTable<String, Long> wordCount = wordCount(shakespeare, typeFamily);
- if (useToOutput) {
- wordCount.write(To.textFile(outputPath));
- } else {
- pipeline.writeTextFile(wordCount, outputPath);
- }
-
- if (runSecond) {
- String substrPath = tmpDir.getFileName("substr");
- PTable<String, Long> we = substr(wordCount).groupByKey().combineValues(Aggregators.SUM_LONGS());
- pipeline.writeTextFile(we, substrPath);
- }
- PipelineResult res = pipeline.done();
- assertTrue(res.succeeded());
- List<PipelineResult.StageResult> stageResults = res.getStageResults();
- if (runSecond) {
- assertEquals(2, stageResults.size());
- } else {
- assertEquals(1, stageResults.size());
- assertEquals(427, stageResults.get(0).getCounterValue(WordCountStats.ANDS));
- }
-
- File outputFile = new File(outputPath, "part-r-00000");
- List<String> lines = Files.readLines(outputFile, Charset.defaultCharset());
- boolean passed = false;
- for (String line : lines) {
- if (line.startsWith("Macbeth\t28") || line.startsWith("[Macbeth,28]")) {
- passed = true;
- break;
- }
- }
- assertTrue(passed);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/fn/AggregatorsIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/fn/AggregatorsIT.java b/crunch/src/it/java/org/apache/crunch/fn/AggregatorsIT.java
deleted file mode 100644
index c9584a1..0000000
--- a/crunch/src/it/java/org/apache/crunch/fn/AggregatorsIT.java
+++ /dev/null
@@ -1,83 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.fn;
-
-import static org.apache.crunch.fn.Aggregators.SUM_INTS;
-import static org.apache.crunch.fn.Aggregators.pairAggregator;
-import static org.apache.crunch.types.writable.Writables.ints;
-import static org.apache.crunch.types.writable.Writables.pairs;
-import static org.apache.crunch.types.writable.Writables.strings;
-import static org.apache.crunch.types.writable.Writables.tableOf;
-import static org.hamcrest.Matchers.is;
-import static org.junit.Assert.assertThat;
-
-import java.util.Collection;
-import java.util.Map;
-
-import org.apache.crunch.MapFn;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.test.Tests;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
-import org.junit.runners.Parameterized.Parameters;
-
-
-@RunWith(Parameterized.class)
-public class AggregatorsIT {
- private Pipeline pipeline;
-
- @Parameters
- public static Collection<Object[]> params() {
- return Tests.pipelinesParams(AggregatorsIT.class);
- }
-
- public AggregatorsIT(Pipeline pipeline) {
- this.pipeline = pipeline;
- }
-
- @Test
- public void testPairAggregator() {
- PCollection<String> lines = pipeline.readTextFile(Tests.pathTo(this, "ints.txt"));
-
- PTable<String, Pair<Integer, Integer>> table = lines.parallelDo(new SplitLine(),
- tableOf(strings(), pairs(ints(), ints())));
-
- PTable<String, Pair<Integer, Integer>> combinedTable = table.groupByKey().combineValues(
- pairAggregator(SUM_INTS(), SUM_INTS()));
-
- Map<String, Pair<Integer, Integer>> result = combinedTable.asMap().getValue();
-
- assertThat(result.size(), is(2));
- assertThat(result.get("a"), is(Pair.of(9, 12)));
- assertThat(result.get("b"), is(Pair.of(11, 13)));
- }
-
- private static final class SplitLine extends MapFn<String, Pair<String, Pair<Integer, Integer>>> {
- @Override
- public Pair<String, Pair<Integer, Integer>> map(String input) {
- String[] split = input.split("\t");
- return Pair.of(split[0],
- Pair.of(Integer.parseInt(split[1]), Integer.parseInt(split[2])));
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/impl/mem/MemPipelineFileWritingIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/impl/mem/MemPipelineFileWritingIT.java b/crunch/src/it/java/org/apache/crunch/impl/mem/MemPipelineFileWritingIT.java
deleted file mode 100644
index 976a43e..0000000
--- a/crunch/src/it/java/org/apache/crunch/impl/mem/MemPipelineFileWritingIT.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mem;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import java.io.File;
-import java.util.List;
-
-import org.apache.crunch.PCollection;
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.base.Charsets;
-import com.google.common.collect.ImmutableList;
-import com.google.common.io.Files;
-
-public class MemPipelineFileWritingIT {
- @Rule
- public TemporaryPath baseTmpDir = TemporaryPaths.create();
-
- @Test
- public void testMemPipelineFileWriter() throws Exception {
- File tmpDir = baseTmpDir.getFile("mempipe");
- Pipeline p = MemPipeline.getInstance();
- PCollection<String> lines = MemPipeline.collectionOf("hello", "world");
- p.writeTextFile(lines, tmpDir.toString());
- p.done();
- assertTrue(tmpDir.exists());
- File[] files = tmpDir.listFiles();
- assertTrue(files != null && files.length > 0);
- for (File f : files) {
- if (!f.getName().startsWith(".")) {
- List<String> txt = Files.readLines(f, Charsets.UTF_8);
- assertEquals(ImmutableList.of("hello", "world"), txt);
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/impl/mr/collect/UnionCollectionIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/impl/mr/collect/UnionCollectionIT.java b/crunch/src/it/java/org/apache/crunch/impl/mr/collect/UnionCollectionIT.java
deleted file mode 100644
index f9f73b2..0000000
--- a/crunch/src/it/java/org/apache/crunch/impl/mr/collect/UnionCollectionIT.java
+++ /dev/null
@@ -1,154 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.collect;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.List;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PTableKeyValueIT;
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.impl.mem.MemPipeline;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.io.At;
-import org.apache.crunch.io.To;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.crunch.types.avro.AvroTypeFamily;
-import org.apache.crunch.types.avro.Avros;
-import org.apache.crunch.types.writable.WritableTypeFamily;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
-import org.junit.runners.Parameterized.Parameters;
-
-import com.google.common.collect.Lists;
-
-@RunWith(value = Parameterized.class)
-public class UnionCollectionIT {
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
-
- private static final Log LOG = LogFactory.getLog(UnionCollectionIT.class);
-
- private PTypeFamily typeFamily;
- private Pipeline pipeline;
- private PCollection<String> union;
-
- private ArrayList<String> EXPECTED = Lists.newArrayList("a", "a", "b", "c", "c", "d", "e");
-
- private Class pipelineClass;
-
- @Before
- @SuppressWarnings("unchecked")
- public void setUp() throws IOException {
- String inputFile1 = tmpDir.copyResourceFileName("set1.txt");
- String inputFile2 = tmpDir.copyResourceFileName("set2.txt");
- if (pipelineClass == null) {
- pipeline = MemPipeline.getInstance();
- } else {
- pipeline = new MRPipeline(pipelineClass, tmpDir.getDefaultConfiguration());
- }
- PCollection<String> firstCollection = pipeline.read(At.textFile(inputFile1, typeFamily.strings()));
- PCollection<String> secondCollection = pipeline.read(At.textFile(inputFile2, typeFamily.strings()));
-
- LOG.info("Test fixture: [" + pipeline.getClass().getSimpleName() + " : " + typeFamily.getClass().getSimpleName()
- + "] First: " + Lists.newArrayList(firstCollection.materialize().iterator()) + ", Second: "
- + Lists.newArrayList(secondCollection.materialize().iterator()));
-
- union = secondCollection.union(firstCollection);
- }
-
- @Parameters
- public static Collection<Object[]> data() throws IOException {
- Object[][] data = new Object[][] { { WritableTypeFamily.getInstance(), PTableKeyValueIT.class },
- { WritableTypeFamily.getInstance(), null }, { AvroTypeFamily.getInstance(), PTableKeyValueIT.class },
- { AvroTypeFamily.getInstance(), null } };
- return Arrays.asList(data);
- }
-
- public UnionCollectionIT(PTypeFamily typeFamily, Class pipelineClass) {
- this.typeFamily = typeFamily;
- this.pipelineClass = pipelineClass;
- }
-
- @Test
- public void unionMaterializeShouldNotThrowNPE() throws Exception {
- checkMaterialized(union.materialize());
- checkMaterialized(pipeline.materialize(union));
- }
-
- private void checkMaterialized(Iterable<String> materialized) {
- List<String> materializedValues = Lists.newArrayList(materialized.iterator());
- Collections.sort(materializedValues);
- LOG.info("Materialized union: " + materializedValues);
- assertEquals(EXPECTED, materializedValues);
- }
-
- @Test
- public void unionWriteShouldNotThrowNPE() throws IOException {
- String outputPath1 = tmpDir.getFileName("output1");
- String outputPath2 = tmpDir.getFileName("output2");
- String outputPath3 = tmpDir.getFileName("output3");
-
- if (typeFamily == AvroTypeFamily.getInstance()) {
- union.write(To.avroFile(outputPath1));
- pipeline.write(union, To.avroFile(outputPath2));
-
- pipeline.run();
-
- checkFileContents(outputPath1);
- checkFileContents(outputPath2);
-
- } else {
-
- union.write(To.textFile(outputPath1));
- pipeline.write(union, To.textFile(outputPath2));
- pipeline.writeTextFile(union, outputPath3);
-
- pipeline.run();
-
- checkFileContents(outputPath1);
- checkFileContents(outputPath2);
- checkFileContents(outputPath3);
- }
- }
-
- private void checkFileContents(String filePath) throws IOException {
-
- List<String> fileContentValues = (typeFamily != AvroTypeFamily.getInstance() || !(pipeline instanceof MRPipeline)) ? Lists
- .newArrayList(pipeline.read(At.textFile(filePath, typeFamily.strings())).materialize().iterator()) : Lists
- .newArrayList(pipeline.read(At.avroFile(filePath, Avros.strings())).materialize().iterator());
-
- Collections.sort(fileContentValues);
-
- LOG.info("Saved Union: " + fileContentValues);
- assertEquals(EXPECTED, fileContentValues);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/io/CompositePathIterableIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/io/CompositePathIterableIT.java b/crunch/src/it/java/org/apache/crunch/io/CompositePathIterableIT.java
deleted file mode 100644
index 08d226d..0000000
--- a/crunch/src/it/java/org/apache/crunch/io/CompositePathIterableIT.java
+++ /dev/null
@@ -1,84 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
-import java.io.File;
-import java.io.IOException;
-
-import org.apache.crunch.io.text.TextFileReaderFactory;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.writable.Writables;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.LocalFileSystem;
-import org.apache.hadoop.fs.Path;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-
-public class CompositePathIterableIT {
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
-
- @Test
- public void testCreate_FilePresent() throws IOException {
- String inputFilePath = tmpDir.copyResourceFileName("set1.txt");
- Configuration conf = new Configuration();
- LocalFileSystem local = FileSystem.getLocal(conf);
-
- Iterable<String> iterable = CompositePathIterable.create(local, new Path(inputFilePath),
- new TextFileReaderFactory<String>(Writables.strings()));
-
- assertEquals(Lists.newArrayList("b", "c", "a", "e"), Lists.newArrayList(iterable));
-
- }
-
- @Test
- public void testCreate_DirectoryPresentButNoFiles() throws IOException {
- Path emptyInputDir = tmpDir.getRootPath();
-
- Configuration conf = new Configuration();
- LocalFileSystem local = FileSystem.getLocal(conf);
-
- Iterable<String> iterable = CompositePathIterable.create(local, emptyInputDir,
- new TextFileReaderFactory<String>(Writables.strings()));
-
- assertTrue(Lists.newArrayList(iterable).isEmpty());
- }
-
- @Test(expected = IOException.class)
- public void testCreate_DirectoryNotPresent() throws IOException {
- File nonExistentDir = tmpDir.getFile("not-there");
-
- // Sanity check
- assertFalse(nonExistentDir.exists());
-
- Configuration conf = new Configuration();
- LocalFileSystem local = FileSystem.getLocal(conf);
-
- CompositePathIterable.create(local, new Path(nonExistentDir.getAbsolutePath()), new TextFileReaderFactory<String>(
- Writables.strings()));
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/io/NLineInputIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/io/NLineInputIT.java b/crunch/src/it/java/org/apache/crunch/io/NLineInputIT.java
deleted file mode 100644
index 52b8ff5..0000000
--- a/crunch/src/it/java/org/apache/crunch/io/NLineInputIT.java
+++ /dev/null
@@ -1,72 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io;
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.crunch.DoFn;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.io.text.NLineFileSource;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.writable.Writables;
-import org.apache.crunch.types.avro.Avros;
-import org.apache.hadoop.conf.Configuration;
-import org.junit.Rule;
-import org.junit.Test;
-
-public class NLineInputIT {
-
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
-
- @Test
- public void testNLine() throws Exception {
- String urlsInputPath = tmpDir.copyResourceFileName("urls.txt");
- Configuration conf = new Configuration(tmpDir.getDefaultConfiguration());
- conf.setInt("io.sort.mb", 10);
- Pipeline pipeline = new MRPipeline(NLineInputIT.class, conf);
- PCollection<String> urls = pipeline.read(new NLineFileSource<String>(urlsInputPath,
- Writables.strings(), 2));
- assertEquals(new Integer(2),
- urls.parallelDo(new LineCountFn(), Avros.ints()).max().getValue());
- }
-
- private static class LineCountFn extends DoFn<String, Integer> {
-
- private int lineCount = 0;
-
- @Override
- public void initialize() {
- this.lineCount = 0;
- }
-
- @Override
- public void process(String input, Emitter<Integer> emitter) {
- lineCount++;
- }
-
- @Override
- public void cleanup(Emitter<Integer> emitter) {
- emitter.emit(lineCount);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/io/TextFileTableIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/io/TextFileTableIT.java b/crunch/src/it/java/org/apache/crunch/io/TextFileTableIT.java
deleted file mode 100644
index bddc0b5..0000000
--- a/crunch/src/it/java/org/apache/crunch/io/TextFileTableIT.java
+++ /dev/null
@@ -1,56 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io;
-
-import static org.apache.crunch.types.writable.Writables.*;
-import static org.junit.Assert.assertEquals;
-
-import java.util.Set;
-
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.io.text.TextFileTableSource;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.ImmutableSet;
-
-/**
- *
- */
-public class TextFileTableIT {
-
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
-
- @Test
- public void testTextFileTable() throws Exception {
- String urlsFile = tmpDir.copyResourceFileName("urls.txt");
- Pipeline pipeline = new MRPipeline(TextFileTableIT.class, tmpDir.getDefaultConfiguration());
- PTable<String, String> urls = pipeline.read(
- new TextFileTableSource<String, String>(urlsFile, tableOf(strings(), strings())));
- Set<Pair<String, Long>> cnts = ImmutableSet.copyOf(urls.keys().count().materialize());
- assertEquals(ImmutableSet.of(Pair.of("www.A.com", 4L), Pair.of("www.B.com", 2L),
- Pair.of("www.C.com", 1L), Pair.of("www.D.com", 1L), Pair.of("www.E.com", 1L),
- Pair.of("www.F.com", 2L)), cnts);
- }
-}
[37/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/resources/shakes.txt
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/resources/shakes.txt b/crunch-core/src/it/resources/shakes.txt
new file mode 100644
index 0000000..63acf18
--- /dev/null
+++ b/crunch-core/src/it/resources/shakes.txt
@@ -0,0 +1,3667 @@
+***The Project Gutenberg's Etext of Shakespeare's First Folio***
+********************The Tragedie of Macbeth*********************
+
+This is our 3rd edition of most of these plays. See the index.
+
+
+Copyright laws are changing all over the world, be sure to check
+the copyright laws for your country before posting these files!!
+
+Please take a look at the important information in this header.
+We encourage you to keep this file on your own disk, keeping an
+electronic path open for the next readers. Do not remove this.
+
+
+**Welcome To The World of Free Plain Vanilla Electronic Texts**
+
+**Etexts Readable By Both Humans and By Computers, Since 1971**
+
+*These Etexts Prepared By Hundreds of Volunteers and Donations*
+
+Information on contacting Project Gutenberg to get Etexts, and
+further information is included below. We need your donations.
+
+
+The Tragedie of Macbeth
+
+by William Shakespeare
+
+July, 2000 [Etext #2264]
+
+
+***The Project Gutenberg's Etext of Shakespeare's First Folio***
+********************The Tragedie of Macbeth*********************
+
+*****This file should be named 0ws3410.txt or 0ws3410.zip******
+
+Corrected EDITIONS of our etexts get a new NUMBER, 0ws3411.txt
+VERSIONS based on separate sources get new LETTER, 0ws3410a.txt
+
+
+Project Gutenberg Etexts are usually created from multiple editions,
+all of which are in the Public Domain in the United States, unless a
+copyright notice is included. Therefore, we usually do NOT keep any
+of these books in compliance with any particular paper edition.
+
+
+We are now trying to release all our books one month in advance
+of the official release dates, leaving time for better editing.
+
+Please note: neither this list nor its contents are final till
+midnight of the last day of the month of any such announcement.
+The official release date of all Project Gutenberg Etexts is at
+Midnight, Central Time, of the last day of the stated month. A
+preliminary version may often be posted for suggestion, comment
+and editing by those who wish to do so. To be sure you have an
+up to date first edition [xxxxx10x.xxx] please check file sizes
+in the first week of the next month. Since our ftp program has
+a bug in it that scrambles the date [tried to fix and failed] a
+look at the file size will have to do, but we will try to see a
+new copy has at least one byte more or less.
+
+
+Information about Project Gutenberg (one page)
+
+We produce about two million dollars for each hour we work. The
+time it takes us, a rather conservative estimate, is fifty hours
+to get any etext selected, entered, proofread, edited, copyright
+searched and analyzed, the copyright letters written, etc. This
+projected audience is one hundred million readers. If our value
+per text is nominally estimated at one dollar then we produce $2
+million dollars per hour this year as we release thirty-six text
+files per month, or 432 more Etexts in 1999 for a total of 2000+
+If these reach just 10% of the computerized population, then the
+total should reach over 200 billion Etexts given away this year.
+
+The Goal of Project Gutenberg is to Give Away One Trillion Etext
+Files by December 31, 2001. [10,000 x 100,000,000 = 1 Trillion]
+This is ten thousand titles each to one hundred million readers,
+which is only ~5% of the present number of computer users.
+
+At our revised rates of production, we will reach only one-third
+of that goal by the end of 2001, or about 3,333 Etexts unless we
+manage to get some real funding; currently our funding is mostly
+from Michael Hart's salary at Carnegie-Mellon University, and an
+assortment of sporadic gifts; this salary is only good for a few
+more years, so we are looking for something to replace it, as we
+don't want Project Gutenberg to be so dependent on one person.
+
+We need your donations more than ever!
+
+
+All donations should be made to "Project Gutenberg/CMU": and are
+tax deductible to the extent allowable by law. (CMU = Carnegie-
+Mellon University).
+
+For these and other matters, please mail to:
+
+Project Gutenberg
+P. O. Box 2782
+Champaign, IL 61825
+
+When all other email fails. . .try our Executive Director:
+Michael S. Hart <hart@pobox.com>
+hart@pobox.com forwards to hart@prairienet.org and archive.org
+if your mail bounces from archive.org, I will still see it, if
+it bounces from prairienet.org, better resend later on. . . .
+
+We would prefer to send you this information by email.
+
+******
+
+To access Project Gutenberg etexts, use any Web browser
+to view http://promo.net/pg. This site lists Etexts by
+author and by title, and includes information about how
+to get involved with Project Gutenberg. You could also
+download our past Newsletters, or subscribe here. This
+is one of our major sites, please email hart@pobox.com,
+for a more complete list of our various sites.
+
+To go directly to the etext collections, use FTP or any
+Web browser to visit a Project Gutenberg mirror (mirror
+sites are available on 7 continents; mirrors are listed
+at http://promo.net/pg).
+
+Mac users, do NOT point and click, typing works better.
+
+Example FTP session:
+
+ftp sunsite.unc.edu
+login: anonymous
+password: your@login
+cd pub/docs/books/gutenberg
+cd etext90 through etext99
+dir [to see files]
+get or mget [to get files. . .set bin for zip files]
+GET GUTINDEX.?? [to get a year's listing of books, e.g., GUTINDEX.99]
+GET GUTINDEX.ALL [to get a listing of ALL books]
+
+***
+
+**Information prepared by the Project Gutenberg legal advisor**
+
+(Three Pages)
+
+
+***START**THE SMALL PRINT!**FOR PUBLIC DOMAIN ETEXTS**START***
+Why is this "Small Print!" statement here? You know: lawyers.
+They tell us you might sue us if there is something wrong with
+your copy of this etext, even if you got it for free from
+someone other than us, and even if what's wrong is not our
+fault. So, among other things, this "Small Print!" statement
+disclaims most of our liability to you. It also tells you how
+you can distribute copies of this etext if you want to.
+
+*BEFORE!* YOU USE OR READ THIS ETEXT
+By using or reading any part of this PROJECT GUTENBERG-tm
+etext, you indicate that you understand, agree to and accept
+this "Small Print!" statement. If you do not, you can receive
+a refund of the money (if any) you paid for this etext by
+sending a request within 30 days of receiving it to the person
+you got it from. If you received this etext on a physical
+medium (such as a disk), you must return it with your request.
+
+ABOUT PROJECT GUTENBERG-TM ETEXTS
+This PROJECT GUTENBERG-tm etext, like most PROJECT GUTENBERG-
+tm etexts, is a "public domain" work distributed by Professor
+Michael S. Hart through the Project Gutenberg Association at
+Carnegie-Mellon University (the "Project"). Among other
+things, this means that no one owns a United States copyright
+on or for this work, so the Project (and you!) can copy and
+distribute it in the United States without permission and
+without paying copyright royalties. Special rules, set forth
+below, apply if you wish to copy and distribute this etext
+under the Project's "PROJECT GUTENBERG" trademark.
+
+To create these etexts, the Project expends considerable
+efforts to identify, transcribe and proofread public domain
+works. Despite these efforts, the Project's etexts and any
+medium they may be on may contain "Defects". Among other
+things, Defects may take the form of incomplete, inaccurate or
+corrupt data, transcription errors, a copyright or other
+intellectual property infringement, a defective or damaged
+disk or other etext medium, a computer virus, or computer
+codes that damage or cannot be read by your equipment.
+
+LIMITED WARRANTY; DISCLAIMER OF DAMAGES
+But for the "Right of Replacement or Refund" described below,
+[1] the Project (and any other party you may receive this
+etext from as a PROJECT GUTENBERG-tm etext) disclaims all
+liability to you for damages, costs and expenses, including
+legal fees, and [2] YOU HAVE NO REMEDIES FOR NEGLIGENCE OR
+UNDER STRICT LIABILITY, OR FOR BREACH OF WARRANTY OR CONTRACT,
+INCLUDING BUT NOT LIMITED TO INDIRECT, CONSEQUENTIAL, PUNITIVE
+OR INCIDENTAL DAMAGES, EVEN IF YOU GIVE NOTICE OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+If you discover a Defect in this etext within 90 days of
+receiving it, you can receive a refund of the money (if any)
+you paid for it by sending an explanatory note within that
+time to the person you received it from. If you received it
+on a physical medium, you must return it with your note, and
+such person may choose to alternatively give you a replacement
+copy. If you received it electronically, such person may
+choose to alternatively give you a second opportunity to
+receive it electronically.
+
+THIS ETEXT IS OTHERWISE PROVIDED TO YOU "AS-IS". NO OTHER
+WARRANTIES OF ANY KIND, EXPRESS OR IMPLIED, ARE MADE TO YOU AS
+TO THE ETEXT OR ANY MEDIUM IT MAY BE ON, INCLUDING BUT NOT
+LIMITED TO WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A
+PARTICULAR PURPOSE.
+
+Some states do not allow disclaimers of implied warranties or
+the exclusion or limitation of consequential damages, so the
+above disclaimers and exclusions may not apply to you, and you
+may have other legal rights.
+
+INDEMNITY
+You will indemnify and hold the Project, its directors,
+officers, members and agents harmless from all liability, cost
+and expense, including legal fees, that arise directly or
+indirectly from any of the following that you do or cause:
+[1] distribution of this etext, [2] alteration, modification,
+or addition to the etext, or [3] any Defect.
+
+DISTRIBUTION UNDER "PROJECT GUTENBERG-tm"
+You may distribute copies of this etext electronically, or by
+disk, book or any other medium if you either delete this
+"Small Print!" and all other references to Project Gutenberg,
+or:
+
+[1] Only give exact copies of it. Among other things, this
+ requires that you do not remove, alter or modify the
+ etext or this "small print!" statement. You may however,
+ if you wish, distribute this etext in machine readable
+ binary, compressed, mark-up, or proprietary form,
+ including any form resulting from conversion by word pro-
+ cessing or hypertext software, but only so long as
+ *EITHER*:
+
+ [*] The etext, when displayed, is clearly readable, and
+ does *not* contain characters other than those
+ intended by the author of the work, although tilde
+ (~), asterisk (*) and underline (_) characters may
+ be used to convey punctuation intended by the
+ author, and additional characters may be used to
+ indicate hypertext links; OR
+
+ [*] The etext may be readily converted by the reader at
+ no expense into plain ASCII, EBCDIC or equivalent
+ form by the program that displays the etext (as is
+ the case, for instance, with most word processors);
+ OR
+
+ [*] You provide, or agree to also provide on request at
+ no additional cost, fee or expense, a copy of the
+ etext in its original plain ASCII form (or in EBCDIC
+ or other equivalent proprietary form).
+
+[2] Honor the etext refund and replacement provisions of this
+ "Small Print!" statement.
+
+[3] Pay a trademark license fee to the Project of 20% of the
+ net profits you derive calculated using the method you
+ already use to calculate your applicable taxes. If you
+ don't derive profits, no royalty is due. Royalties are
+ payable to "Project Gutenberg Association/Carnegie-Mellon
+ University" within the 60 days following each
+ date you prepare (or were legally required to prepare)
+ your annual (or equivalent periodic) tax return.
+
+WHAT IF YOU *WANT* TO SEND MONEY EVEN IF YOU DON'T HAVE TO?
+The Project gratefully accepts contributions in money, time,
+scanning machines, OCR software, public domain etexts, royalty
+free copyright licenses, and every other sort of contribution
+you can think of. Money should be paid to "Project Gutenberg
+Association / Carnegie-Mellon University".
+
+*END*THE SMALL PRINT! FOR PUBLIC DOMAIN ETEXTS*Ver.04.29.93*END*
+
+
+
+
+
+Project Gutenberg's Etext of Shakespeare's The Tragedie of Macbeth
+
+
+
+
+
+Executive Director's Notes:
+
+In addition to the notes below, and so you will *NOT* think all
+the spelling errors introduced by the printers of the time have
+been corrected, here are the first few lines of Hamlet, as they
+are presented herein:
+
+ Barnardo. Who's there?
+ Fran. Nay answer me: Stand & vnfold
+your selfe
+
+ Bar. Long liue the King
+
+***
+
+As I understand it, the printers often ran out of certain words
+or letters they had often packed into a "cliche". . .this is the
+original meaning of the term cliche. . .and thus, being unwilling
+to unpack the cliches, and thus you will see some substitutions
+that look very odd. . .such as the exchanges of u for v, v for u,
+above. . .and you may wonder why they did it this way, presuming
+Shakespeare did not actually write the play in this manner. . . .
+
+The answer is that they MAY have packed "liue" into a cliche at a
+time when they were out of "v"'s. . .possibly having used "vv" in
+place of some "w"'s, etc. This was a common practice of the day,
+as print was still quite expensive, and they didn't want to spend
+more on a wider selection of characters than they had to.
+
+You will find a lot of these kinds of "errors" in this text, as I
+have mentioned in other times and places, many "scholars" have an
+extreme attachment to these errors, and many have accorded them a
+very high place in the "canon" of Shakespeare. My father read an
+assortment of these made available to him by Cambridge University
+in England for several months in a glass room constructed for the
+purpose. To the best of my knowledge he read ALL those available
+. . .in great detail. . .and determined from the various changes,
+that Shakespeare most likely did not write in nearly as many of a
+variety of errors we credit him for, even though he was in/famous
+for signing his name with several different spellings.
+
+So, please take this into account when reading the comments below
+made by our volunteer who prepared this file: you may see errors
+that are "not" errors. . . .
+
+So. . .with this caveat. . .we have NOT changed the canon errors,
+here is the Project Gutenberg Etext of Shakespeare's The Tragedie
+of Macbeth.
+
+Michael S. Hart
+Project Gutenberg
+Executive Director
+
+
+***
+
+
+Scanner's Notes: What this is and isn't. This was taken from
+a copy of Shakespeare's first folio and it is as close as I can
+come in ASCII to the printed text.
+
+The elongated S's have been changed to small s's and the
+conjoined ae have been changed to ae. I have left the spelling,
+punctuation, capitalization as close as possible to the
+printed text. I have corrected some spelling mistakes (I have put
+together a spelling dictionary devised from the spellings of the
+Geneva Bible and Shakespeare's First Folio and have unified
+spellings according to this template), typo's and expanded
+abbreviations as I have come across them. Everything within
+brackets [] is what I have added. So if you don't like that
+you can delete everything within the brackets if you want a
+purer Shakespeare.
+
+Another thing that you should be aware of is that there are textual
+differences between various copies of the first folio. So there may
+be differences (other than what I have mentioned above) between
+this and other first folio editions. This is due to the printer's
+habit of setting the type and running off a number of copies and
+then proofing the printed copy and correcting the type and then
+continuing the printing run. The proof run wasn't thrown away but
+incorporated into the printed copies. This is just the way it is.
+The text I have used was a composite of more than 30 different
+First Folio editions' best pages.
+
+If you find any scanning errors, out and out typos, punctuation
+errors, or if you disagree with my spelling choices please feel
+free to email me those errors. I wish to make this the best
+etext possible. My email address for right now are haradda@aol.com
+and davidr@inconnect.com. I hope that you enjoy this.
+
+David Reed
+
+The Tragedie of Macbeth
+
+Actus Primus. Scoena Prima.
+
+Thunder and Lightning. Enter three Witches.
+
+ 1. When shall we three meet againe?
+In Thunder, Lightning, or in Raine?
+ 2. When the Hurley-burley's done,
+When the Battaile's lost, and wonne
+
+ 3. That will be ere the set of Sunne
+
+ 1. Where the place?
+ 2. Vpon the Heath
+
+ 3. There to meet with Macbeth
+
+ 1. I come, Gray-Malkin
+
+ All. Padock calls anon: faire is foule, and foule is faire,
+Houer through the fogge and filthie ayre.
+
+Exeunt.
+
+
+Scena Secunda.
+
+Alarum within. Enter King Malcome, Donalbaine, Lenox, with
+attendants,
+meeting a bleeding Captaine.
+
+ King. What bloody man is that? he can report,
+As seemeth by his plight, of the Reuolt
+The newest state
+
+ Mal. This is the Serieant,
+Who like a good and hardie Souldier fought
+'Gainst my Captiuitie: Haile braue friend;
+Say to the King, the knowledge of the Broyle,
+As thou didst leaue it
+
+ Cap. Doubtfull it stood,
+As two spent Swimmers, that doe cling together,
+And choake their Art: The mercilesse Macdonwald
+(Worthie to be a Rebell, for to that
+The multiplying Villanies of Nature
+Doe swarme vpon him) from the Westerne Isles
+Of Kernes and Gallowgrosses is supply'd,
+And Fortune on his damned Quarry smiling,
+Shew'd like a Rebells Whore: but all's too weake:
+For braue Macbeth (well hee deserues that Name)
+Disdayning Fortune, with his brandisht Steele,
+Which smoak'd with bloody execution
+(Like Valours Minion) caru'd out his passage,
+Till hee fac'd the Slaue:
+Which neu'r shooke hands, nor bad farwell to him,
+Till he vnseam'd him from the Naue toth' Chops,
+And fix'd his Head vpon our Battlements
+
+ King. O valiant Cousin, worthy Gentleman
+
+ Cap. As whence the Sunne 'gins his reflection,
+Shipwracking Stormes, and direfull Thunders:
+So from that Spring, whence comfort seem'd to come,
+Discomfort swells: Marke King of Scotland, marke,
+No sooner Iustice had, with Valour arm'd,
+Compell'd these skipping Kernes to trust their heeles,
+But the Norweyan Lord, surueying vantage,
+With furbusht Armes, and new supplyes of men,
+Began a fresh assault
+
+ King. Dismay'd not this our Captaines, Macbeth and
+Banquoh?
+ Cap. Yes, as Sparrowes, Eagles;
+Or the Hare, the Lyon:
+If I say sooth, I must report they were
+As Cannons ouer-charg'd with double Cracks,
+So they doubly redoubled stroakes vpon the Foe:
+Except they meant to bathe in reeking Wounds,
+Or memorize another Golgotha,
+I cannot tell: but I am faint,
+My Gashes cry for helpe
+
+ King. So well thy words become thee, as thy wounds,
+They smack of Honor both: Goe get him Surgeons.
+Enter Rosse and Angus.
+
+Who comes here?
+ Mal. The worthy Thane of Rosse
+
+ Lenox. What a haste lookes through his eyes?
+So should he looke, that seemes to speake things strange
+
+ Rosse. God saue the King
+
+ King. Whence cam'st thou, worthy Thane?
+ Rosse. From Fiffe, great King,
+Where the Norweyan Banners flowt the Skie,
+And fanne our people cold.
+Norway himselfe, with terrible numbers,
+Assisted by that most disloyall Traytor,
+The Thane of Cawdor, began a dismall Conflict,
+Till that Bellona's Bridegroome, lapt in proofe,
+Confronted him with selfe-comparisons,
+Point against Point, rebellious Arme 'gainst Arme,
+Curbing his lauish spirit: and to conclude,
+The Victorie fell on vs
+
+ King. Great happinesse
+
+ Rosse. That now Sweno, the Norwayes King,
+Craues composition:
+Nor would we deigne him buriall of his men,
+Till he disbursed, at Saint Colmes ynch,
+Ten thousand Dollars, to our generall vse
+
+ King. No more that Thane of Cawdor shall deceiue
+Our Bosome interest: Goe pronounce his present death,
+And with his former Title greet Macbeth
+
+ Rosse. Ile see it done
+
+ King. What he hath lost, Noble Macbeth hath wonne.
+
+Exeunt.
+
+
+Scena Tertia.
+
+Thunder. Enter the three Witches.
+
+ 1. Where hast thou beene, Sister?
+ 2. Killing Swine
+
+ 3. Sister, where thou?
+ 1. A Saylors Wife had Chestnuts in her Lappe,
+And mouncht, & mouncht, and mouncht:
+Giue me, quoth I.
+Aroynt thee, Witch, the rumpe-fed Ronyon cryes.
+Her Husband's to Aleppo gone, Master o'th' Tiger:
+But in a Syue Ile thither sayle,
+And like a Rat without a tayle,
+Ile doe, Ile doe, and Ile doe
+
+ 2. Ile giue thee a Winde
+
+ 1. Th'art kinde
+
+ 3. And I another
+
+ 1. I my selfe haue all the other,
+And the very Ports they blow,
+All the Quarters that they know,
+I'th' Ship-mans Card.
+Ile dreyne him drie as Hay:
+Sleepe shall neyther Night nor Day
+Hang vpon his Pent-house Lid:
+He shall liue a man forbid:
+Wearie Seu'nights, nine times nine,
+Shall he dwindle, peake, and pine:
+Though his Barke cannot be lost,
+Yet it shall be Tempest-tost.
+Looke what I haue
+
+ 2. Shew me, shew me
+
+ 1. Here I haue a Pilots Thumbe,
+Wrackt, as homeward he did come.
+
+Drum within.
+
+ 3. A Drumme, a Drumme:
+Macbeth doth come
+
+ All. The weyward Sisters, hand in hand,
+Posters of the Sea and Land,
+Thus doe goe, about, about,
+Thrice to thine, and thrice to mine,
+And thrice againe, to make vp nine.
+Peace, the Charme's wound vp.
+Enter Macbeth and Banquo.
+
+ Macb. So foule and faire a day I haue not seene
+
+ Banquo. How farre is't call'd to Soris? What are these,
+So wither'd, and so wilde in their attyre,
+That looke not like th' Inhabitants o'th' Earth,
+And yet are on't? Liue you, or are you aught
+That man may question? you seeme to vnderstand me,
+By each at once her choppie finger laying
+Vpon her skinnie Lips: you should be Women,
+And yet your Beards forbid me to interprete
+That you are so
+
+ Mac. Speake if you can: what are you?
+ 1. All haile Macbeth, haile to thee Thane of Glamis
+
+ 2. All haile Macbeth, haile to thee Thane of Cawdor
+
+ 3. All haile Macbeth, that shalt be King hereafter
+
+ Banq. Good Sir, why doe you start, and seeme to feare
+Things that doe sound so faire? i'th' name of truth
+Are ye fantasticall, or that indeed
+Which outwardly ye shew? My Noble Partner
+You greet with present Grace, and great prediction
+Of Noble hauing, and of Royall hope,
+That he seemes wrapt withall: to me you speake not.
+If you can looke into the Seedes of Time,
+And say, which Graine will grow, and which will not,
+Speake then to me, who neyther begge, nor feare
+Your fauors, nor your hate
+
+ 1. Hayle
+
+ 2. Hayle
+
+ 3. Hayle
+
+ 1. Lesser than Macbeth, and greater
+
+ 2. Not so happy, yet much happyer
+
+ 3. Thou shalt get Kings, though thou be none:
+So all haile Macbeth, and Banquo
+
+ 1. Banquo, and Macbeth, all haile
+
+ Macb. Stay you imperfect Speakers, tell me more:
+By Sinells death, I know I am Thane of Glamis,
+But how, of Cawdor? the Thane of Cawdor liues
+A prosperous Gentleman: And to be King,
+Stands not within the prospect of beleefe,
+No more then to be Cawdor. Say from whence
+You owe this strange Intelligence, or why
+Vpon this blasted Heath you stop our way
+With such Prophetique greeting?
+Speake, I charge you.
+
+Witches vanish.
+
+ Banq. The Earth hath bubbles, as the Water ha's,
+And these are of them: whither are they vanish'd?
+ Macb. Into the Ayre: and what seem'd corporall,
+Melted, as breath into the Winde.
+Would they had stay'd
+
+ Banq. Were such things here, as we doe speake about?
+Or haue we eaten on the insane Root,
+That takes the Reason Prisoner?
+ Macb. Your Children shall be Kings
+
+ Banq. You shall be King
+
+ Macb. And Thane of Cawdor too: went it not so?
+ Banq. Toth' selfe-same tune and words: who's here?
+Enter Rosse and Angus.
+
+ Rosse. The King hath happily receiu'd, Macbeth,
+The newes of thy successe: and when he reades
+Thy personall Venture in the Rebels sight,
+His Wonders and his Prayses doe contend,
+Which should be thine, or his: silenc'd with that,
+In viewing o're the rest o'th' selfe-same day,
+He findes thee in the stout Norweyan Rankes,
+Nothing afeard of what thy selfe didst make
+Strange Images of death, as thick as Tale
+Can post with post, and euery one did beare
+Thy prayses in his Kingdomes great defence,
+And powr'd them downe before him
+
+ Ang. Wee are sent,
+To giue thee from our Royall Master thanks,
+Onely to harrold thee into his sight,
+Not pay thee
+
+ Rosse. And for an earnest of a greater Honor,
+He bad me, from him, call thee Thane of Cawdor:
+In which addition, haile most worthy Thane,
+For it is thine
+
+ Banq. What, can the Deuill speake true?
+ Macb. The Thane of Cawdor liues:
+Why doe you dresse me in borrowed Robes?
+ Ang. Who was the Thane, liues yet,
+But vnder heauie Iudgement beares that Life,
+Which he deserues to loose.
+Whether he was combin'd with those of Norway,
+Or did lyne the Rebell with hidden helpe,
+And vantage; or that with both he labour'd
+In his Countreyes wracke, I know not:
+But Treasons Capitall, confess'd, and prou'd,
+Haue ouerthrowne him
+
+ Macb. Glamys, and Thane of Cawdor:
+The greatest is behinde. Thankes for your paines.
+Doe you not hope your Children shall be Kings,
+When those that gaue the Thane of Cawdor to me,
+Promis'd no lesse to them
+
+ Banq. That trusted home,
+Might yet enkindle you vnto the Crowne,
+Besides the Thane of Cawdor. But 'tis strange:
+And oftentimes, to winne vs to our harme,
+The Instruments of Darknesse tell vs Truths,
+Winne vs with honest Trifles, to betray's
+In deepest consequence.
+Cousins, a word, I pray you
+
+ Macb. Two Truths are told,
+As happy Prologues to the swelling Act
+Of the Imperiall Theame. I thanke you Gentlemen:
+This supernaturall solliciting
+Cannot be ill; cannot be good.
+If ill? why hath it giuen me earnest of successe,
+Commencing in a Truth? I am Thane of Cawdor.
+If good? why doe I yeeld to that suggestion,
+Whose horrid Image doth vnfixe my Heire,
+And make my seated Heart knock at my Ribbes,
+Against the vse of Nature? Present Feares
+Are lesse then horrible Imaginings:
+My Thought, whose Murther yet is but fantasticall,
+Shakes so my single state of Man,
+That Function is smother'd in surmise,
+And nothing is, but what is not
+
+ Banq. Looke how our Partner's rapt
+
+ Macb. If Chance will haue me King,
+Why Chance may Crowne me,
+Without my stirre
+
+ Banq. New Honors come vpon him
+Like our strange Garments, cleaue not to their mould,
+But with the aid of vse
+
+ Macb. Come what come may,
+Time, and the Houre, runs through the roughest Day
+
+ Banq. Worthy Macbeth, wee stay vpon your leysure
+
+ Macb. Giue me your fauour:
+My dull Braine was wrought with things forgotten.
+Kinde Gentlemen, your paines are registred,
+Where euery day I turne the Leafe,
+To reade them.
+Let vs toward the King: thinke vpon
+What hath chanc'd: and at more time,
+The Interim hauing weigh'd it, let vs speake
+Our free Hearts each to other
+
+ Banq. Very gladly
+
+ Macb. Till then enough:
+Come friends.
+
+Exeunt.
+
+
+Scena Quarta.
+
+Flourish. Enter King, Lenox, Malcolme, Donalbaine, and
+Attendants.
+
+ King. Is execution done on Cawdor?
+Or not those in Commission yet return'd?
+ Mal. My Liege, they are not yet come back.
+But I haue spoke with one that saw him die:
+Who did report, that very frankly hee
+Confess'd his Treasons, implor'd your Highnesse Pardon,
+And set forth a deepe Repentance:
+Nothing in his Life became him,
+Like the leauing it. Hee dy'de,
+As one that had beene studied in his death,
+To throw away the dearest thing he ow'd,
+As 'twere a carelesse Trifle
+
+ King. There's no Art,
+To finde the Mindes construction in the Face.
+He was a Gentleman, on whom I built
+An absolute Trust.
+Enter Macbeth, Banquo, Rosse, and Angus.
+
+O worthyest Cousin,
+The sinne of my Ingratitude euen now
+Was heauie on me. Thou art so farre before,
+That swiftest Wing of Recompence is slow,
+To ouertake thee. Would thou hadst lesse deseru'd,
+That the proportion both of thanks, and payment,
+Might haue beene mine: onely I haue left to say,
+More is thy due, then more then all can pay
+
+ Macb. The seruice, and the loyaltie I owe,
+In doing it, payes it selfe.
+Your Highnesse part, is to receiue our Duties:
+And our Duties are to your Throne, and State,
+Children, and Seruants; which doe but what they should,
+By doing euery thing safe toward your Loue
+And Honor
+
+ King. Welcome hither:
+I haue begun to plant thee, and will labour
+To make thee full of growing. Noble Banquo,
+That hast no lesse deseru'd, nor must be knowne
+No lesse to haue done so: Let me enfold thee,
+And hold thee to my Heart
+
+ Banq. There if I grow,
+The Haruest is your owne
+
+ King. My plenteous Ioyes,
+Wanton in fulnesse, seeke to hide themselues
+In drops of sorrow. Sonnes, Kinsmen, Thanes,
+And you whose places are the nearest, know,
+We will establish our Estate vpon
+Our eldest, Malcolme, whom we name hereafter,
+The Prince of Cumberland: which Honor must
+Not vnaccompanied, inuest him onely,
+But signes of Noblenesse, like Starres, shall shine
+On all deseruers. From hence to Envernes,
+And binde vs further to you
+
+ Macb. The Rest is Labor, which is not vs'd for you:
+Ile be my selfe the Herbenger, and make ioyfull
+The hearing of my Wife, with your approach:
+So humbly take my leaue
+
+ King. My worthy Cawdor
+
+ Macb. The Prince of Cumberland: that is a step,
+On which I must fall downe, or else o're-leape,
+For in my way it lyes. Starres hide your fires,
+Let not Light see my black and deepe desires:
+The Eye winke at the Hand: yet let that bee,
+Which the Eye feares, when it is done to see.
+Exit.
+
+ King. True worthy Banquo: he is full so valiant,
+And in his commendations, I am fed:
+It is a Banquet to me. Let's after him,
+Whose care is gone before, to bid vs welcome:
+It is a peerelesse Kinsman.
+
+Flourish. Exeunt.
+
+
+Scena Quinta.
+
+Enter Macbeths Wife alone with a Letter.
+
+ Lady. They met me in the day of successe: and I haue
+learn'd by the perfect'st report, they haue more in them, then
+mortall knowledge. When I burnt in desire to question them
+further, they made themselues Ayre, into which they vanish'd.
+Whiles I stood rapt in the wonder of it, came Missiues from
+the King, who all-hail'd me Thane of Cawdor, by which Title
+before, these weyward Sisters saluted me, and referr'd me to
+the comming on of time, with haile King that shalt be. This
+haue I thought good to deliuer thee (my dearest Partner of
+Greatnesse) that thou might'st not loose the dues of reioycing
+by being ignorant of what Greatnesse is promis'd thee. Lay
+it to thy heart and farewell.
+Glamys thou art, and Cawdor, and shalt be
+What thou art promis'd: yet doe I feare thy Nature,
+It is too full o'th' Milke of humane kindnesse,
+To catch the neerest way. Thou would'st be great,
+Art not without Ambition, but without
+The illnesse should attend it. What thou would'st highly,
+That would'st thou holily: would'st not play false,
+And yet would'st wrongly winne.
+Thould'st haue, great Glamys, that which cryes,
+Thus thou must doe, if thou haue it;
+And that which rather thou do'st feare to doe,
+Then wishest should be vndone. High thee hither,
+That I may powre my Spirits in thine Eare,
+And chastise with the valour of my Tongue
+All that impeides thee from the Golden Round,
+Which Fate and Metaphysicall ayde doth seeme
+To haue thee crown'd withall.
+Enter Messenger.
+
+What is your tidings?
+ Mess. The King comes here to Night
+
+ Lady. Thou'rt mad to say it.
+Is not thy Master with him? who, wer't so,
+Would haue inform'd for preparation
+
+ Mess. So please you, it is true: our Thane is comming:
+One of my fellowes had the speed of him;
+Who almost dead for breath, had scarcely more
+Then would make vp his Message
+
+ Lady. Giue him tending,
+He brings great newes,
+
+Exit Messenger.
+
+The Rauen himselfe is hoarse,
+That croakes the fatall entrance of Duncan
+Vnder my Battlements. Come you Spirits,
+That tend on mortall thoughts, vnsex me here,
+And fill me from the Crowne to the Toe, top-full
+Of direst Crueltie: make thick my blood,
+Stop vp th' accesse, and passage to Remorse,
+That no compunctious visitings of Nature
+Shake my fell purpose, nor keepe peace betweene
+Th' effect, and hit. Come to my Womans Brests,
+And take my Milke for Gall, you murth'ring Ministers,
+Where-euer, in your sightlesse substances,
+You wait on Natures Mischiefe. Come thick Night,
+And pall thee in the dunnest smoake of Hell,
+That my keene Knife see not the Wound it makes,
+Nor Heauen peepe through the Blanket of the darke,
+To cry, hold, hold.
+Enter Macbeth.
+
+Great Glamys, worthy Cawdor,
+Greater then both, by the all-haile hereafter,
+Thy Letters haue transported me beyond
+This ignorant present, and I feele now
+The future in the instant
+
+ Macb. My dearest Loue,
+Duncan comes here to Night
+
+ Lady. And when goes hence?
+ Macb. To morrow, as he purposes
+
+ Lady. O neuer,
+Shall Sunne that Morrow see.
+Your Face, my Thane, is as a Booke, where men
+May reade strange matters, to beguile the time.
+Looke like the time, beare welcome in your Eye,
+Your Hand, your Tongue: looke like th' innocent flower,
+But be the Serpent vnder't. He that's comming,
+Must be prouided for: and you shall put
+This Nights great Businesse into my dispatch,
+Which shall to all our Nights, and Dayes to come,
+Giue solely soueraigne sway, and Masterdome
+
+ Macb. We will speake further,
+ Lady. Onely looke vp cleare:
+To alter fauor, euer is to feare:
+Leaue all the rest to me.
+
+Exeunt.
+
+
+Scena Sexta.
+
+Hoboyes, and Torches. Enter King, Malcolme, Donalbaine, Banquo,
+Lenox, Macduff, Rosse, Angus, and Attendants.
+
+ King. This Castle hath a pleasant seat,
+The ayre nimbly and sweetly recommends it selfe
+Vnto our gentle sences
+
+ Banq. This Guest of Summer,
+The Temple-haunting Barlet does approue,
+By his loued Mansonry, that the Heauens breath
+Smells wooingly here: no Iutty frieze,
+Buttrice, nor Coigne of Vantage, but this Bird
+Hath made his pendant Bed, and procreant Cradle,
+Where they must breed, and haunt: I haue obseru'd
+The ayre is delicate.
+Enter Lady.
+
+ King. See, see our honor'd Hostesse:
+The Loue that followes vs, sometime is our trouble,
+Which still we thanke as Loue. Herein I teach you,
+How you shall bid God-eyld vs for your paines,
+And thanke vs for your trouble
+
+ Lady. All our seruice,
+In euery point twice done, and then done double,
+Were poore, and single Businesse, to contend
+Against those Honors deepe, and broad,
+Wherewith your Maiestie loades our House:
+For those of old, and the late Dignities,
+Heap'd vp to them, we rest your Ermites
+
+ King. Where's the Thane of Cawdor?
+We courst him at the heeles, and had a purpose
+To be his Purueyor: But he rides well,
+And his great Loue (sharpe as his Spurre) hath holp him
+To his home before vs: Faire and Noble Hostesse
+We are your guest to night
+
+ La. Your Seruants euer,
+Haue theirs, themselues, and what is theirs in compt,
+To make their Audit at your Highnesse pleasure,
+Still to returne your owne
+
+ King. Giue me your hand:
+Conduct me to mine Host we loue him highly,
+And shall continue, our Graces towards him.
+By your leaue Hostesse.
+
+Exeunt.
+
+Scena Septima.
+
+Hoboyes. Torches. Enter a Sewer, and diuers Seruants with Dishes
+and Seruice ouer the Stage. Then enter Macbeth
+
+ Macb. If it were done, when 'tis done, then 'twer well,
+It were done quickly: If th' Assassination
+Could trammell vp the Consequence, and catch
+With his surcease, Successe: that but this blow
+Might be the be all, and the end all. Heere,
+But heere, vpon this Banke and Schoole of time,
+Wee'ld iumpe the life to come. But in these Cases,
+We still haue iudgement heere, that we but teach
+Bloody Instructions, which being taught, returne
+To plague th' Inuenter, this euen-handed Iustice
+Commends th' Ingredience of our poyson'd Challice
+To our owne lips. Hee's heere in double trust;
+First, as I am his Kinsman, and his Subiect,
+Strong both against the Deed: Then, as his Host,
+Who should against his Murtherer shut the doore,
+Not beare the knife my selfe. Besides, this Duncane
+Hath borne his Faculties so meeke; hath bin
+So cleere in his great Office, that his Vertues
+Will pleade like Angels, Trumpet-tongu'd against
+The deepe damnation of his taking off:
+And Pitty, like a naked New-borne-Babe,
+Striding the blast, or Heauens Cherubin, hors'd
+Vpon the sightlesse Curriors of the Ayre,
+Shall blow the horrid deed in euery eye,
+That teares shall drowne the winde. I haue no Spurre
+To pricke the sides of my intent, but onely
+Vaulting Ambition, which ore-leapes it selfe,
+And falles on th' other.
+Enter Lady.
+
+How now? What Newes?
+ La. He has almost supt: why haue you left the chamber?
+ Mac. Hath he ask'd for me?
+ La. Know you not, he ha's?
+ Mac. We will proceed no further in this Businesse:
+He hath Honour'd me of late, and I haue bought
+Golden Opinions from all sorts of people,
+Which would be worne now in their newest glosse,
+Not cast aside so soone
+
+ La. Was the hope drunke,
+Wherein you drest your selfe? Hath it slept since?
+And wakes it now to looke so greene, and pale,
+At what it did so freely? From this time,
+Such I account thy loue. Art thou affear'd
+To be the same in thine owne Act, and Valour,
+As thou art in desire? Would'st thou haue that
+Which thou esteem'st the Ornament of Life,
+And liue a Coward in thine owne Esteeme?
+Letting I dare not, wait vpon I would,
+Like the poore Cat i'th' Addage
+
+ Macb. Prythee peace:
+I dare do all that may become a man,
+Who dares do more, is none
+
+ La. What Beast was't then
+That made you breake this enterprize to me?
+When you durst do it, then you were a man:
+And to be more then what you were, you would
+Be so much more the man. Nor time, nor place
+Did then adhere, and yet you would make both:
+They haue made themselues, and that their fitnesse now
+Do's vnmake you. I haue giuen Sucke, and know
+How tender 'tis to loue the Babe that milkes me,
+I would, while it was smyling in my Face,
+Haue pluckt my Nipple from his Bonelesse Gummes,
+And dasht the Braines out, had I so sworne
+As you haue done to this
+
+ Macb. If we should faile?
+ Lady. We faile?
+But screw your courage to the sticking place,
+And wee'le not fayle: when Duncan is asleepe,
+(Whereto the rather shall his dayes hard Iourney
+Soundly inuite him) his two Chamberlaines
+Will I with Wine, and Wassell, so conuince,
+That Memorie, the Warder of the Braine,
+Shall be a Fume, and the Receit of Reason
+A Lymbeck onely: when in Swinish sleepe,
+Their drenched Natures lyes as in a Death,
+What cannot you and I performe vpon
+Th' vnguarded Duncan? What not put vpon
+His spungie Officers? who shall beare the guilt
+Of our great quell
+
+ Macb. Bring forth Men-Children onely:
+For thy vndaunted Mettle should compose
+Nothing but Males. Will it not be receiu'd,
+When we haue mark'd with blood those sleepie two
+Of his owne Chamber, and vs'd their very Daggers,
+That they haue don't?
+ Lady. Who dares receiue it other,
+As we shall make our Griefes and Clamor rore,
+Vpon his Death?
+ Macb. I am settled, and bend vp
+Each corporall Agent to this terrible Feat.
+Away, and mock the time with fairest show,
+False Face must hide what the false Heart doth know.
+
+Exeunt.
+
+
+Actus Secundus. Scena Prima.
+
+Enter Banquo, and Fleance, with a Torch before him.
+
+ Banq. How goes the Night, Boy?
+ Fleance. The Moone is downe: I haue not heard the
+Clock
+
+ Banq. And she goes downe at Twelue
+
+ Fleance. I take't, 'tis later, Sir
+
+ Banq. Hold, take my Sword:
+There's Husbandry in Heauen,
+Their Candles are all out: take thee that too.
+A heauie Summons lyes like Lead vpon me,
+And yet I would not sleepe:
+Mercifull Powers, restraine in me the cursed thoughts
+That Nature giues way to in repose.
+Enter Macbeth, and a Seruant with a Torch.
+
+Giue me my Sword: who's there?
+ Macb. A Friend
+
+ Banq. What Sir, not yet at rest? the King's a bed.
+He hath beene in vnusuall Pleasure,
+And sent forth great Largesse to your Offices.
+This Diamond he greetes your Wife withall,
+By the name of most kind Hostesse,
+And shut vp in measurelesse content
+
+ Mac. Being vnprepar'd,
+Our will became the seruant to defect,
+Which else should free haue wrought
+
+ Banq. All's well.
+I dreamt last Night of the three weyward Sisters:
+To you they haue shew'd some truth
+
+ Macb. I thinke not of them:
+Yet when we can entreat an houre to serue,
+We would spend it in some words vpon that Businesse,
+If you would graunt the time
+
+ Banq. At your kind'st leysure
+
+ Macb. If you shall cleaue to my consent,
+When 'tis, it shall make Honor for you
+
+ Banq. So I lose none,
+In seeking to augment it, but still keepe
+My Bosome franchis'd, and Allegeance cleare,
+I shall be counsail'd
+
+ Macb. Good repose the while
+
+ Banq. Thankes Sir: the like to you.
+
+Exit Banquo.
+
+ Macb. Goe bid thy Mistresse, when my drinke is ready,
+She strike vpon the Bell. Get thee to bed.
+Exit.
+
+Is this a Dagger, which I see before me,
+The Handle toward my Hand? Come, let me clutch thee:
+I haue thee not, and yet I see thee still.
+Art thou not fatall Vision, sensible
+To feeling, as to sight? or art thou but
+A Dagger of the Minde, a false Creation,
+Proceeding from the heat-oppressed Braine?
+I see thee yet, in forme as palpable,
+As this which now I draw.
+Thou marshall'st me the way that I was going,
+And such an Instrument I was to vse.
+Mine Eyes are made the fooles o'th' other Sences,
+Or else worth all the rest: I see thee still;
+And on thy Blade, and Dudgeon, Gouts of Blood,
+Which was not so before. There's no such thing:
+It is the bloody Businesse, which informes
+Thus to mine Eyes. Now o're the one halfe World
+Nature seemes dead, and wicked Dreames abuse
+The Curtain'd sleepe: Witchcraft celebrates
+Pale Heccats Offrings: and wither'd Murther,
+Alarum'd by his Centinell, the Wolfe,
+Whose howle's his Watch, thus with his stealthy pace,
+With Tarquins rauishing sides, towards his designe
+Moues like a Ghost. Thou sowre and firme-set Earth
+Heare not my steps, which they may walke, for feare
+Thy very stones prate of my where-about,
+And take the present horror from the time,
+Which now sutes with it. Whiles I threat, he liues:
+Words to the heat of deedes too cold breath giues.
+
+A Bell rings.
+
+I goe, and it is done: the Bell inuites me.
+Heare it not, Duncan, for it is a Knell,
+That summons thee to Heauen, or to Hell.
+Exit.
+
+
+Scena Secunda.
+
+Enter Lady.
+
+ La. That which hath made them drunk, hath made me bold:
+What hath quench'd them, hath giuen me fire.
+Hearke, peace: it was the Owle that shriek'd,
+The fatall Bell-man, which giues the stern'st good-night.
+He is about it, the Doores are open:
+And the surfeted Groomes doe mock their charge
+With Snores. I haue drugg'd their Possets,
+That Death and Nature doe contend about them,
+Whether they liue, or dye.
+Enter Macbeth.
+
+ Macb. Who's there? what hoa?
+ Lady. Alack, I am afraid they haue awak'd,
+And 'tis not done: th' attempt, and not the deed,
+Confounds vs: hearke: I lay'd their Daggers ready,
+He could not misse 'em. Had he not resembled
+My Father as he slept, I had don't.
+My Husband?
+ Macb. I haue done the deed:
+Didst thou not heare a noyse?
+ Lady. I heard the Owle schreame, and the Crickets cry.
+Did not you speake?
+ Macb. When?
+ Lady. Now
+
+ Macb. As I descended?
+ Lady. I
+
+ Macb. Hearke, who lyes i'th' second Chamber?
+ Lady. Donalbaine
+
+ Mac. This is a sorry sight
+
+ Lady. A foolish thought, to say a sorry sight
+
+ Macb. There's one did laugh in's sleepe,
+And one cry'd Murther, that they did wake each other:
+I stood, and heard them: But they did say their Prayers,
+And addrest them againe to sleepe
+
+ Lady. There are two lodg'd together
+
+ Macb. One cry'd God blesse vs, and Amen the other,
+As they had seene me with these Hangmans hands:
+Listning their feare, I could not say Amen,
+When they did say God blesse vs
+
+ Lady. Consider it not so deepely
+
+ Mac. But wherefore could not I pronounce Amen?
+I had most need of Blessing, and Amen stuck in my throat
+
+ Lady. These deeds must not be thought
+After these wayes: so, it will make vs mad
+
+ Macb. Me thought I heard a voyce cry, Sleep no more:
+Macbeth does murther Sleepe, the innocent Sleepe,
+Sleepe that knits vp the rauel'd Sleeue of Care,
+The death of each dayes Life, sore Labors Bath,
+Balme of hurt Mindes, great Natures second Course,
+Chiefe nourisher in Life's Feast
+
+ Lady. What doe you meane?
+ Macb. Still it cry'd, Sleepe no more to all the House:
+Glamis hath murther'd Sleepe, and therefore Cawdor
+Shall sleepe no more: Macbeth shall sleepe no more
+
+ Lady. Who was it, that thus cry'd? why worthy Thane,
+You doe vnbend your Noble strength, to thinke
+So braine-sickly of things: Goe get some Water,
+And wash this filthie Witnesse from your Hand.
+Why did you bring these Daggers from the place?
+They must lye there: goe carry them, and smeare
+The sleepie Groomes with blood
+
+ Macb. Ile goe no more:
+I am afraid, to thinke what I haue done:
+Looke on't againe, I dare not
+
+ Lady. Infirme of purpose:
+Giue me the Daggers: the sleeping, and the dead,
+Are but as Pictures: 'tis the Eye of Childhood,
+That feares a painted Deuill. If he doe bleed,
+Ile guild the Faces of the Groomes withall,
+For it must seeme their Guilt.
+Exit.
+
+Knocke within.
+
+ Macb. Whence is that knocking?
+How is't with me, when euery noyse appalls me?
+What Hands are here? hah: they pluck out mine Eyes.
+Will all great Neptunes Ocean wash this blood
+Cleane from my Hand? no: this my Hand will rather
+The multitudinous Seas incarnardine,
+Making the Greene one, Red.
+Enter Lady.
+
+ Lady. My Hands are of your colour: but I shame
+To weare a Heart so white.
+
+Knocke.
+
+I heare a knocking at the South entry:
+Retyre we to our Chamber:
+A little Water cleares vs of this deed.
+How easie is it then? your Constancie
+Hath left you vnattended.
+
+Knocke.
+
+Hearke, more knocking.
+Get on your Night-Gowne, least occasion call vs,
+And shew vs to be Watchers: be not lost
+So poorely in your thoughts
+
+ Macb. To know my deed,
+
+Knocke.
+
+'Twere best not know my selfe.
+Wake Duncan with thy knocking:
+I would thou could'st.
+
+Exeunt.
+
+
+Scena Tertia.
+
+Enter a Porter. Knocking within.
+
+ Porter. Here's a knocking indeede: if a man were
+Porter of Hell Gate, hee should haue old turning the
+Key.
+
+Knock.
+
+Knock, Knock, Knock. Who's there
+i'th' name of Belzebub? Here's a Farmer, that hang'd
+himselfe on th' expectation of Plentie: Come in time, haue
+Napkins enow about you, here you'le sweat for't.
+
+Knock.
+
+Knock, knock. Who's there in th' other Deuils Name?
+Faith here's an Equiuocator, that could sweare in both
+the Scales against eyther Scale, who committed Treason
+enough for Gods sake, yet could not equiuocate to Heauen:
+oh come in, Equiuocator.
+
+Knock.
+
+Knock, Knock, Knock. Who's there? 'Faith here's an English
+Taylor come hither, for stealing out of a French Hose:
+Come in Taylor, here you may rost your Goose.
+Knock.
+
+Knock, Knock. Neuer at quiet: What are you? but this
+place is too cold for Hell. Ile Deuill-Porter it no further:
+I had thought to haue let in some of all Professions, that
+goe the Primrose way to th' euerlasting Bonfire.
+
+Knock.
+
+Anon, anon, I pray you remember the Porter.
+Enter Macduff, and Lenox.
+
+ Macd. Was it so late, friend, ere you went to Bed,
+That you doe lye so late?
+ Port. Faith Sir, we were carowsing till the second Cock:
+And Drinke, Sir, is a great prouoker of three things
+
+ Macd. What three things does Drinke especially
+prouoke?
+ Port. Marry, Sir, Nose-painting, Sleepe, and Vrine.
+Lecherie, Sir, it prouokes, and vnprouokes: it prouokes
+the desire, but it takes away the performance. Therefore
+much Drinke may be said to be an Equiuocator with Lecherie:
+it makes him, and it marres him; it sets him on,
+and it takes him off; it perswades him, and dis-heartens
+him; makes him stand too, and not stand too: in conclusion,
+equiuocates him in a sleepe, and giuing him the Lye,
+leaues him
+
+ Macd. I beleeue, Drinke gaue thee the Lye last Night
+
+ Port. That it did, Sir, i'the very Throat on me: but I
+requited him for his Lye, and (I thinke) being too strong
+for him, though he tooke vp my Legges sometime, yet I
+made a Shift to cast him.
+Enter Macbeth.
+
+ Macd. Is thy Master stirring?
+Our knocking ha's awak'd him: here he comes
+
+ Lenox. Good morrow, Noble Sir
+
+ Macb. Good morrow both
+
+ Macd. Is the King stirring, worthy Thane?
+ Macb. Not yet
+
+ Macd. He did command me to call timely on him,
+I haue almost slipt the houre
+
+ Macb. Ile bring you to him
+
+ Macd. I know this is a ioyfull trouble to you:
+But yet 'tis one
+
+ Macb. The labour we delight in, Physicks paine:
+This is the Doore
+
+ Macd. Ile make so bold to call, for 'tis my limitted
+seruice.
+
+Exit Macduffe.
+
+ Lenox. Goes the King hence to day?
+ Macb. He does: he did appoint so
+
+ Lenox. The Night ha's been vnruly:
+Where we lay, our Chimneys were blowne downe,
+And (as they say) lamentings heard i'th' Ayre;
+Strange Schreemes of Death,
+And Prophecying, with Accents terrible,
+Of dyre Combustion, and confus'd Euents,
+New hatch'd toth' wofull time.
+The obscure Bird clamor'd the liue-long Night.
+Some say, the Earth was Feuorous,
+And did shake
+
+ Macb. 'Twas a rough Night
+
+ Lenox. My young remembrance cannot paralell
+A fellow to it.
+Enter Macduff.
+
+ Macd. O horror, horror, horror,
+Tongue nor Heart cannot conceiue, nor name thee
+
+ Macb. and Lenox. What's the matter?
+ Macd. Confusion now hath made his Master-peece:
+Most sacrilegious Murther hath broke ope
+The Lords anoynted Temple, and stole thence
+The Life o'th' Building
+
+ Macb. What is't you say, the Life?
+ Lenox. Meane you his Maiestie?
+ Macd. Approch the Chamber, and destroy your sight
+With a new Gorgon. Doe not bid me speake:
+See, and then speake your selues: awake, awake,
+
+Exeunt Macbeth and Lenox.
+
+Ring the Alarum Bell: Murther, and Treason,
+Banquo, and Donalbaine: Malcolme awake,
+Shake off this Downey sleepe, Deaths counterfeit,
+And looke on Death it selfe: vp, vp, and see
+The great Doomes Image: Malcolme, Banquo,
+As from your Graues rise vp, and walke like Sprights,
+To countenance this horror. Ring the Bell.
+
+Bell rings. Enter Lady.
+
+ Lady. What's the Businesse?
+That such a hideous Trumpet calls to parley
+The sleepers of the House? speake, speake
+
+ Macd. O gentle Lady,
+'Tis not for you to heare what I can speake:
+The repetition in a Womans eare,
+Would murther as it fell.
+Enter Banquo.
+
+O Banquo, Banquo, Our Royall Master's murther'd
+
+ Lady. Woe, alas:
+What, in our House?
+ Ban. Too cruell, any where.
+Deare Duff, I prythee contradict thy selfe,
+And say, it is not so.
+Enter Macbeth, Lenox, and Rosse.
+
+ Macb. Had I but dy'd an houre before this chance,
+I had liu'd a blessed time: for from this instant,
+There's nothing serious in Mortalitie:
+All is but Toyes: Renowne and Grace is dead,
+The Wine of Life is drawne, and the meere Lees
+Is left this Vault, to brag of.
+Enter Malcolme and Donalbaine.
+
+ Donal. What is amisse?
+ Macb. You are, and doe not know't:
+The Spring, the Head, the Fountaine of your Blood
+Is stopt, the very Source of it is stopt
+
+ Macd. Your Royall Father's murther'd
+
+ Mal. Oh, by whom?
+ Lenox. Those of his Chamber, as it seem'd, had don't:
+Their Hands and Faces were all badg'd with blood,
+So were their Daggers, which vnwip'd, we found
+Vpon their Pillowes: they star'd, and were distracted,
+No mans Life was to be trusted with them
+
+ Macb. O, yet I doe repent me of my furie,
+That I did kill them
+
+ Macd. Wherefore did you so?
+ Macb. Who can be wise, amaz'd, temp'rate, & furious,
+Loyall, and Neutrall, in a moment? No man:
+Th' expedition of my violent Loue
+Out-run the pawser, Reason. Here lay Duncan,
+His Siluer skinne, lac'd with His Golden Blood,
+And his gash'd Stabs, look'd like a Breach in Nature,
+For Ruines wastfull entrance: there the Murtherers,
+Steep'd in the Colours of their Trade; their Daggers
+Vnmannerly breech'd with gore: who could refraine,
+That had a heart to loue; and in that heart,
+Courage, to make's loue knowne?
+ Lady. Helpe me hence, hoa
+
+ Macd. Looke to the Lady
+
+ Mal. Why doe we hold our tongues,
+That most may clayme this argument for ours?
+ Donal. What should be spoken here,
+Where our Fate hid in an augure hole,
+May rush, and seize vs? Let's away,
+Our Teares are not yet brew'd
+
+ Mal. Nor our strong Sorrow
+Vpon the foot of Motion
+
+ Banq. Looke to the Lady:
+And when we haue our naked Frailties hid,
+That suffer in exposure; let vs meet,
+And question this most bloody piece of worke,
+To know it further. Feares and scruples shake vs:
+In the great Hand of God I stand, and thence,
+Against the vndivulg'd pretence, I fight
+Of Treasonous Mallice
+
+ Macd. And so doe I
+
+ All. So all
+
+ Macb. Let's briefely put on manly readinesse,
+And meet i'th' Hall together
+
+ All. Well contented.
+
+Exeunt.
+
+ Malc. What will you doe?
+Let's not consort with them:
+To shew an vnfelt Sorrow, is an Office
+Which the false man do's easie.
+Ile to England
+
+ Don. To Ireland, I:
+Our seperated fortune shall keepe vs both the safer:
+Where we are, there's Daggers in mens smiles;
+The neere in blood, the neerer bloody
+
+ Malc. This murtherous Shaft that's shot,
+Hath not yet lighted: and our safest way,
+Is to auoid the ayme. Therefore to Horse,
+And let vs not be daintie of leaue-taking,
+But shift away: there's warrant in that Theft,
+Which steales it selfe, when there's no mercie left.
+
+Exeunt.
+
+
+
+Scena Quarta.
+
+Enter Rosse, with an Old man.
+
+ Old man. Threescore and ten I can remember well,
+Within the Volume of which Time, I haue seene
+Houres dreadfull, and things strange: but this sore Night
+Hath trifled former knowings
+
+ Rosse. Ha, good Father,
+Thou seest the Heauens, as troubled with mans Act,
+Threatens his bloody Stage: byth' Clock 'tis Day,
+And yet darke Night strangles the trauailing Lampe:
+Is't Nights predominance, or the Dayes shame,
+That Darknesse does the face of Earth intombe,
+When liuing Light should kisse it?
+ Old man. 'Tis vnnaturall,
+Euen like the deed that's done: On Tuesday last,
+A Faulcon towring in her pride of place,
+Was by a Mowsing Owle hawkt at, and kill'd
+
+ Rosse. And Duncans Horses,
+(A thing most strange, and certaine)
+Beauteous, and swift, the Minions of their Race,
+Turn'd wilde in nature, broke their stalls, flong out,
+Contending 'gainst Obedience, as they would
+Make Warre with Mankinde
+
+ Old man. 'Tis said, they eate each other
+
+ Rosse. They did so:
+To th' amazement of mine eyes that look'd vpon't.
+Enter Macduffe.
+
+Heere comes the good Macduffe.
+How goes the world Sir, now?
+ Macd. Why see you not?
+ Ross. Is't known who did this more then bloody deed?
+ Macd. Those that Macbeth hath slaine
+
+ Ross. Alas the day,
+What good could they pretend?
+ Macd. They were subborned,
+Malcolme, and Donalbaine the Kings two Sonnes
+Are stolne away and fled, which puts vpon them
+Suspition of the deed
+
+ Rosse. 'Gainst Nature still,
+Thriftlesse Ambition, that will rauen vp
+Thine owne liues meanes: Then 'tis most like,
+The Soueraignty will fall vpon Macbeth
+
+ Macd. He is already nam'd, and gone to Scone
+To be inuested
+
+ Rosse. Where is Duncans body?
+ Macd. Carried to Colmekill,
+The Sacred Store-house of his Predecessors,
+And Guardian of their Bones
+
+ Rosse. Will you to Scone?
+ Macd. No Cosin, Ile to Fife
+
+ Rosse. Well, I will thither
+
+ Macd. Well may you see things wel done there: Adieu
+Least our old Robes sit easier then our new
+
+ Rosse. Farewell, Father
+
+ Old M. Gods benyson go with you, and with those
+That would make good of bad, and Friends of Foes.
+
+Exeunt omnes.
+
+Actus Tertius. Scena Prima.
+
+Enter Banquo.
+
+ Banq. Thou hast it now, King, Cawdor, Glamis, all,
+As the weyard Women promis'd, and I feare
+Thou playd'st most fowly for't: yet it was saide
+It should not stand in thy Posterity,
+But that my selfe should be the Roote, and Father
+Of many Kings. If there come truth from them,
+As vpon thee Macbeth, their Speeches shine,
+Why by the verities on thee made good,
+May they not be my Oracles as well,
+And set me vp in hope. But hush, no more.
+
+Senit sounded. Enter Macbeth as King, Lady Lenox, Rosse, Lords,
+and Attendants.
+
+ Macb. Heere's our chiefe Guest
+
+ La. If he had beene forgotten,
+It had bene as a gap in our great Feast,
+And all-thing vnbecomming
+
+ Macb. To night we hold a solemne Supper sir,
+And Ile request your presence
+
+ Banq. Let your Highnesse
+Command vpon me, to the which my duties
+Are with a most indissoluble tye
+For euer knit
+
+ Macb. Ride you this afternoone?
+ Ban. I, my good Lord
+
+ Macb. We should haue else desir'd your good aduice
+(Which still hath been both graue, and prosperous)
+In this dayes Councell: but wee'le take to morrow.
+Is't farre you ride?
+ Ban. As farre, my Lord, as will fill vp the time
+'Twixt this, and Supper. Goe not my Horse the better,
+I must become a borrower of the Night,
+For a darke houre, or twaine
+
+ Macb. Faile not our Feast
+
+ Ban. My Lord, I will not
+
+ Macb. We heare our bloody Cozens are bestow'd
+In England, and in Ireland, not confessing
+Their cruell Parricide, filling their hearers
+With strange inuention. But of that to morrow,
+When therewithall, we shall haue cause of State,
+Crauing vs ioyntly. Hye you to Horse:
+Adieu, till you returne at Night.
+Goes Fleance with you?
+ Ban. I, my good Lord: our time does call vpon's
+
+ Macb. I wish your Horses swift, and sure of foot:
+And so I doe commend you to their backs.
+Farwell.
+
+Exit Banquo.
+
+Let euery man be master of his time,
+Till seuen at Night, to make societie
+The sweeter welcome:
+We will keepe our selfe till Supper time alone:
+While then, God be with you.
+
+Exeunt Lords.
+
+Sirrha, a word with you: Attend those men
+Our pleasure?
+ Seruant. They are, my Lord, without the Pallace
+Gate
+
+ Macb. Bring them before vs.
+
+Exit Seruant.
+
+To be thus, is nothing, but to be safely thus
+Our feares in Banquo sticke deepe,
+And in his Royaltie of Nature reignes that
+Which would be fear'd. 'Tis much he dares,
+And to that dauntlesse temper of his Minde,
+He hath a Wisdome, that doth guide his Valour,
+To act in safetie. There is none but he,
+Whose being I doe feare: and vnder him,
+My Genius is rebuk'd, as it is said
+Mark Anthonies was by Caesar. He chid the Sisters,
+When first they put the Name of King vpon me,
+And bad them speake to him. Then Prophet-like,
+They hayl'd him Father to a Line of Kings.
+Vpon my Head they plac'd a fruitlesse Crowne,
+And put a barren Scepter in my Gripe,
+Thence to be wrencht with an vnlineall Hand,
+No Sonne of mine succeeding: if't be so,
+For Banquo's Issue haue I fil'd my Minde,
+For them, the gracious Duncan haue I murther'd,
+Put Rancours in the Vessell of my Peace
+Onely for them, and mine eternall Iewell
+Giuen to the common Enemie of Man,
+To make them Kings, the Seedes of Banquo Kings.
+Rather then so, come Fate into the Lyst,
+And champion me to th' vtterance.
+Who's there?
+Enter Seruant, and two Murtherers.
+
+Now goe to the Doore, and stay there till we call.
+
+Exit Seruant.
+
+Was it not yesterday we spoke together?
+ Murth. It was, so please your Highnesse
+
+ Macb. Well then,
+Now haue you consider'd of my speeches:
+Know, that it was he, in the times past,
+Which held you so vnder fortune,
+Which you thought had been our innocent selfe.
+This I made good to you, in our last conference,
+Past in probation with you:
+How you were borne in hand, how crost:
+The Instruments: who wrought with them:
+And all things else, that might
+To halfe a Soule, and to a Notion craz'd,
+Say, Thus did Banquo
+
+ 1.Murth. You made it knowne to vs
+
+ Macb. I did so:
+And went further, which is now
+Our point of second meeting.
+Doe you finde your patience so predominant,
+In your nature, that you can let this goe?
+Are you so Gospell'd, to pray for this good man,
+And for his Issue, whose heauie hand
+Hath bow'd you to the Graue, and begger'd
+Yours for euer?
+ 1.Murth. We are men, my Liege
+
+ Macb. I, in the Catalogue ye goe for men,
+As Hounds, and Greyhounds, Mungrels, Spaniels, Curres,
+Showghes, Water-Rugs, and Demy-Wolues are clipt
+All by the Name of Dogges: the valued file
+Distinguishes the swift, the slow, the subtle,
+The House-keeper, the Hunter, euery one
+According to the gift, which bounteous Nature
+Hath in him clos'd: whereby he does receiue
+Particular addition, from the Bill,
+That writes them all alike: and so of men.
+Now, if you haue a station in the file,
+Not i'th' worst ranke of Manhood, say't,
+And I will put that Businesse in your Bosomes,
+Whose execution takes your Enemie off,
+Grapples you to the heart; and loue of vs,
+Who weare our Health but sickly in his Life,
+Which in his Death were perfect
+
+ 2.Murth. I am one, my Liege,
+Whom the vile Blowes and Buffets of the World
+Hath so incens'd, that I am recklesse what I doe,
+To spight the World
+
+ 1.Murth. And I another,
+So wearie with Disasters, tugg'd with Fortune,
+That I would set my Life on any Chance,
+To mend it, or be rid on't
+
+ Macb. Both of you know Banquo was your Enemie
+
+ Murth. True, my Lord
+
+ Macb. So is he mine: and in such bloody distance,
+That euery minute of his being, thrusts
+Against my neer'st of Life: and though I could
+With bare-fac'd power sweepe him from my sight,
+And bid my will auouch it; yet I must not,
+For certaine friends that are both his, and mine,
+Whose loues I may not drop, but wayle his fall,
+Who I my selfe struck downe: and thence it is,
+That I to your assistance doe make loue,
+Masking the Businesse from the common Eye,
+For sundry weightie Reasons
+
+ 2.Murth. We shall, my Lord,
+Performe what you command vs
+
+ 1.Murth. Though our Liues-
+ Macb. Your Spirits shine through you.
+Within this houre, at most,
+I will aduise you where to plant your selues,
+Acquaint you with the perfect Spy o'th' time,
+The moment on't, for't must be done to Night,
+And something from the Pallace: alwayes thought,
+That I require a clearenesse; and with him,
+To leaue no Rubs nor Botches in the Worke:
+ Fleans , his Sonne, that keepes him companie,
+Whose absence is no lesse materiall to me,
+Then is his Fathers, must embrace the fate
+Of that darke houre: resolue your selues apart,
+Ile come to you anon
+
+ Murth. We are resolu'd, my Lord
+
+ Macb. Ile call vpon you straight: abide within,
+It is concluded: Banquo, thy Soules flight,
+If it finde Heauen, must finde it out to Night.
+
+Exeunt.
+
+
+Scena Secunda.
+
+Enter Macbeths Lady, and a Seruant.
+
+ Lady. Is Banquo gone from Court?
+ Seruant. I, Madame, but returnes againe to Night
+
+ Lady. Say to the King, I would attend his leysure,
+For a few words
+
+ Seruant. Madame, I will.
+Enter.
+
+ Lady. Nought's had, all's spent.
+Where our desire is got without content:
+'Tis safer, to be that which we destroy,
+Then by destruction dwell in doubtfull ioy.
+Enter Macbeth.
+
+How now, my Lord, why doe you keepe alone?
+Of sorryest Fancies your Companions making,
+Vsing those Thoughts, which should indeed haue dy'd
+With them they thinke on: things without all remedie
+Should be without regard: what's done, is done
+
+ Macb. We haue scorch'd the Snake, not kill'd it:
+Shee'le close, and be her selfe, whilest our poore Mallice
+Remaines in danger of her former Tooth.
+But let the frame of things dis-ioynt,
+Both the Worlds suffer,
+Ere we will eate our Meale in feare, and sleepe
+In the affliction of these terrible Dreames,
+That shake vs Nightly: Better be with the dead,
+Whom we, to gayne our peace, haue sent to peace,
+Then on the torture of the Minde to lye
+In restlesse extasie.
+Duncane is in his Graue:
+After Lifes fitfull Feuer, he sleepes well,
+Treason ha's done his worst: nor Steele, nor Poyson,
+Mallice domestique, forraine Leuie, nothing,
+Can touch him further
+
+ Lady. Come on:
+Gentle my Lord, sleeke o're your rugged Lookes,
+Be bright and Iouiall among your Guests to Night
+
+ Macb. So shall I Loue, and so I pray be you:
+Let your remembrance apply to Banquo,
+Present him Eminence, both with Eye and Tongue:
+Vnsafe the while, that wee must laue
+Our Honors in these flattering streames,
+And make our Faces Vizards to our Hearts,
+Disguising what they are
+
+ Lady. You must leaue this
+
+ Macb. O, full of Scorpions is my Minde, deare Wife:
+Thou know'st, that Banquo and his Fleans liues
+
+ Lady. But in them, Natures Coppie's not eterne
+
+ Macb. There's comfort yet, they are assaileable,
+Then be thou iocund: ere the Bat hath flowne
+His Cloyster'd flight, ere to black Heccats summons
+The shard-borne Beetle, with his drowsie hums,
+Hath rung Nights yawning Peale,
+There shall be done a deed of dreadfull note
+
+ Lady. What's to be done?
+ Macb. Be innocent of the knowledge, dearest Chuck,
+Till thou applaud the deed: Come, seeling Night,
+Skarfe vp the tender Eye of pittifull Day,
+And with thy bloodie and inuisible Hand
+Cancell and teare to pieces that great Bond,
+Which keepes me pale. Light thickens,
+And the Crow makes Wing toth' Rookie Wood:
+Good things of Day begin to droope, and drowse,
+Whiles Nights black Agents to their Prey's doe rowse.
+Thou maruell'st at my words: but hold thee still,
+Things bad begun, make strong themselues by ill:
+So prythee goe with me.
+
+Exeunt.
+
+
+Scena Tertia.
+
+Enter three Murtherers.
+
+ 1. But who did bid thee ioyne with vs?
+ 3. Macbeth
+
+ 2. He needes not our mistrust, since he deliuers
+Our Offices, and what we haue to doe,
+To the direction iust
+
+ 1. Then stand with vs:
+The West yet glimmers with some streakes of Day.
+Now spurres the lated Traueller apace,
+To gayne the timely Inne, and neere approches
+The subiect of our Watch
+
+ 3. Hearke, I heare Horses
+
+ Banquo within. Giue vs a Light there, hoa
+
+ 2. Then 'tis hee:
+The rest, that are within the note of expectation,
+Alreadie are i'th' Court
+
+ 1. His Horses goe about
+
+ 3. Almost a mile: but he does vsually,
+So all men doe, from hence toth' Pallace Gate
+Make it their Walke.
+Enter Banquo and Fleans, with a Torch.
+
+ 2. A Light, a Light
+
+ 3. 'Tis hee
+
+ 1. Stand too't
+
+ Ban. It will be Rayne to Night
+
+ 1. Let it come downe
+
+ Ban. O, Trecherie!
+Flye good Fleans, flye, flye, flye,
+Thou may'st reuenge. O Slaue!
+ 3. Who did strike out the Light?
+ 1. Was't not the way?
+ 3. There's but one downe: the Sonne is fled
+
+ 2. We haue lost
+Best halfe of our Affaire
+
+ 1. Well, let's away, and say how much is done.
+
+Exeunt.
+
+
+Scaena Quarta.
+
+Banquet prepar'd. Enter Macbeth, Lady, Rosse, Lenox, Lords, and
+Attendants.
+
+ Macb. You know your owne degrees, sit downe:
+At first and last, the hearty welcome
+
+ Lords. Thankes to your Maiesty
+
+ Macb. Our selfe will mingle with Society,
+And play the humble Host:
+Our Hostesse keepes her State, but in best time
+We will require her welcome
+
+ La. Pronounce it for me Sir, to all our Friends,
+For my heart speakes, they are welcome.
+Enter first Murtherer.
+
+ Macb. See they encounter thee with their harts thanks
+Both sides are euen: heere Ile sit i'th' mid'st,
+Be large in mirth, anon wee'l drinke a Measure
+The Table round. There's blood vpon thy face
+
+ Mur. 'Tis Banquo's then
+
+ Macb. 'Tis better thee without, then he within.
+Is he dispatch'd?
+ Mur. My Lord his throat is cut, that I did for him
+
+ Mac. Thou art the best o'th' Cut-throats,
+Yet hee's good that did the like for Fleans:
+If thou did'st it, thou art the Non-pareill
+
+ Mur. Most Royall Sir
+Fleans is scap'd
+
+ Macb. Then comes my Fit againe:
+I had else beene perfect;
+Whole as the Marble, founded as the Rocke,
+As broad, and generall, as the casing Ayre:
+But now I am cabin'd, crib'd, confin'd, bound in
+To sawcy doubts, and feares. But Banquo's safe?
+ Mur. I, my good Lord: safe in a ditch he bides,
+With twenty trenched gashes on his head;
+The least a Death to Nature
+
+ Macb. Thankes for that:
+There the growne Serpent lyes, the worme that's fled
+Hath Nature that in time will Venom breed,
+No teeth for th' present. Get thee gone, to morrow
+Wee'l heare our selues againe.
+
+Exit Murderer.
+
+ Lady. My Royall Lord,
+You do not giue the Cheere, the Feast is sold
+That is not often vouch'd, while 'tis a making:
+'Tis giuen, with welcome: to feede were best at home:
+From thence, the sawce to meate is Ceremony,
+Meeting were bare without it.
+Enter the Ghost of Banquo, and sits in Macbeths place.
+
+ Macb. Sweet Remembrancer:
+Now good digestion waite on Appetite,
+And health on both
+
+ Lenox. May't please your Highnesse sit
+
+ Macb. Here had we now our Countries Honor, roof'd,
+Were the grac'd person of our Banquo present:
+Who, may I rather challenge for vnkindnesse,
+Then pitty for Mischance
+
+ Rosse. His absence (Sir)
+Layes blame vpon his promise. Pleas't your Highnesse
+To grace vs with your Royall Company?
+ Macb. The Table's full
+
+ Lenox. Heere is a place reseru'd Sir
+
+ Macb. Where?
+ Lenox. Heere my good Lord.
+What is't that moues your Highnesse?
+ Macb. Which of you haue done this?
+ Lords. What, my good Lord?
+ Macb. Thou canst not say I did it: neuer shake
+Thy goary lockes at me
+
+ Rosse. Gentlemen rise, his Highnesse is not well
+
+ Lady. Sit worthy Friends: my Lord is often thus,
+And hath beene from his youth. Pray you keepe Seat,
+The fit is momentary, vpon a thought
+He will againe be well. If much you note him
+You shall offend him, and extend his Passion,
+Feed, and regard him not. Are you a man?
+ Macb. I, and a bold one, that dare looke on that
+Which might appall the Diuell
+
+ La. O proper stuffe:
+This is the very painting of your feare:
+This is the Ayre-drawne-Dagger which you said
+Led you to Duncan. O, these flawes and starts
+(Impostors to true feare) would well become
+A womans story, at a Winters fire
+Authoriz'd by her Grandam: shame it selfe,
+Why do you make such faces? When all's done
+You looke but on a stoole
+
+ Macb. Prythee see there:
+Behold, looke, loe, how say you:
+Why what care I, if thou canst nod, speake too.
+If Charnell houses, and our Graues must send
+Those that we bury, backe; our Monuments
+Shall be the Mawes of Kytes
+
+ La. What? quite vnmann'd in folly
+
+ Macb. If I stand heere, I saw him
+
+ La. Fie for shame
+
+ Macb. Blood hath bene shed ere now, i'th' olden time
+Ere humane Statute purg'd the gentle Weale:
+I, and since too, Murthers haue bene perform'd
+Too terrible for the eare. The times has bene,
+That when the Braines were out, the man would dye,
+And there an end: But now they rise againe
+With twenty mortall murthers on their crownes,
+And push vs from our stooles. This is more strange
+Then such a murther is
+
+ La. My worthy Lord
+Your Noble Friends do lacke you
+
+ Macb. I do forget:
+Do not muse at me my most worthy Friends,
+I haue a strange infirmity, which is nothing
+To those that know me. Come, loue and health to all,
+Then Ile sit downe: Giue me some Wine, fill full:
+Enter Ghost.
+
+I drinke to th' generall ioy o'th' whole Table,
+And to our deere Friend Banquo, whom we misse:
+Would he were heere: to all, and him we thirst,
+And all to all
+
+ Lords. Our duties, and the pledge
+
+ Mac. Auant, & quit my sight, let the earth hide thee:
+Thy bones are marrowlesse, thy blood is cold:
+Thou hast no speculation in those eyes
+Which thou dost glare with
+
+ La. Thinke of this good Peeres
+But as a thing of Custome: 'Tis no other,
+Onely it spoyles the pleasure of the time
+
+ Macb. What man dare, I dare:
+Approach thou like the rugged Russian Beare,
+The arm'd Rhinoceros, or th' Hircan Tiger,
+Take any shape but that, and my firme Nerues
+Shall neuer tremble. Or be aliue againe,
+And dare me to the Desart with thy Sword:
+If trembling I inhabit then, protest mee
+The Baby of a Girle. Hence horrible shadow,
+Vnreall mock'ry hence. Why so, being gone
+I am a man againe: pray you sit still
+
+ La. You haue displac'd the mirth,
+Broke the good meeting, with most admir'd disorder
+
+ Macb. Can such things be,
+And ouercome vs like a Summers Clowd,
+Without our speciall wonder? You make me strange
+Euen to the disposition that I owe,
+When now I thinke you can behold such sights,
+And keepe the naturall Rubie of your Cheekes,
+When mine is blanch'd with feare
+
+ Rosse. What sights, my Lord?
+ La. I pray you speake not: he growes worse & worse
+Question enrages him: at once, goodnight.
+Stand not vpon the order of your going,
+But go at once
+
+ Len. Good night, and better health
+Attend his Maiesty
+
+ La. A kinde goodnight to all.
+
+Exit Lords.
+
+ Macb. It will haue blood they say:
+Blood will haue Blood:
+Stones haue beene knowne to moue, & Trees to speake:
+Augures, and vnderstood Relations, haue
+By Maggot Pyes, & Choughes, & Rookes brought forth
+The secret'st man of Blood. What is the night?
+ La. Almost at oddes with morning, which is which
+
+ Macb. How say'st thou that Macduff denies his person
+At our great bidding
+
+ La. Did you send to him Sir?
+ Macb. I heare it by the way: But I will send:
+There's not a one of them but in his house
+I keepe a Seruant Feed. I will to morrow
+(And betimes I will) to the weyard Sisters.
+More shall they speake: for now I am bent to know
+By the worst meanes, the worst, for mine owne good,
+All causes shall giue way. I am in blood
+Stept in so farre, that should I wade no more,
+Returning were as tedious as go ore:
+Strange things I haue in head, that will to hand,
+Which must be acted, ere they may be scand
+
+ La. You lacke the season of all Natures, sleepe
+
+ Macb. Come, wee'l to sleepe: My strange & self-abuse
+Is the initiate feare, that wants hard vse:
+We are yet but yong indeed.
+
+Exeunt.
+
+
+Scena Quinta.
+
+Thunder. Enter the three Witches, meeting Hecat.
+
+ 1. Why how now Hecat, you looke angerly?
+ Hec. Haue I not reason (Beldams) as you are?
+Sawcy, and ouer-bold, how did you dare
+To Trade, and Trafficke with Macbeth,
+In Riddles, and Affaires of death;
+And I the Mistris of your Charmes,
+The close contriuer of all harmes,
+Was neuer call'd to beare my part,
+Or shew the glory of our Art?
+And which is worse, all you haue done
+Hath bene but for a wayward Sonne,
+Spightfull, and wrathfull, who (as others do)
+Loues for his owne ends, not for you.
+But make amends now: Get you gon,
+And at the pit of Acheron
+Meete me i'th' Morning: thither he
+Will come, to know his Destinie.
+Your Vessels, and your Spels prouide,
+Your Charmes, and euery thing beside;
+I am for th' Ayre: This night Ile spend
+Vnto a dismall, and a Fatall end.
+Great businesse must be wrought ere Noone.
+Vpon the Corner of the Moone
+There hangs a vap'rous drop, profound,
+Ile catch it ere it come to ground;
+And that distill'd by Magicke slights,
+Shall raise such Artificiall Sprights,
+As by the strength of their illusion,
+Shall draw him on to his Confusion.
+He shall spurne Fate, scorne Death, and beare
+His hopes 'boue Wisedome, Grace, and Feare:
+And you all know, Security
+Is Mortals cheefest Enemie.
+
+Musicke, and a Song.
+
+Hearke, I am call'd: my little Spirit see
+Sits in Foggy cloud, and stayes for me.
+
+Sing within. Come away, come away, &c.
+
+ 1 Come, let's make hast, shee'l soone be
+Backe againe.
+
+Exeunt.
+
+
+Scaena Sexta.
+
+Enter Lenox, and another Lord.
+
+ Lenox. My former Speeches,
+Haue but hit your Thoughts
+Which can interpret farther: Onely I say
+Things haue bin strangely borne. The gracious Duncan
+Was pittied of Macbeth: marry he was dead:
+And the right valiant Banquo walk'd too late,
+Whom you may say (if't please you) Fleans kill'd,
+For Fleans fled: Men must not walke too late.
+Who cannot want the thought, how monstrous
+It was for Malcolme, and for Donalbane
+To kill their gracious Father? Damned Fact,
+How it did greeue Macbeth? Did he not straight
+In pious rage, the two delinquents teare,
+That were the Slaues of drinke, and thralles of sleepe?
+Was not that Nobly done? I, and wisely too:
+For 'twould haue anger'd any heart aliue
+To heare the men deny't. So that I say,
+He ha's borne all things well, and I do thinke,
+That had he Duncans Sonnes vnder his Key,
+(As, and't please Heauen he shall not) they should finde
+What 'twere to kill a Father: So should Fleans.
+But peace; for from broad words, and cause he fayl'd
+His presence at the Tyrants Feast, I heare
+Macduffe liues in disgrace. Sir, can you tell
+Where he bestowes himselfe?
+ Lord. The Sonnes of Duncane
+(From whom this Tyrant holds the due of Birth)
+Liues in the English Court, and is receyu'd
+Of the most Pious Edward, with such grace,
+That the maleuolence of Fortune, nothing
+Takes from his high respect. Thither Macduffe
+Is gone, to pray the Holy King, vpon his ayd
+To wake Northumberland, and warlike Seyward,
+That by the helpe of these (with him aboue)
+To ratifie the Worke) we may againe
+Giue to our Tables meate, sleepe to our Nights:
+Free from our Feasts, and Banquets bloody kniues;
+Do faithfull Homage, and receiue free Honors,
+All which we pine for now. And this report
+Hath so exasperate their King, that hee
+Prepares for some attempt of Warre
+
+ Len. Sent he to Macduffe?
+ Lord. He did: and with an absolute Sir, not I
+The clowdy Messenger turnes me his backe,
+And hums; as who should say, you'l rue the time
+That clogges me with this Answer
+
+ Lenox. And that well might
+Aduise him to a Caution, t' hold what distance
+His wisedome can prouide. Some holy Angell
+Flye to the Court of England, and vnfold
+His Message ere he come, that a swift blessing
+May soone returne to this our suffering Country,
+Vnder a hand accurs'd
+
+ Lord. Ile send my Prayers with him.
+
+Exeunt.
+
+Actus Quartus. Scena Prima.
+
+Thunder. Enter the three Witches.
+
+ 1 Thrice the brinded Cat hath mew'd
+
+ 2 Thrice, and once the Hedge-Pigge whin'd
+
+ 3 Harpier cries, 'tis time, 'tis time
+
+ 1 Round about the Caldron go:
+In the poysond Entrailes throw
+Toad, that vnder cold stone,
+Dayes and Nights, ha's thirty one:
+Sweltred Venom sleeping got,
+Boyle thou first i'th' charmed pot
+
+ All. Double, double, toile and trouble;
+Fire burne, and Cauldron bubble
+
+ 2 Fillet of a Fenny Snake,
+In the Cauldron boyle and bake:
+Eye of Newt, and Toe of Frogge,
+Wooll of Bat, and Tongue of Dogge:
+Adders Forke, and Blinde-wormes Sting,
+Lizards legge, and Howlets wing:
+For a Charme of powrefull trouble,
+Like a Hell-broth, boyle and bubble
+
+ All. Double, double, toyle and trouble,
+Fire burne, and Cauldron bubble
+
+ 3 Scale of Dragon, Tooth of Wolfe,
+Witches Mummey, Maw, and Gulfe
+Of the rauin'd salt Sea sharke:
+Roote of Hemlocke, digg'd i'th' darke:
+Liuer of Blaspheming Iew,
+Gall of Goate, and Slippes of Yew,
+Sliuer'd in the Moones Ecclipse:
+Nose of Turke, and Tartars lips:
+Finger of Birth-strangled Babe,
+Ditch-deliuer'd by a Drab,
+Make the Grewell thicke, and slab.
+Adde thereto a Tigers Chawdron,
+For th' Ingredience of our Cawdron
+
+ All. Double, double, toyle and trouble,
+Fire burne, and Cauldron bubble
+
+ 2 Coole it with a Baboones blood,
+Then the Charme is firme and good.
+Enter Hecat, and the other three Witches.
+
+ Hec. O well done: I commend your paines,
+And euery one shall share i'th' gaines:
+And now about the Cauldron sing
+Like Elues and Fairies in a Ring,
+Inchanting all that you put in.
+
+Musicke and a Song. Blacke Spirits, &c.
+
+ 2 By the pricking of my Thumbes,
+Something wicked this way comes:
+Open Lockes, who euer knockes.
+Enter Macbeth.
+
+ Macb. How now you secret, black, & midnight Hags?
+What is't you do?
+ All. A deed without a name
+
+ Macb. I coniure you, by that which you Professe,
+(How ere you come to know it) answer me:
+Though you vntye the Windes, and let them fight
+Against the Churches: Though the yesty Waues
+Confound and swallow Nauigation vp:
+Though bladed Corne be lodg'd, & Trees blown downe,
+Though Castles topple on their Warders heads:
+Though Pallaces, and Pyramids do slope
+Their heads to their Foundations: Though the treasure
+Of Natures Germaine, tumble altogether,
+Euen till destruction sicken: Answer me
+To what I aske you
+
+ 1 Speake
+
+ 2 Demand
+
+ 3 Wee'l answer
+
+ 1 Say, if th'hadst rather heare it from our mouthes,
+Or from our Masters
+
+ Macb. Call 'em: let me see 'em
+
+ 1 Powre in Sowes blood, that hath eaten
+Her nine Farrow: Greaze that's sweaten
+From the Murderers Gibbet, throw
+Into the Flame
+
+ All. Come high or low:
+Thy Selfe and Office deaftly show.
+Thunder. 1. Apparation, an Armed Head.
+
+ Macb. Tell me, thou vnknowne power
+
+ 1 He knowes thy thought:
+Heare his speech, but say thou nought
+
+ 1 Appar. Macbeth, Macbeth, Macbeth:
+Beware Macduffe,
+Beware the Thane of Fife: dismisse me. Enough.
+
+He Descends.
+
+ Macb. What ere thou art, for thy good caution, thanks
+Thou hast harp'd my feare aright. But one word more
+
+ 1 He will not be commanded: heere's another
+More potent then the first.
+
+Thunder. 2 Apparition, a Bloody Childe.
+
+ 2 Appar. Macbeth, Macbeth, Macbeth
+
+ Macb. Had I three eares, Il'd heare thee
+
+ Appar. Be bloody, bold, & resolute:
+Laugh to scorne
+The powre of man: For none of woman borne
+Shall harme Macbeth.
+
+Descends.
+
+ Mac. Then liue Macduffe: what need I feare of thee?
+But yet Ile make assurance: double sure,
+And take a Bond of Fate: thou shalt not liue,
+That I may tell pale-hearted Feare, it lies;
+And sleepe in spight of Thunder.
+
+Thunder 3 Apparation, a Childe Crowned, with a Tree in his hand.
+
+What is this, that rises like the issue of a King,
+And weares vpon his Baby-brow, the round
+And top of Soueraignty?
+ All. Listen, but speake not too't
+
+ 3 Appar. Be Lyon metled, proud, and take no care:
+Who chafes, who frets, or where Conspirers are:
+Macbeth shall neuer vanquish'd be, vntill
+Great Byrnam Wood, to high Dunsmane Hill
+Shall come against him.
+
+Descend.
+
+ Macb. That will neuer bee:
+Who can impresse the Forrest, bid the Tree
+Vnfixe his earth-bound Root? Sweet boadments, good:
+Rebellious dead, rise neuer till the Wood
+Of Byrnan rise, and our high plac'd Macbeth
+Shall liue the Lease of Nature, pay his breath
+To time, and mortall Custome. Yet my Hart
+Throbs to know one thing: Tell me, if your Art
+Can tell so much: Shall Banquo's issue euer
+Reigne in this Kingdome?
+ All. Seeke to know no more
+
+ Macb. I will be satisfied. Deny me this,
+And an eternall Curse fall on you: Let me know.
+Why sinkes that Caldron? & what noise is this?
+
+Hoboyes
+
+ 1 Shew
+
+ 2 Shew
+
+ 3 Shew
+
+ All. Shew his Eyes, and greeue his Hart,
+Come like shadowes, so depart.
+
+A shew of eight Kings, and Banquo last, with a glasse in his hand.
+
+ Macb. Thou art too like the Spirit of Banquo: Down:
+Thy Crowne do's seare mine Eye-bals. And thy haire
+Thou other Gold-bound-brow, is like the first:
+A third, is like the former. Filthy Hagges,
+Why do you shew me this? - A fourth? Start eyes!
+What will the Line stretch out to'th' cracke of Doome?
+Another yet? A seauenth? Ile see no more:
+And yet the eighth appeares, who beares a glasse,
+Which shewes me many more: and some I see,
+That two-fold Balles, and trebble Scepters carry.
+Horrible sight: Now I see 'tis true,
+For the Blood-bolter'd Banquo smiles vpon me,
+And points at them for his. What? is this so?
+ 1 I Sir, all this is so. But why
+Stands Macbeth thus amazedly?
+Come Sisters, cheere we vp his sprights,
+And shew the best of our delights.
+Ile Charme the Ayre to giue a sound,
+While you performe your Antique round:
+That this great King may kindly say,
+Our duties, did his welcome pay.
+
+Musicke. The Witches Dance, and vanish.
+
+ Macb. Where are they? Gone?
+Let this pernitious houre,
+Stand aye accursed in the Kalender.
+Come in, without there.
+Enter Lenox.
+
+ Lenox. What's your Graces will
+
+ Macb. Saw you the Weyard Sisters?
+ Lenox. No my Lord
+
+ Macb. Came they not by you?
+ Lenox. No indeed my Lord
+
+ Macb. Infected be the Ayre whereon they ride,
+And damn'd all those that trust them. I did heare
+The gallopping of Horse. Who was't came by?
+ Len. 'Tis two or three my Lord, that bring you word:
+Macduff is fled to England
+
+ Macb. Fled to England?
+ Len. I, my good Lord
+
+ Macb. Time, thou anticipat'st my dread exploits:
+The flighty purpose neuer is o're-tooke
+Vnlesse the deed go with it. From this moment,
+The very firstlings of my heart shall be
+The firstlings of my hand. And euen now
+To Crown my thoughts with Acts: be it thoght & done:
+The Castle of Macduff, I will surprize.
+Seize vpon Fife; giue to th' edge o'th' Sword
+His Wife, his Babes, and all vnfortunate Soules
+That trace him in his Line. No boasting like a Foole,
+This deed Ile do, before this purpose coole,
+But no more sights. Where are these Gentlemen?
+Come bring me where they are.
+
+Exeunt.
+
+Scena Secunda.
+
+Enter Macduffes Wife, her Son, and Rosse.
+
+ Wife. What had he done, to make him fly the Land?
+ Rosse. You must haue patience Madam
+
+ Wife. He had none:
+His flight was madnesse: when our Actions do not,
+Our feares do make vs Traitors
+
+ Rosse. You know not
+Whether it was his wisedome, or his feare
+
+ Wife. Wisedom? to leaue his wife, to leaue his Babes,
+His Mansion, and his Titles, in a place
+From whence himselfe do's flye? He loues vs not,
+He wants the naturall touch. For the poore Wren
+(The most diminitiue of Birds) will fight,
+Her yong ones in her N
<TRUNCATED>
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/resources/sort_by_value.txt
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/resources/sort_by_value.txt b/crunch-core/src/it/resources/sort_by_value.txt
new file mode 100644
index 0000000..73f7d11
--- /dev/null
+++ b/crunch-core/src/it/resources/sort_by_value.txt
@@ -0,0 +1,5 @@
+A 2
+B 1
+C 3
+D 2
+E 1
[42/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/MapsIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/MapsIT.java b/crunch-core/src/it/java/org/apache/crunch/MapsIT.java
new file mode 100644
index 0000000..5b3187b
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/MapsIT.java
@@ -0,0 +1,101 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.hamcrest.Matchers.is;
+import static org.junit.Assert.assertThat;
+
+import java.util.Map;
+
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.crunch.types.avro.AvroTypeFamily;
+import org.apache.crunch.types.writable.WritableTypeFamily;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Maps;
+
+/**
+ * Integration test for map-valued records. It builds a first-letter bigram count over
+ * {@code shakes.txt} and checks one known count, once for each type family.
+ */
+public class MapsIT {
+  @Rule
+  public TemporaryPath tmpDir = TemporaryPaths.create();
+
+  @Test
+  public void testWritables() throws Exception {
+    run(WritableTypeFamily.getInstance(), tmpDir);
+  }
+
+  @Test
+  public void testAvros() throws Exception {
+    run(AvroTypeFamily.getInstance(), tmpDir);
+  }
+
+  /**
+   * Runs the bigram pipeline with the given type family and asserts that the record keyed
+   * {@code "k"} maps {@code "n"} to a count of 8.
+   *
+   * <p>Each input line is lower-cased and split on {@code \W+}. For every pair of consecutive
+   * non-empty words on the same line, the pipeline emits the first character of the earlier
+   * word as the key and a single-entry map from the first character of the later word to 1.
+   * The values are grouped by key and the per-letter counts are summed by a combiner before
+   * the result is materialized.
+   *
+   * @param typeFamily supplies the PTypes for the table of strings to maps of longs
+   * @param tmpDir supplies the job configuration and the local copy of {@code shakes.txt}
+   * @throws Exception propagated unchanged from the calls made in this method
+   */
+  public static void run(PTypeFamily typeFamily, TemporaryPath tmpDir) throws Exception {
+    Pipeline pipeline = new MRPipeline(MapsIT.class, tmpDir.getDefaultConfiguration());
+    String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
+    PCollection<String> shakespeare = pipeline.readTextFile(shakesInputPath);
+    Iterable<Pair<String, Map<String, Long>>> output = shakespeare
+        .parallelDo(new DoFn<String, Pair<String, Map<String, Long>>>() {
+          @Override
+          public void process(String input, Emitter<Pair<String, Map<String, Long>>> emitter) {
+            // `last` starts out null for every line, so pairs never span two input lines.
+            String last = null;
+            for (String word : input.toLowerCase().split("\\W+")) {
+              if (!word.isEmpty()) {
+                String firstChar = word.substring(0, 1);
+                if (last != null) {
+                  Map<String, Long> cc = ImmutableMap.of(firstChar, 1L);
+                  emitter.emit(Pair.of(last, cc));
+                }
+                last = firstChar;
+              }
+            }
+          }
+        }, typeFamily.tableOf(typeFamily.strings(), typeFamily.maps(typeFamily.longs()))).groupByKey()
+        .combineValues(new CombineFn<String, Map<String, Long>>() {
+          @Override
+          public void process(Pair<String, Iterable<Map<String, Long>>> input,
+              Emitter<Pair<String, Map<String, Long>>> emitter) {
+            // Sum the counts of every incoming map into one aggregate map per key.
+            Map<String, Long> agg = Maps.newHashMap();
+            for (Map<String, Long> in : input.second()) {
+              for (Map.Entry<String, Long> e : in.entrySet()) {
+                if (!agg.containsKey(e.getKey())) {
+                  agg.put(e.getKey(), e.getValue());
+                } else {
+                  agg.put(e.getKey(), e.getValue() + agg.get(e.getKey()));
+                }
+              }
+            }
+            emitter.emit(Pair.of(input.first(), agg));
+          }
+        }).materialize();
+
+    boolean passed = false;
+    for (Pair<String, Map<String, Long>> v : output) {
+      // Compare with equals() rather than `== 8L`: get("n") returns a boxed Long that is null
+      // when the key is absent, and unboxing it would throw a NullPointerException instead of
+      // letting the assertion below report the failure.
+      if (v.first().equals("k") && Long.valueOf(8L).equals(v.second().get("n"))) {
+        passed = true;
+        break;
+      }
+    }
+    pipeline.done();
+
+    assertThat(passed, is(true));
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/MaterializeIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/MaterializeIT.java b/crunch-core/src/it/java/org/apache/crunch/MaterializeIT.java
new file mode 100644
index 0000000..d064993
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/MaterializeIT.java
@@ -0,0 +1,169 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static junit.framework.Assert.assertEquals;
+import static junit.framework.Assert.assertTrue;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.crunch.fn.FilterFns;
+import org.apache.crunch.impl.mem.MemPipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.test.Person;
+import org.apache.crunch.test.StringWrapper;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.crunch.types.avro.AvroTypeFamily;
+import org.apache.crunch.types.avro.Avros;
+import org.apache.crunch.types.writable.WritableTypeFamily;
+import org.junit.Assume;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Integration tests for {@code PCollection.materialize()} on the MapReduce and
+ * in-memory pipelines, using the Writable and Avro type families.
+ */
+public class MaterializeIT {
+
+  @Rule
+  public TemporaryPath tmpDir = TemporaryPaths.create();
+
+  @Test
+  public void testMaterializeInput_Writables() throws IOException {
+    runMaterializeInput(new MRPipeline(MaterializeIT.class, tmpDir.getDefaultConfiguration()),
+        WritableTypeFamily.getInstance());
+  }
+
+  @Test
+  public void testMaterializeInput_Avro() throws IOException {
+    runMaterializeInput(new MRPipeline(MaterializeIT.class, tmpDir.getDefaultConfiguration()),
+        AvroTypeFamily.getInstance());
+  }
+
+  @Test
+  public void testMaterializeInput_InMemoryWritables() throws IOException {
+    runMaterializeInput(MemPipeline.getInstance(), WritableTypeFamily.getInstance());
+  }
+
+  @Test
+  public void testMaterializeInput_InMemoryAvro() throws IOException {
+    runMaterializeInput(MemPipeline.getInstance(), AvroTypeFamily.getInstance());
+  }
+
+  @Test
+  public void testMaterializeEmptyIntermediate_Writables() throws IOException {
+    runMaterializeEmptyIntermediate(
+        new MRPipeline(MaterializeIT.class, tmpDir.getDefaultConfiguration()),
+        WritableTypeFamily.getInstance());
+  }
+
+  @Test
+  public void testMaterializeEmptyIntermediate_Avro() throws IOException {
+    runMaterializeEmptyIntermediate(
+        new MRPipeline(MaterializeIT.class, tmpDir.getDefaultConfiguration()),
+        AvroTypeFamily.getInstance());
+  }
+
+  @Test
+  public void testMaterializeEmptyIntermediate_InMemoryWritables() throws IOException {
+    runMaterializeEmptyIntermediate(MemPipeline.getInstance(), WritableTypeFamily.getInstance());
+  }
+
+  @Test
+  public void testMaterializeEmptyIntermediate_InMemoryAvro() throws IOException {
+    runMaterializeEmptyIntermediate(MemPipeline.getInstance(), AvroTypeFamily.getInstance());
+  }
+
+  /**
+   * Reads {@code set1.txt} through {@code pipeline} and asserts that the
+   * materialized lines equal the expected content, in order.
+   *
+   * @param pipeline the pipeline under test; {@code done()} is called on it
+   * @param typeFamily currently unused by this helper
+   */
+  public void runMaterializeInput(Pipeline pipeline, PTypeFamily typeFamily) throws IOException {
+    List<String> expectedContent = Lists.newArrayList("b", "c", "a", "e");
+    String inputPath = tmpDir.copyResourceFileName("set1.txt");
+
+    PCollection<String> lines = pipeline.readTextFile(inputPath);
+    assertEquals(expectedContent, Lists.newArrayList(lines.materialize()));
+    pipeline.done();
+  }
+
+  /**
+   * Filters every line of {@code set1.txt} out with {@code REJECT_ALL} and
+   * asserts that materializing the result yields no elements.
+   *
+   * @param pipeline the pipeline under test; {@code done()} is called on it
+   * @param typeFamily currently unused by this helper
+   */
+  public void runMaterializeEmptyIntermediate(Pipeline pipeline, PTypeFamily typeFamily)
+      throws IOException {
+    String inputPath = tmpDir.copyResourceFileName("set1.txt");
+    PCollection<String> empty = pipeline.readTextFile(inputPath).filter(FilterFns.<String>REJECT_ALL());
+
+    assertTrue(Lists.newArrayList(empty.materialize()).isEmpty());
+    pipeline.done();
+  }
+
+  /**
+   * Maps an input string to a pair of a {@link StringWrapper} constructed from
+   * that string and a {@link Person} whose name is that string.
+   */
+  static class StringToStringWrapperPersonPairMapFn extends MapFn<String, Pair<StringWrapper, Person>> {
+
+    @Override
+    public Pair<StringWrapper, Person> map(String input) {
+      Person person = new Person();
+      person.name = input;
+      person.age = 42;
+      person.siblingnames = Lists.<CharSequence> newArrayList();
+      return Pair.of(new StringWrapper(input), person);
+    }
+
+  }
+
+  /**
+   * Materializes pairs built from {@code Avros.reflects(StringWrapper.class)}
+   * and {@code Avros.records(Person.class)}; the test is skipped unless
+   * {@code Avros.CAN_COMBINE_SPECIFIC_AND_REFLECT_SCHEMAS} is true.
+   */
+  @Test
+  public void testMaterializeAvroPersonAndReflectsPair_GroupedTable() throws IOException {
+    Assume.assumeTrue(Avros.CAN_COMBINE_SPECIFIC_AND_REFLECT_SCHEMAS);
+    // Build the pipeline from tmpDir's configuration, as every other MR test
+    // in this class does, instead of an implicit default configuration.
+    Pipeline pipeline = new MRPipeline(MaterializeIT.class, tmpDir.getDefaultConfiguration());
+    List<Pair<StringWrapper, Person>> pairList = Lists.newArrayList(pipeline
+        .readTextFile(tmpDir.copyResourceFileName("set1.txt"))
+        .parallelDo(new StringToStringWrapperPersonPairMapFn(),
+            Avros.pairs(Avros.reflects(StringWrapper.class), Avros.records(Person.class)))
+        .materialize());
+
+    // We just need to make sure this doesn't crash
+    assertEquals(4, pairList.size());
+    // Finish the pipeline like the run* helpers do.
+    pipeline.done();
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/MaterializeToMapIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/MaterializeToMapIT.java b/crunch-core/src/it/java/org/apache/crunch/MaterializeToMapIT.java
new file mode 100644
index 0000000..7fef30e
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/MaterializeToMapIT.java
@@ -0,0 +1,101 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static junit.framework.Assert.assertEquals;
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.crunch.impl.mem.MemPipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.PTypeFamily;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.ImmutableList;
+
+/**
+ * Integration tests for {@code materializeToMap()} on tables built with the
+ * in-memory and MapReduce pipelines.
+ */
+public class MaterializeToMapIT {
+
+  /** Expected entries; the pair stored at list index {@code i} has key {@code i}. */
+  static final ImmutableList<Pair<Integer, String>> kvPairs = ImmutableList.of(Pair.of(0, "a"), Pair.of(1, "b"),
+      Pair.of(2, "c"), Pair.of(3, "e"));
+
+  @Rule
+  public TemporaryPath tmpDir = TemporaryPaths.create();
+
+  /**
+   * Asserts that {@code m} has as many entries as {@link #kvPairs} and that
+   * each key maps to the value of the pair stored at that index.
+   *
+   * @param m the materialized map to check
+   */
+  public void assertMatches(Map<Integer, String> m) {
+    // Without the size check an empty or partial map would pass vacuously.
+    assertEquals(kvPairs.size(), m.size());
+    for (Map.Entry<Integer, String> entry : m.entrySet()) {
+      assertEquals(kvPairs.get(entry.getKey()).second(), entry.getValue());
+    }
+  }
+
+  @Test
+  public void testMemMaterializeToMap() {
+    assertMatches(MemPipeline.tableOf(kvPairs).materializeToMap());
+  }
+
+  /**
+   * Maps the strings "a", "b", "c" and "e" to keys 0 through 3 and any other
+   * input to key -1, pairing the key with the input itself.
+   */
+  private static class Set1Mapper extends MapFn<String, Pair<Integer, String>> {
+    @Override
+    public Pair<Integer, String> map(String input) {
+      int k = -1;
+      if (input.equals("a")) {
+        k = 0;
+      } else if (input.equals("b")) {
+        k = 1;
+      } else if (input.equals("c")) {
+        k = 2;
+      } else if (input.equals("e")) {
+        k = 3;
+      }
+      return Pair.of(k, input);
+    }
+  }
+
+  @Test
+  public void testMRMaterializeToMap() throws IOException {
+    Pipeline p = new MRPipeline(MaterializeToMapIT.class, tmpDir.getDefaultConfiguration());
+    String inputFile = tmpDir.copyResourceFileName("set1.txt");
+    PCollection<String> c = p.readTextFile(inputFile);
+    PTypeFamily tf = c.getTypeFamily();
+    PTable<Integer, String> t = c.parallelDo(new Set1Mapper(), tf.tableOf(tf.ints(), tf.strings()));
+    Map<Integer, String> m = t.materializeToMap();
+    assertMatches(m);
+    // Finish the pipeline once the materialized map has been verified.
+    p.done();
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/MultipleOutputIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/MultipleOutputIT.java b/crunch-core/src/it/java/org/apache/crunch/MultipleOutputIT.java
new file mode 100644
index 0000000..1a85b6a
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/MultipleOutputIT.java
@@ -0,0 +1,200 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.io.At;
+import org.apache.crunch.test.StringWrapper;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.crunch.types.avro.AvroTypeFamily;
+import org.apache.crunch.types.avro.Avros;
+import org.apache.crunch.types.writable.WritableTypeFamily;
+import org.apache.crunch.types.writable.Writables;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+import com.google.common.io.Files;
+
+/**
+ * Integration tests that write several outputs from one MapReduce pipeline and
+ * check the contents of each output.
+ */
+public class MultipleOutputIT {
+  /** Charset used to read output files back, independent of the platform default. */
+  private static final Charset UTF_8 = Charset.forName("UTF-8");
+
+  @Rule
+  public TemporaryPath tmpDir = TemporaryPaths.create();
+
+  /** Keeps the words whose length is even. */
+  public static PCollection<String> evenCountLetters(PCollection<String> words, PTypeFamily typeFamily) {
+    return words.parallelDo("even", new FilterFn<String>() {
+
+      @Override
+      public boolean accept(String input) {
+        return input.length() % 2 == 0;
+      }
+    }, typeFamily.strings());
+  }
+
+  /** Keeps the words whose length is odd. */
+  public static PCollection<String> oddCountLetters(PCollection<String> words, PTypeFamily typeFamily) {
+    return words.parallelDo("odd", new FilterFn<String>() {
+
+      @Override
+      public boolean accept(String input) {
+        return input.length() % 2 != 0;
+      }
+    }, typeFamily.strings());
+
+  }
+
+  /**
+   * Replaces each non-empty key with its first character, keeping the value;
+   * entries with an empty key are dropped.
+   */
+  public static PTable<String, Long> substr(PTable<String, Long> ptable) {
+    return ptable.parallelDo(new DoFn<Pair<String, Long>, Pair<String, Long>>() {
+      @Override
+      public void process(Pair<String, Long> input, Emitter<Pair<String, Long>> emitter) {
+        if (input.first().length() > 0) {
+          emitter.emit(Pair.of(input.first().substring(0, 1), input.second()));
+        }
+      }
+    }, ptable.getPTableType());
+  }
+
+  @Test
+  public void testWritables() throws IOException {
+    run(new MRPipeline(MultipleOutputIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance());
+  }
+
+  @Test
+  public void testAvro() throws IOException {
+    run(new MRPipeline(MultipleOutputIT.class, tmpDir.getDefaultConfiguration()), AvroTypeFamily.getInstance());
+  }
+
+  @Test
+  public void testParallelDosFused() throws IOException {
+
+    PipelineResult result = run(new MRPipeline(MultipleOutputIT.class, tmpDir.getDefaultConfiguration()),
+        WritableTypeFamily.getInstance());
+
+    // Ensure our multiple outputs were fused into a single job.
+    assertEquals("parallel Dos not fused into a single job", 1, result.getStageResults().size());
+  }
+
+  /**
+   * Splits {@code letters.txt} into even- and odd-length words, writes each
+   * group to its own text output and checks both outputs.
+   *
+   * @return the result of {@code pipeline.done()}
+   */
+  public PipelineResult run(Pipeline pipeline, PTypeFamily typeFamily) throws IOException {
+    String inputPath = tmpDir.copyResourceFileName("letters.txt");
+    String outputPathEven = tmpDir.getFileName("even");
+    String outputPathOdd = tmpDir.getFileName("odd");
+
+    PCollection<String> words = pipeline.read(At.textFile(inputPath, typeFamily.strings()));
+
+    PCollection<String> evenCountWords = evenCountLetters(words, typeFamily);
+    PCollection<String> oddCountWords = oddCountLetters(words, typeFamily);
+    pipeline.writeTextFile(evenCountWords, outputPathEven);
+    pipeline.writeTextFile(oddCountWords, outputPathOdd);
+
+    PipelineResult result = pipeline.done();
+
+    checkFileContents(outputPathEven, Arrays.asList("bb"));
+    checkFileContents(outputPathOdd, Arrays.asList("a"));
+
+    return result;
+  }
+
+  /**
+   * Mutates the state of an input and then emits the mutated object.
+   */
+  static class AppendFn extends DoFn<StringWrapper, StringWrapper> {
+
+    private final String value;
+
+    public AppendFn(String value) {
+      this.value = value;
+    }
+
+    @Override
+    public void process(StringWrapper input, Emitter<StringWrapper> emitter) {
+      input.setValue(input.getValue() + value);
+      emitter.emit(input);
+    }
+
+  }
+
+  /**
+   * Fusing multiple pipelines has a risk of running into object reuse bugs.
+   * This test verifies that mutating the state of an object that is passed
+   * through multiple streams of a pipeline doesn't allow one stream to affect
+   * another.
+   */
+  @Test
+  public void testFusedMappersObjectReuseBug() throws IOException {
+    Pipeline pipeline = new MRPipeline(MultipleOutputIT.class, tmpDir.getDefaultConfiguration());
+    PCollection<StringWrapper> stringWrappers = pipeline.readTextFile(tmpDir.copyResourceFileName("set2.txt"))
+        .parallelDo(new StringWrapper.StringToStringWrapperMapFn(), Avros.reflects(StringWrapper.class));
+
+    PCollection<String> stringsA = stringWrappers.parallelDo(new AppendFn("A"), stringWrappers.getPType())
+        .parallelDo(new StringWrapper.StringWrapperToStringMapFn(), Writables.strings());
+    PCollection<String> stringsB = stringWrappers.parallelDo(new AppendFn("B"), stringWrappers.getPType())
+        .parallelDo(new StringWrapper.StringWrapperToStringMapFn(), Writables.strings());
+
+    String outputA = tmpDir.getFileName("stringsA");
+    String outputB = tmpDir.getFileName("stringsB");
+
+    pipeline.writeTextFile(stringsA, outputA);
+    pipeline.writeTextFile(stringsB, outputB);
+    PipelineResult pipelineResult = pipeline.done();
+
+    // Make sure fusing did actually occur
+    assertEquals(1, pipelineResult.getStageResults().size());
+
+    checkFileContents(outputA, Lists.newArrayList("cA", "dA", "aA"));
+    checkFileContents(outputB, Lists.newArrayList("cB", "dB", "aB"));
+
+  }
+
+  /**
+   * Asserts that the {@code part-m-00000} file under {@code filePath} contains
+   * exactly the {@code expected} lines, in order.
+   */
+  private void checkFileContents(String filePath, List<String> expected) throws IOException {
+    File outputFile = new File(filePath, "part-m-00000");
+    // Decode with a fixed charset so the check does not depend on the JVM default.
+    List<String> lines = Files.readLines(outputFile, UTF_8);
+    assertEquals(expected, lines);
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/PCollectionGetSizeIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/PCollectionGetSizeIT.java b/crunch-core/src/it/java/org/apache/crunch/PCollectionGetSizeIT.java
new file mode 100644
index 0000000..44eb897
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/PCollectionGetSizeIT.java
@@ -0,0 +1,156 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static com.google.common.collect.Lists.newArrayList;
+import static org.apache.crunch.io.At.sequenceFile;
+import static org.apache.crunch.io.At.textFile;
+import static org.apache.crunch.types.writable.Writables.strings;
+import static org.hamcrest.Matchers.is;
+import static org.junit.Assert.assertThat;
+
+import java.io.IOException;
+
+import org.apache.crunch.fn.FilterFns;
+import org.apache.crunch.impl.mem.MemPipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.junit.Before;
+import org.junit.Ignore;
+import org.junit.Rule;
+import org.junit.Test;
+
+/**
+ * Integration tests for {@code getSize()} and {@code materialize()} on empty
+ * inputs and on empty intermediate collections.
+ */
+public class PCollectionGetSizeIT {
+  @Rule
+  public TemporaryPath tmpDir = TemporaryPaths.create();
+
+  private String emptyInputPath;
+  private String nonEmptyInputPath;
+  private String outputPath;
+
+  @Before
+  public void setUp() throws IOException {
+    emptyInputPath = tmpDir.copyResourceFileName("emptyTextFile.txt");
+    nonEmptyInputPath = tmpDir.copyResourceFileName("set1.txt");
+    outputPath = tmpDir.getFileName("output");
+  }
+
+  /** Creates an MR pipeline for this test class from tmpDir's configuration. */
+  private Pipeline newMRPipeline() throws IOException {
+    return new MRPipeline(this.getClass(), tmpDir.getDefaultConfiguration());
+  }
+
+  @Test
+  public void testGetSizeOfEmptyInput_MRPipeline() throws IOException {
+    assertEmptyInputHasZeroSize(newMRPipeline());
+  }
+
+  @Test
+  public void testGetSizeOfEmptyInput_MemPipeline() throws IOException {
+    assertEmptyInputHasZeroSize(MemPipeline.getInstance());
+  }
+
+  /** Asserts that the empty text file, read through {@code pipeline}, has size 0. */
+  private void assertEmptyInputHasZeroSize(Pipeline pipeline) throws IOException {
+    assertThat(pipeline.read(textFile(emptyInputPath)).getSize(), is(0L));
+  }
+
+  @Test
+  public void testMaterializeEmptyInput_MRPipeline() throws IOException {
+    assertEmptyInputMaterializesEmpty(newMRPipeline());
+  }
+
+  @Test
+  public void testMaterializeEmptyImput_MemPipeline() throws IOException {
+    assertEmptyInputMaterializesEmpty(MemPipeline.getInstance());
+  }
+
+  /** Asserts that materializing the empty text file yields no elements. */
+  private void assertEmptyInputMaterializesEmpty(Pipeline pipeline) throws IOException {
+    Iterable<String> materialized = pipeline.readTextFile(emptyInputPath).materialize();
+    assertThat(newArrayList(materialized.iterator()).size(), is(0));
+  }
+
+  @Test
+  public void testGetSizeOfEmptyIntermediatePCollection_MRPipeline() throws IOException {
+    PCollection<String> emptyIntermediate = createPersistentEmptyIntermediate(newMRPipeline());
+
+    assertThat(emptyIntermediate.getSize(), is(0L));
+  }
+
+  @Test
+  @Ignore("GetSize of a DoCollection is only an estimate based on scale factor, so we can't count on it being reported as 0")
+  public void testGetSizeOfEmptyIntermediatePCollection_NoSave_MRPipeline() throws IOException {
+    PCollection<String> emptyPCollection = newMRPipeline()
+        .readTextFile(nonEmptyInputPath)
+        .filter(FilterFns.<String>REJECT_ALL());
+
+    assertThat(emptyPCollection.getSize(), is(0L));
+  }
+
+  @Test
+  public void testGetSizeOfEmptyIntermediatePCollection_MemPipeline() {
+    PCollection<String> emptyIntermediate = createPersistentEmptyIntermediate(MemPipeline.getInstance());
+
+    assertThat(emptyIntermediate.getSize(), is(0L));
+  }
+
+  @Test
+  public void testMaterializeOfEmptyIntermediatePCollection_MRPipeline() throws IOException {
+    PCollection<String> emptyIntermediate = createPersistentEmptyIntermediate(newMRPipeline());
+
+    assertThat(newArrayList(emptyIntermediate.materialize()).size(), is(0));
+  }
+
+  @Test
+  public void testMaterializeOfEmptyIntermediatePCollection_MemPipeline() {
+    PCollection<String> emptyIntermediate = createPersistentEmptyIntermediate(MemPipeline.getInstance());
+
+    assertThat(newArrayList(emptyIntermediate.materialize()).size(), is(0));
+  }
+
+  /**
+   * Filters every line out of the non-empty input, writes the result to
+   * {@code outputPath} as a sequence file, runs the pipeline and returns a
+   * collection that reads that sequence file back.
+   */
+  private PCollection<String> createPersistentEmptyIntermediate(Pipeline pipeline) {
+    PCollection<String> rejected = pipeline.readTextFile(nonEmptyInputPath)
+        .filter(FilterFns.<String>REJECT_ALL());
+    rejected.write(sequenceFile(outputPath, strings()));
+
+    pipeline.run();
+
+    return pipeline.read(sequenceFile(outputPath, strings()));
+  }
+
+  @Test(expected = IllegalStateException.class)
+  public void testExpectExceptionForGettingSizeOfNonExistingFile_MRPipeline() throws IOException {
+    newMRPipeline().readTextFile("non_existing.file").getSize();
+  }
+
+  @Test(expected = IllegalStateException.class)
+  public void testExpectExceptionForGettingSizeOfNonExistingFile_MemPipeline() {
+    MemPipeline.getInstance().readTextFile("non_existing.file").getSize();
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/PObjectsIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/PObjectsIT.java b/crunch-core/src/it/java/org/apache/crunch/PObjectsIT.java
new file mode 100644
index 0000000..6ee849f
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/PObjectsIT.java
@@ -0,0 +1,99 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.lang.Integer;
+import java.lang.Iterable;
+import java.lang.String;
+import java.util.Iterator;
+
+import org.apache.crunch.PCollection;
+import org.apache.crunch.impl.mem.MemPipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.materialize.pobject.PObjectImpl;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.junit.Rule;
+import org.junit.Test;
+
+@SuppressWarnings("serial")
+public class PObjectsIT {
+
+  private static final Integer LINES_IN_SHAKES = 3667;
+
+  @Rule
+  public TemporaryPath tmpDir = TemporaryPaths.create();
+
+  /**
+   * A mock PObject whose value is the number of strings in the underlying
+   * PCollection; it also records how often {@code process} has been invoked.
+   */
+  public static class MockPObjectImpl extends PObjectImpl<String, Integer> {
+    private int numProcessCalls = 0;
+
+    public MockPObjectImpl(PCollection<String> collect) {
+      super(collect);
+    }
+
+    /** Counts the elements of {@code input} by walking its iterator to the end. */
+    @Override
+    public Integer process(Iterable<String> input) {
+      numProcessCalls++;
+      int count = 0;
+      for (Iterator<String> it = input.iterator(); it.hasNext(); it.next()) {
+        count++;
+      }
+      return count;
+    }
+
+    public int getNumProcessCalls() {
+      return numProcessCalls;
+    }
+  }
+
+  @Test
+  public void testMRPipeline() throws IOException {
+    run(new MRPipeline(PObjectsIT.class, tmpDir.getDefaultConfiguration()));
+  }
+
+  @Test
+  public void testInMemoryPipeline() throws IOException {
+    run(MemPipeline.getInstance());
+  }
+
+  /**
+   * Reads the value of a {@link MockPObjectImpl} over {@code shakes.txt} twice
+   * and checks both the reported line count and the number of process calls.
+   */
+  public void run(Pipeline pipeline) throws IOException {
+    String wrongCountMessage = "Incorrect number of lines counted from PCollection.";
+    PCollection<String> shakespeare = pipeline.readTextFile(tmpDir.copyResourceFileName("shakes.txt"));
+    MockPObjectImpl lineCount = new MockPObjectImpl(shakespeare);
+
+    // Get the line count once and verify its correctness.
+    assertEquals(wrongCountMessage, LINES_IN_SHAKES, lineCount.getValue());
+    // And do it again.
+    assertEquals(wrongCountMessage, LINES_IN_SHAKES, lineCount.getValue());
+    // Make sure process was called only once because the PObject's value was cached after the
+    // first call.
+    assertEquals("Process on PObject not called exactly 1 times.", 1, lineCount.getNumProcessCalls());
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/PTableKeyValueIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/PTableKeyValueIT.java b/crunch-core/src/it/java/org/apache/crunch/PTableKeyValueIT.java
new file mode 100644
index 0000000..d56e122
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/PTableKeyValueIT.java
@@ -0,0 +1,112 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+
+import junit.framework.Assert;
+
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.io.At;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.crunch.types.avro.AvroTypeFamily;
+import org.apache.crunch.types.writable.WritableTypeFamily;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Checks, for each parameterized type family, that {@code keys()} and
+ * {@code values()} of a table materialize in matching order.
+ */
+@RunWith(value = Parameterized.class)
+public class PTableKeyValueIT implements Serializable {
+
+  private static final long serialVersionUID = 4374227704751746689L;
+
+  @Rule
+  public transient TemporaryPath tmpDir = TemporaryPaths.create();
+
+  private transient PTypeFamily typeFamily;
+  private transient MRPipeline pipeline;
+  private transient String inputFile;
+
+  public PTableKeyValueIT(PTypeFamily typeFamily) {
+    this.typeFamily = typeFamily;
+  }
+
+  /** Supplies one parameter set per type family: Writable, then Avro. */
+  @Parameters
+  public static Collection<Object[]> data() {
+    return Arrays.asList(new Object[][] {
+        { WritableTypeFamily.getInstance() },
+        { AvroTypeFamily.getInstance() } });
+  }
+
+  @Before
+  public void setUp() throws IOException {
+    pipeline = new MRPipeline(PTableKeyValueIT.class, tmpDir.getDefaultConfiguration());
+    inputFile = tmpDir.copyResourceFileName("set1.txt");
+  }
+
+  @After
+  public void tearDown() {
+    pipeline.done();
+  }
+
+  /**
+   * Keys each input line by its upper-cased form, then verifies that the i-th
+   * materialized key equals the upper-cased i-th materialized value.
+   */
+  @Test
+  public void testKeysAndValues() throws Exception {
+    PCollection<String> lines = pipeline.read(At.textFile(inputFile, typeFamily.strings()));
+
+    PTable<String, String> upperToLine = lines.parallelDo(new DoFn<String, Pair<String, String>>() {
+      @Override
+      public void process(String input, Emitter<Pair<String, String>> emitter) {
+        emitter.emit(Pair.of(input.toUpperCase(), input));
+      }
+    }, typeFamily.tableOf(typeFamily.strings(), typeFamily.strings()));
+
+    PCollection<String> keys = upperToLine.keys();
+    PCollection<String> values = upperToLine.values();
+
+    ArrayList<String> keyList = Lists.newArrayList(keys.materialize().iterator());
+    ArrayList<String> valueList = Lists.newArrayList(values.materialize().iterator());
+
+    int pairCount = keyList.size();
+    Assert.assertEquals(pairCount, valueList.size());
+    for (int i = 0; i < pairCount; i++) {
+      Assert.assertEquals(keyList.get(i), valueList.get(i).toUpperCase());
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/PageRankIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/PageRankIT.java b/crunch-core/src/it/java/org/apache/crunch/PageRankIT.java
new file mode 100644
index 0000000..6291ef8
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/PageRankIT.java
@@ -0,0 +1,168 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.crunch.impl.mem.MemPipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.lib.Aggregate;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.crunch.types.PTypes;
+import org.apache.crunch.types.avro.AvroTypeFamily;
+import org.apache.crunch.types.avro.Avros;
+import org.apache.crunch.types.writable.WritableTypeFamily;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+
+public class PageRankIT {
+
+ public static class PageRankData {
+ public float score;
+ public float lastScore;
+ public List<String> urls;
+
+ public PageRankData() {
+ }
+
+ public PageRankData(float score, float lastScore, Iterable<String> urls) {
+ this.score = score;
+ this.lastScore = lastScore;
+ this.urls = Lists.newArrayList(urls);
+ }
+
+ public PageRankData next(float newScore) {
+ return new PageRankData(newScore, score, urls);
+ }
+
+ public float propagatedScore() {
+ return score / urls.size();
+ }
+
+ @Override
+ public String toString() {
+ return score + " " + lastScore + " " + urls;
+ }
+ }
+
+ @Rule
+ public TemporaryPath tmpDir = TemporaryPaths.create();
+
+ @Test
+ public void testAvroReflect() throws Exception {
+ PTypeFamily tf = AvroTypeFamily.getInstance();
+ PType<PageRankData> prType = Avros.reflects(PageRankData.class);
+ String urlInput = tmpDir.copyResourceFileName("urls.txt");
+ run(new MRPipeline(PageRankIT.class, tmpDir.getDefaultConfiguration()),
+ urlInput, prType, tf);
+ }
+
+ @Test
+ public void testAvroMReflectInMemory() throws Exception {
+ PTypeFamily tf = AvroTypeFamily.getInstance();
+ PType<PageRankData> prType = Avros.reflects(PageRankData.class);
+ String urlInput = tmpDir.copyResourceFileName("urls.txt");
+ run(MemPipeline.getInstance(), urlInput, prType, tf);
+ }
+
+ @Test
+ public void testAvroJSON() throws Exception {
+ PTypeFamily tf = AvroTypeFamily.getInstance();
+ PType<PageRankData> prType = PTypes.jsonString(PageRankData.class, tf);
+ String urlInput = tmpDir.copyResourceFileName("urls.txt");
+ run(new MRPipeline(PageRankIT.class, tmpDir.getDefaultConfiguration()),
+ urlInput, prType, tf);
+ }
+
+ @Test
+ public void testWritablesJSON() throws Exception {
+ PTypeFamily tf = WritableTypeFamily.getInstance();
+ PType<PageRankData> prType = PTypes.jsonString(PageRankData.class, tf);
+ String urlInput = tmpDir.copyResourceFileName("urls.txt");
+ run(new MRPipeline(PageRankIT.class, tmpDir.getDefaultConfiguration()),
+ urlInput, prType, tf);
+ }
+
+ public static PTable<String, PageRankData> pageRank(PTable<String, PageRankData> input, final float d) {
+ PTypeFamily ptf = input.getTypeFamily();
+ PTable<String, Float> outbound = input.parallelDo(new DoFn<Pair<String, PageRankData>, Pair<String, Float>>() {
+ @Override
+ public void process(Pair<String, PageRankData> input, Emitter<Pair<String, Float>> emitter) {
+ PageRankData prd = input.second();
+ for (String link : prd.urls) {
+ emitter.emit(Pair.of(link, prd.propagatedScore()));
+ }
+ }
+ }, ptf.tableOf(ptf.strings(), ptf.floats()));
+
+ return input.cogroup(outbound).parallelDo(
+ new MapFn<Pair<String, Pair<Collection<PageRankData>, Collection<Float>>>, Pair<String, PageRankData>>() {
+ @Override
+ public Pair<String, PageRankData> map(Pair<String, Pair<Collection<PageRankData>, Collection<Float>>> input) {
+ PageRankData prd = Iterables.getOnlyElement(input.second().first());
+ Collection<Float> propagatedScores = input.second().second();
+ float sum = 0.0f;
+ for (Float s : propagatedScores) {
+ sum += s;
+ }
+ return Pair.of(input.first(), prd.next(d + (1.0f - d) * sum));
+ }
+ }, input.getPTableType());
+ }
+
+ public static void run(Pipeline pipeline, String urlInput,
+ PType<PageRankData> prType, PTypeFamily ptf) throws Exception {
+ PTable<String, PageRankData> scores = pipeline.readTextFile(urlInput)
+ .parallelDo(new MapFn<String, Pair<String, String>>() {
+ @Override
+ public Pair<String, String> map(String input) {
+ String[] urls = input.split("\\t");
+ return Pair.of(urls[0], urls[1]);
+ }
+ }, ptf.tableOf(ptf.strings(), ptf.strings())).groupByKey()
+ .parallelDo(new MapFn<Pair<String, Iterable<String>>, Pair<String, PageRankData>>() {
+ @Override
+ public Pair<String, PageRankData> map(Pair<String, Iterable<String>> input) {
+ return Pair.of(input.first(), new PageRankData(1.0f, 0.0f, input.second()));
+ }
+ }, ptf.tableOf(ptf.strings(), prType));
+
+ Float delta = 1.0f;
+ while (delta > 0.01) {
+ scores = pageRank(scores, 0.5f);
+ scores.materialize().iterator(); // force the write
+ delta = Aggregate.max(scores.parallelDo(new MapFn<Pair<String, PageRankData>, Float>() {
+ @Override
+ public Float map(Pair<String, PageRankData> input) {
+ PageRankData prd = input.second();
+ return Math.abs(prd.score - prd.lastScore);
+ }
+ }, ptf.floats())).getValue();
+ }
+ assertEquals(0.0048, delta, 0.001);
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/StageResultsCountersIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/StageResultsCountersIT.java b/crunch-core/src/it/java/org/apache/crunch/StageResultsCountersIT.java
new file mode 100644
index 0000000..19fc302
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/StageResultsCountersIT.java
@@ -0,0 +1,135 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static junit.framework.Assert.assertEquals;
+import static junit.framework.Assert.assertTrue;
+
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.crunch.PipelineResult.StageResult;
+import org.apache.crunch.impl.mem.MemPipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.io.From;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.crunch.types.avro.AvroTypeFamily;
+import org.apache.crunch.types.writable.WritableTypeFamily;
+import org.apache.hadoop.mapreduce.Counter;
+import org.junit.After;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+
+ /**
+  * Integration test checking that counters incremented inside a DoFn are
+  * available through the stage results of MapReduce and in-memory pipelines,
+  * for both the Writable and Avro type families.
+  */
+ public class StageResultsCountersIT {
+
+   @Rule
+   public TemporaryPath tmpDir = TemporaryPaths.create();
+
+   /** Upper-case tokens that are counted; each one gets its own counter. */
+   public static HashSet<String> SPECIAL_KEYWORDS = Sets.newHashSet("AND", "OR", "NOT");
+
+   /** Counter group under which the keyword counters are registered. */
+   public static String KEYWORDS_COUNTER_GROUP = "KEYWORDS_COUNTER_GROUP";
+
+   @After
+   public void after() {
+     // Reset the in-memory counters so one test does not see another's counts.
+     MemPipeline.clearCounters();
+   }
+
+   @Test
+   public void testStageResultsCountersMRWritables() throws Exception {
+     testSpecialKeywordCount(new MRPipeline(StageResultsCountersIT.class, tmpDir.getDefaultConfiguration()),
+         WritableTypeFamily.getInstance());
+   }
+
+   @Test
+   public void testStageResultsCountersMRAvro() throws Exception {
+     testSpecialKeywordCount(new MRPipeline(StageResultsCountersIT.class, tmpDir.getDefaultConfiguration()),
+         AvroTypeFamily.getInstance());
+   }
+
+   @Test
+   public void testStageResultsCountersMemWritables() throws Exception {
+     testSpecialKeywordCount(MemPipeline.getInstance(), WritableTypeFamily.getInstance());
+   }
+
+   @Test
+   public void testStageResultsCountersMemAvro() throws Exception {
+     testSpecialKeywordCount(MemPipeline.getInstance(), AvroTypeFamily.getInstance());
+   }
+
+   /**
+    * Counts the special keywords in {@code shakes.txt} on the given pipeline and
+    * verifies the three counter values. Each keyword is asserted individually
+    * so the check does not depend on the iteration order of the result map.
+    *
+    * @param pipeline pipeline to run the counting job on
+    * @param tf type family supplying the output PType of the counting DoFn
+    */
+   public void testSpecialKeywordCount(Pipeline pipeline, PTypeFamily tf) throws Exception {
+
+     String rowsInputPath = tmpDir.copyResourceFileName("shakes.txt");
+
+     PipelineResult result = countSpecialKeywords(pipeline, rowsInputPath, tf);
+
+     assertTrue(result.succeeded());
+
+     Map<String, Long> keywordsMap = countersToMap(result.getStageResults(), KEYWORDS_COUNTER_GROUP);
+
+     assertEquals(3, keywordsMap.size());
+
+     assertEquals(Long.valueOf(157L), keywordsMap.get("NOT"));
+     assertEquals(Long.valueOf(596L), keywordsMap.get("AND"));
+     assertEquals(Long.valueOf(81L), keywordsMap.get("OR"));
+   }
+
+   /**
+    * Reads the input as text, increments one counter per special keyword found
+    * on each non-blank line and finishes the pipeline.
+    *
+    * @return the result of {@code pipeline.done()}
+    */
+   private static PipelineResult countSpecialKeywords(Pipeline pipeline, String inputFileName, PTypeFamily tf) {
+
+     pipeline.read(From.textFile(inputFileName)).parallelDo(new DoFn<String, Void>() {
+
+       @Override
+       public void process(String text, Emitter<Void> emitter) {
+
+         if (!StringUtils.isBlank(text)) {
+
+           String[] tokens = text.toUpperCase().split("\\s");
+
+           for (String token : tokens) {
+             if (SPECIAL_KEYWORDS.contains(token)) {
+               getCounter(KEYWORDS_COUNTER_GROUP, token).increment(1);
+             }
+           }
+         }
+       }
+     }, tf.nulls()).materialize(); // TODO can we avoid the materialize ?
+
+     return pipeline.done();
+   }
+
+   /**
+    * Collects the counters of the named group from every stage into a map of
+    * display name to value. If the same display name occurs in more than one
+    * stage, the value seen last replaces the earlier one.
+    */
+   private static Map<String, Long> countersToMap(List<StageResult> stages, String counterGroupName) {
+
+     Map<String, Long> countersMap = Maps.newHashMap();
+
+     for (StageResult sr : stages) {
+       Iterator<Counter> iterator = sr.getCounters().getGroup(counterGroupName).iterator();
+       while (iterator.hasNext()) {
+         Counter counter = iterator.next();
+         countersMap.put(counter.getDisplayName(), counter.getValue());
+       }
+     }
+
+     return countersMap;
+   }
+
+ }
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/TermFrequencyIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/TermFrequencyIT.java b/crunch-core/src/it/java/org/apache/crunch/TermFrequencyIT.java
new file mode 100644
index 0000000..ca66aa8
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/TermFrequencyIT.java
@@ -0,0 +1,135 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.Serializable;
+
+import org.apache.crunch.impl.mem.MemPipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.io.At;
+import org.apache.crunch.io.ReadableSourceTarget;
+import org.apache.crunch.lib.Aggregate;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.crunch.types.writable.WritableTypeFamily;
+import org.junit.Rule;
+import org.junit.Test;
+
+@SuppressWarnings("serial")
+public class TermFrequencyIT implements Serializable {
+ @Rule
+ public transient TemporaryPath tmpDir = TemporaryPaths.create();
+
+ @Test
+ public void testTermFrequencyWithNoTransform() throws IOException {
+ run(new MRPipeline(TermFrequencyIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance(), false);
+ }
+
+ @Test
+ public void testTermFrequencyWithTransform() throws IOException {
+ run(new MRPipeline(TermFrequencyIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance(), true);
+ }
+
+ @Test
+ public void testTermFrequencyNoTransformInMemory() throws IOException {
+ run(MemPipeline.getInstance(), WritableTypeFamily.getInstance(), false);
+ }
+
+ @Test
+ public void testTermFrequencyWithTransformInMemory() throws IOException {
+ run(MemPipeline.getInstance(), WritableTypeFamily.getInstance(), true);
+ }
+
+ public void run(Pipeline pipeline, PTypeFamily typeFamily, boolean transformTF) throws IOException {
+ String input = tmpDir.copyResourceFileName("docs.txt");
+
+ File transformedOutput = tmpDir.getFile("transformed-output");
+ File tfOutput = tmpDir.getFile("tf-output");
+
+ PCollection<String> docs = pipeline.readTextFile(input);
+
+ PTypeFamily ptf = docs.getTypeFamily();
+
+ /*
+ * Input: String Input title text
+ *
+ * Output: PTable<Pair<String, String>, Long> Pair<Pair<word, title>, count
+ * in title>
+ */
+ PTable<Pair<String, String>, Long> tf = Aggregate.count(docs.parallelDo("term document frequency",
+ new DoFn<String, Pair<String, String>>() {
+ @Override
+ public void process(String doc, Emitter<Pair<String, String>> emitter) {
+ String[] kv = doc.split("\t");
+ String title = kv[0];
+ String text = kv[1];
+ for (String word : text.split("\\W+")) {
+ if (word.length() > 0) {
+ Pair<String, String> pair = Pair.of(word.toLowerCase(), title);
+ emitter.emit(pair);
+ }
+ }
+ }
+ }, ptf.pairs(ptf.strings(), ptf.strings())));
+
+ if (transformTF) {
+ /*
+ * Input: Pair<Pair<String, String>, Long> Pair<Pair<word, title>, count
+ * in title>
+ *
+ * Output: PTable<String, Pair<String, Long>> PTable<word, Pair<title,
+ * count in title>>
+ */
+ PTable<String, Pair<String, Long>> wordDocumentCountPair = tf.parallelDo("transform wordDocumentPairCount",
+ new MapFn<Pair<Pair<String, String>, Long>, Pair<String, Pair<String, Long>>>() {
+ @Override
+ public Pair<String, Pair<String, Long>> map(Pair<Pair<String, String>, Long> input) {
+ Pair<String, String> wordDocumentPair = input.first();
+ return Pair.of(wordDocumentPair.first(), Pair.of(wordDocumentPair.second(), input.second()));
+ }
+ }, ptf.tableOf(ptf.strings(), ptf.pairs(ptf.strings(), ptf.longs())));
+
+ pipeline.writeTextFile(wordDocumentCountPair, transformedOutput.getAbsolutePath());
+ }
+
+ SourceTarget<String> st = At.textFile(tfOutput.getAbsolutePath());
+ pipeline.write(tf, st);
+
+ pipeline.run();
+
+ // test the case we should see
+ Iterable<String> lines = ((ReadableSourceTarget<String>) st).read(pipeline.getConfiguration());
+ boolean passed = false;
+ for (String line : lines) {
+ if ("[well,A]\t0".equals(line)) {
+ fail("Found " + line + " but well is in Document A 1 time");
+ }
+ if ("[well,A]\t1".equals(line)) {
+ passed = true;
+ }
+ }
+ assertTrue(passed);
+ pipeline.done();
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/TextPairIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/TextPairIT.java b/crunch-core/src/it/java/org/apache/crunch/TextPairIT.java
new file mode 100644
index 0000000..55d9af9
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/TextPairIT.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.io.From;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.writable.Writables;
+import org.junit.Rule;
+import org.junit.Test;
+
+ /**
+  * Integration test that pairs every word of the input with a fixed marker
+  * string using Writable string pairs, and checks that the marker is seen
+  * again in the materialized result.
+  */
+ public class TextPairIT {
+   /** Marker used as the first element of every emitted pair. */
+   private static final String CANARY = "Writables.STRING_TO_TEXT";
+
+   @Rule
+   public TemporaryPath tmpDir = TemporaryPaths.create();
+
+   @Test
+   public void testWritables() throws IOException {
+     run(new MRPipeline(TextPairIT.class, tmpDir.getDefaultConfiguration()));
+   }
+
+   /**
+    * Splits each line on non-word characters and emits a (marker, word) pair
+    * for every non-empty word.
+    */
+   public static PCollection<Pair<String, String>> wordDuplicate(PCollection<String> words) {
+     return words.parallelDo("my word duplicator", new DoFn<String, Pair<String, String>>() {
+       @Override
+       public void process(String line, Emitter<Pair<String, String>> emitter) {
+         for (String word : line.split("\\W+")) {
+           if (word.isEmpty()) {
+             continue;
+           }
+           emitter.emit(Pair.of(CANARY, word));
+         }
+       }
+     }, Writables.pairs(Writables.strings(), Writables.strings()));
+   }
+
+   /**
+    * Materializes the marker/word pairs of {@code shakes.txt} and passes as
+    * soon as one pair whose first element contains the marker is found.
+    */
+   public void run(Pipeline pipeline) throws IOException {
+     String input = tmpDir.copyResourceFileName("shakes.txt");
+
+     PCollection<String> shakespeare = pipeline.read(From.textFile(input));
+     Iterable<Pair<String, String>> pairs = pipeline.materialize(wordDuplicate(shakespeare));
+
+     boolean markerSeen = false;
+     for (Pair<String, String> pair : pairs) {
+       markerSeen = pair.first().contains(CANARY);
+       if (markerSeen) {
+         break;
+       }
+     }
+
+     pipeline.done();
+     assertTrue(markerSeen);
+   }
+ }
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/TfIdfIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/TfIdfIT.java b/crunch-core/src/it/java/org/apache/crunch/TfIdfIT.java
new file mode 100644
index 0000000..218f538
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/TfIdfIT.java
@@ -0,0 +1,224 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.Serializable;
+import java.nio.charset.Charset;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.crunch.fn.MapKeysFn;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.io.seq.SeqFileSourceTarget;
+import org.apache.crunch.lib.Aggregate;
+import org.apache.crunch.lib.Join;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.crunch.types.writable.WritableTypeFamily;
+import org.apache.hadoop.fs.Path;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+import com.google.common.io.Files;
+
+ /**
+  * Integration test that computes TF-IDF scores for the words of a small
+  * document set and checks the text output, both for the plain result and for
+  * a copy with upper-cased keys, in a single pipeline run and in two runs.
+  */
+ @SuppressWarnings("serial")
+ public class TfIdfIT implements Serializable {
+   @Rule
+   public transient TemporaryPath tmpDir = TemporaryPaths.create();
+
+   // Total number of documents; hard-coded here, ideally it would be computed
+   // from the input.
+   protected static final double N = 2;
+
+   @Test
+   public void testWritablesSingleRun() throws IOException {
+     run(new MRPipeline(TfIdfIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance(), true);
+   }
+
+   @Test
+   public void testWritablesMultiRun() throws IOException {
+     run(new MRPipeline(TfIdfIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance(), false);
+   }
+
+   /**
+    * Builds the TF-IDF table for the given documents.
+    *
+    * @param docs input lines of the form "title<TAB>text"
+    * @param termFreqPath path the intermediate term-frequency table is written to
+    * @param ptf type family used for all PTypes
+    * @return table of word to the (title, tf-idf) pairs of that word
+    */
+   public PTable<String, Collection<Pair<String, Double>>> generateTFIDF(PCollection<String> docs, Path termFreqPath,
+       PTypeFamily ptf) throws IOException {
+
+     /*
+      * Input: String Input title text
+      *
+      * Output: PTable<Pair<String, String>, Long> Pair<Pair<word, title>, count
+      * in title>
+      */
+     PTable<Pair<String, String>, Long> tf = Aggregate.count(docs.parallelDo("term document frequency",
+         new DoFn<String, Pair<String, String>>() {
+           @Override
+           public void process(String doc, Emitter<Pair<String, String>> emitter) {
+             String[] kv = doc.split("\t");
+             String title = kv[0];
+             String text = kv[1];
+             for (String word : text.split("\\W+")) {
+               if (word.length() > 0) {
+                 Pair<String, String> pair = Pair.of(word.toLowerCase(), title);
+                 emitter.emit(pair);
+               }
+             }
+           }
+         }, ptf.pairs(ptf.strings(), ptf.strings())));
+
+     tf.write(new SeqFileSourceTarget<Pair<Pair<String, String>, Long>>(termFreqPath, tf.getPType()));
+
+     /*
+      * Input: Pair<Pair<String, String>, Long> Pair<Pair<word, title>, count in
+      * title>
+      *
+      * Output: PTable<String, Long> PTable<word, # of docs containing word>
+      */
+     PTable<String, Long> n = Aggregate.count(tf.parallelDo("little n (# of docs contain word)",
+         new DoFn<Pair<Pair<String, String>, Long>, String>() {
+           @Override
+           public void process(Pair<Pair<String, String>, Long> input, Emitter<String> emitter) {
+             emitter.emit(input.first().first());
+           }
+         }, ptf.strings()));
+
+     /*
+      * Input: Pair<Pair<String, String>, Long> Pair<Pair<word, title>, count in
+      * title>
+      *
+      * Output: PTable<String, Pair<String, Long>> PTable<word, Pair<title, count
+      * in title>>
+      */
+     PTable<String, Collection<Pair<String, Long>>> wordDocumentCountPair = tf.parallelDo(
+         "transform wordDocumentPairCount",
+         new DoFn<Pair<Pair<String, String>, Long>, Pair<String, Collection<Pair<String, Long>>>>() {
+           // Entries collected for the word currently being processed.
+           Collection<Pair<String, Long>> buffer;
+           String key;
+
+           @Override
+           public void process(Pair<Pair<String, String>, Long> input,
+               Emitter<Pair<String, Collection<Pair<String, Long>>>> emitter) {
+             Pair<String, String> wordDocumentPair = input.first();
+             // A change of word closes the current group; entries of one word
+             // are only grouped together when they arrive consecutively.
+             if (!wordDocumentPair.first().equals(key)) {
+               flush(emitter);
+               key = wordDocumentPair.first();
+               buffer = Lists.newArrayList();
+             }
+             buffer.add(Pair.of(wordDocumentPair.second(), input.second()));
+           }
+
+           protected void flush(Emitter<Pair<String, Collection<Pair<String, Long>>>> emitter) {
+             if (buffer != null) {
+               emitter.emit(Pair.of(key, buffer));
+               buffer = null;
+             }
+           }
+
+           @Override
+           public void cleanup(Emitter<Pair<String, Collection<Pair<String, Long>>>> emitter) {
+             // Emit the group that is still buffered at the end.
+             flush(emitter);
+           }
+         }, ptf.tableOf(ptf.strings(), ptf.collections(ptf.pairs(ptf.strings(), ptf.longs()))));
+
+     PTable<String, Pair<Long, Collection<Pair<String, Long>>>> joinedResults = Join.join(n, wordDocumentCountPair);
+
+     /*
+      * Input: Pair<String, Pair<Long, Collection<Pair<String, Long>>> Pair<word,
+      * Pair<# of docs containing word, Collection<Pair<title, term frequency>>>
+      *
+      * Output: Pair<String, Collection<Pair<String, Double>>> Pair<word,
+      * Collection<Pair<title, tfidf>>>
+      */
+     return joinedResults
+         .parallelDo(
+             "calculate tfidf",
+             new MapFn<Pair<String, Pair<Long, Collection<Pair<String, Long>>>>, Pair<String, Collection<Pair<String, Double>>>>() {
+               @Override
+               public Pair<String, Collection<Pair<String, Double>>> map(
+                   Pair<String, Pair<Long, Collection<Pair<String, Long>>>> input) {
+                 Collection<Pair<String, Double>> tfidfs = Lists.newArrayList();
+                 String word = input.first();
+                 double n = input.second().first();
+                 double idf = Math.log(N / n);
+                 for (Pair<String, Long> tf : input.second().second()) {
+                   double tfidf = tf.second() * idf;
+                   tfidfs.add(Pair.of(tf.first(), tfidf));
+                 }
+                 return Pair.of(word, tfidfs);
+               }
+
+             }, ptf.tableOf(ptf.strings(), ptf.collections(ptf.pairs(ptf.strings(), ptf.doubles()))));
+   }
+
+   /**
+    * Writes the TF-IDF table and a copy with upper-cased keys as text, then
+    * checks both outputs for the expected score of "the" in document B.
+    *
+    * @param pipeline pipeline to execute on
+    * @param typeFamily type family handed to {@link #generateTFIDF}
+    * @param singleRun when false, the pipeline is run once before the
+    *        upper-cased table is added
+    */
+   public void run(Pipeline pipeline, PTypeFamily typeFamily, boolean singleRun) throws IOException {
+     String inputFile = tmpDir.copyResourceFileName("docs.txt");
+     String outputPath1 = tmpDir.getFileName("output1");
+     String outputPath2 = tmpDir.getFileName("output2");
+
+     Path tfPath = tmpDir.getPath("termfreq");
+
+     PCollection<String> docs = pipeline.readTextFile(inputFile);
+
+     PTable<String, Collection<Pair<String, Double>>> results = generateTFIDF(docs, tfPath, typeFamily);
+     pipeline.writeTextFile(results, outputPath1);
+     if (!singleRun) {
+       pipeline.run();
+     }
+
+     PTable<String, Collection<Pair<String, Double>>> uppercased = results.parallelDo(
+         new MapKeysFn<String, String, Collection<Pair<String, Double>>>() {
+           @Override
+           public String map(String k1) {
+             return k1.toUpperCase();
+           }
+         }, results.getPTableType());
+     pipeline.writeTextFile(uppercased, outputPath2);
+     pipeline.done();
+
+     // Check the lowercase version...
+     assertTrue(hasMatchingLine(new File(outputPath1, "part-r-00000"), "[the", "B,0.6931471805599453"));
+
+     // ...and the uppercase version
+     assertTrue(hasMatchingLine(new File(outputPath2, "part-r-00000"), "[THE", "B,0.6931471805599453"));
+   }
+
+   /**
+    * Returns whether the file has a line that starts with {@code prefix} and
+    * contains {@code fragment}. The file is decoded as UTF-8 rather than with
+    * the platform default charset so the result does not vary by machine.
+    */
+   private static boolean hasMatchingLine(File outputFile, String prefix, String fragment) throws IOException {
+     List<String> lines = Files.readLines(outputFile, Charset.forName("UTF-8"));
+     for (String line : lines) {
+       if (line.startsWith(prefix) && line.contains(fragment)) {
+         return true;
+       }
+     }
+     return false;
+   }
+ }
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/TupleNClassCastBugIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/TupleNClassCastBugIT.java b/crunch-core/src/it/java/org/apache/crunch/TupleNClassCastBugIT.java
new file mode 100644
index 0000000..e49f4d5
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/TupleNClassCastBugIT.java
@@ -0,0 +1,95 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.List;
+
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.crunch.types.avro.AvroTypeFamily;
+import org.apache.crunch.types.writable.WritableTypeFamily;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.io.Files;
+
+
+ /**
+  * Regression test for a ClassCastException involving {@link TupleN} values
+  * that pass through a groupByKey, run for the Writable and Avro type families.
+  */
+ public class TupleNClassCastBugIT {
+   @Rule
+   public TemporaryPath tmpDir = TemporaryPaths.create();
+
+   /**
+    * Maps each "docId<TAB>line" input to (docId, TupleN(docId, line)), groups
+    * by docId and emits every tuple of every group unchanged.
+    */
+   public static PCollection<TupleN> mapGroupDo(PCollection<String> lines, PTypeFamily ptf) {
+     PTable<String, TupleN> mapped = lines.parallelDo(new MapFn<String, Pair<String, TupleN>>() {
+
+       @Override
+       public Pair<String, TupleN> map(String line) {
+         String[] columns = line.split("\\t");
+         String docId = columns[0];
+         String docLine = columns[1];
+         return Pair.of(docId, new TupleN(docId, docLine));
+       }
+     }, ptf.tableOf(ptf.strings(), ptf.tuples(ptf.strings(), ptf.strings())));
+     return mapped.groupByKey().parallelDo(new DoFn<Pair<String, Iterable<TupleN>>, TupleN>() {
+       @Override
+       public void process(Pair<String, Iterable<TupleN>> input, Emitter<TupleN> tupleNEmitter) {
+         for (TupleN tuple : input.second()) {
+           tupleNEmitter.emit(tuple);
+         }
+       }
+     }, ptf.tuples(ptf.strings(), ptf.strings()));
+   }
+
+   @Test
+   public void testWritables() throws IOException {
+     run(new MRPipeline(TupleNClassCastBugIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance());
+   }
+
+   @Test
+   public void testAvro() throws IOException {
+     run(new MRPipeline(TupleNClassCastBugIT.class, tmpDir.getDefaultConfiguration()), AvroTypeFamily.getInstance());
+   }
+
+   /**
+    * Runs {@link #mapGroupDo} over {@code docs.txt}, writes the result as text
+    * and checks the number of output lines.
+    */
+   public void run(Pipeline pipeline, PTypeFamily typeFamily) throws IOException {
+     String inputPath = tmpDir.copyResourceFileName("docs.txt");
+     String outputPath = tmpDir.getFileName("output");
+
+     PCollection<String> docLines = pipeline.readTextFile(inputPath);
+     pipeline.writeTextFile(mapGroupDo(docLines, typeFamily), outputPath);
+     pipeline.done();
+
+     // We are not directly testing the output; we are looking for a
+     // ClassCastException which is thrown in a different thread during the
+     // reduce phase. If all is well the file will exist and have six lines.
+     // Otherwise the bug is present.
+     File outputFile = new File(outputPath, "part-r-00000");
+     // Decode as UTF-8 instead of relying on the platform default charset.
+     List<String> lines = Files.readLines(outputFile, Charset.forName("UTF-8"));
+     assertEquals(6, lines.size());
+   }
+ }
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/UnionFromSameSourceIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/UnionFromSameSourceIT.java b/crunch-core/src/it/java/org/apache/crunch/UnionFromSameSourceIT.java
new file mode 100644
index 0000000..501a944
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/UnionFromSameSourceIT.java
@@ -0,0 +1,132 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+
+import org.apache.crunch.fn.IdentityFn;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.writable.Writables;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+
+/**
+ * Integration tests that feed the same source {@link PCollection} into unions more than once.
+ */
+public class UnionFromSameSourceIT {
+
+  /** Number of elements the tests expect a single read of {@code set1.txt} to contribute. */
+  private static final int NUM_ELEMENTS = 4;
+
+  @Rule
+  public transient TemporaryPath tmpDir = TemporaryPaths.create();
+
+  private Pipeline pipeline;
+  private PType<String> elementType = Writables.strings();
+  private PTableType<String, String> tableType = Writables.tableOf(Writables.strings(),
+      Writables.strings());
+
+  /** Creates a new MapReduce pipeline configured from the temporary directory. */
+  @Before
+  public void setUp() {
+    pipeline = new MRPipeline(UnionFromSameSourceIT.class, tmpDir.getDefaultConfiguration());
+  }
+
+  /** Unions a collection with an identity-mapped copy of itself. */
+  @Test
+  public void testUnion_SingleRead() throws IOException {
+    PCollection<String> source = pipeline.readTextFile(tmpDir.copyResourceFileName("set1.txt"));
+    PCollection<String> identityCopy = source.parallelDo(IdentityFn.<String> getInstance(),
+        source.getPType());
+    PCollection<String> union = source.union(identityCopy);
+
+    assertEquals(NUM_ELEMENTS * 2, getCount(union));
+  }
+
+  /** Unions two collections obtained from two separate reads of the {@code set1.txt} resource. */
+  @Test
+  public void testUnion_TwoReads() throws IOException {
+    PCollection<String> firstRead = pipeline.readTextFile(tmpDir.copyResourceFileName("set1.txt"));
+    PCollection<String> secondRead = pipeline.readTextFile(tmpDir.copyResourceFileName("set1.txt"));
+
+    assertEquals(NUM_ELEMENTS * 2, getCount(firstRead.union(secondRead)));
+  }
+
+  /** Runs the double-union pipeline with a trailing group-by-key step. */
+  @Test
+  public void testDoubleUnion_EndingWithGBK() throws IOException {
+    runDoubleUnionPipeline(true);
+  }
+
+  /** Runs the double-union pipeline without a trailing group-by-key step. */
+  @Test
+  public void testDoubleUnion_EndingWithoutGBK() throws IOException {
+    runDoubleUnionPipeline(false);
+  }
+
+  /**
+   * Uses one input in three branches: two tables that are unioned and grouped, plus an
+   * identity copy that is unioned in after ungrouping. Asserts that the final table holds
+   * three times {@link #NUM_ELEMENTS} entries.
+   *
+   * @param endWithGBK whether to append a {@code groupByKey().ungroup()} step to the final table
+   */
+  private void runDoubleUnionPipeline(boolean endWithGBK) throws IOException {
+    PCollection<String> source = pipeline.readTextFile(tmpDir.copyResourceFileName("set1.txt"));
+    PTable<String, String> tableA = source.parallelDo("to table A", new ToTableFn(), tableType);
+    PTable<String, String> tableB = source.parallelDo("to table B", new ToTableFn(), tableType);
+
+    PGroupedTable<String, String> grouped = tableA.union(tableB).groupByKey();
+    PCollection<String> fromGroups = grouped.parallelDo("ungroup before union",
+        new FromGroupedTableFn(), elementType);
+    PCollection<String> identityCopy = source.parallelDo("fake id",
+        IdentityFn.<String> getInstance(), elementType);
+    PCollection<String> ungrouped = fromGroups.union(identityCopy);
+
+    PTable<String, String> result = ungrouped.parallelDo("union back to table", new ToTableFn(),
+        tableType);
+    if (endWithGBK) {
+      result = result.groupByKey().ungroup();
+    }
+
+    assertEquals(3 * NUM_ELEMENTS, getCount(result));
+  }
+
+  /** Materializes the collection and returns the number of elements it yields. */
+  private int getCount(PCollection<?> pcollection) {
+    int total = 0;
+    for (Object ignored : pcollection.materialize()) {
+      total++;
+    }
+    return total;
+  }
+
+  /** Maps each string to a pair that uses the string as both key and value. */
+  private static class ToTableFn extends MapFn<String, Pair<String, String>> {
+
+    @Override
+    public Pair<String, String> map(String input) {
+      Pair<String, String> keyedBySelf = Pair.of(input, input);
+      return keyedBySelf;
+    }
+
+  }
+
+  /** Emits every value of a grouped entry and discards the key. */
+  private static class FromGroupedTableFn extends DoFn<Pair<String, Iterable<String>>, String> {
+
+    @Override
+    public void process(Pair<String, Iterable<String>> input, Emitter<String> emitter) {
+      Iterable<String> groupValues = input.second();
+      for (String groupValue : groupValues) {
+        emitter.emit(groupValue);
+      }
+    }
+
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/UnionIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/UnionIT.java b/crunch-core/src/it/java/org/apache/crunch/UnionIT.java
new file mode 100644
index 0000000..1c60a1b
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/UnionIT.java
@@ -0,0 +1,136 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.hamcrest.Matchers.is;
+import static org.junit.Assert.assertThat;
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.crunch.fn.Aggregators;
+import org.apache.crunch.fn.IdentityFn;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.test.Tests;
+import org.apache.crunch.types.avro.Avros;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.ImmutableMultiset;
+
+
+/**
+ * Integration tests for unions of collections and tables read from two word sources.
+ */
+public class UnionIT {
+
+  @Rule
+  public TemporaryPath tmpDir = TemporaryPaths.create();
+  private MRPipeline pipeline;
+  private PCollection<String> words1;
+  private PCollection<String> words2;
+
+  /** Creates the pipeline and reads the two word sources used by the tests. */
+  @Before
+  public void setUp() throws IOException {
+    pipeline = new MRPipeline(UnionIT.class, tmpDir.getDefaultConfiguration());
+    String firstSource = tmpDir.copyResourceFileName(Tests.resource(this, "src1.txt"));
+    words1 = pipeline.readTextFile(firstSource);
+    String secondSource = tmpDir.copyResourceFileName(Tests.resource(this, "src2.txt"));
+    words2 = pipeline.readTextFile(secondSource);
+  }
+
+  /** Finishes the pipeline after each test. */
+  @After
+  public void tearDown() {
+    pipeline.done();
+  }
+
+  /** Unions the two word collections after converting both to Avro string types. */
+  @Test
+  public void testUnion() throws Exception {
+    IdentityFn<String> identity = IdentityFn.getInstance();
+    words1 = words1.parallelDo(identity, Avros.strings());
+    words2 = words2.parallelDo(identity, Avros.strings());
+
+    ImmutableMultiset<String> actual = ImmutableMultiset.copyOf(words1.union(words2).materialize());
+
+    assertThat(actual.elementSet().size(), is(3));
+    assertThat(actual.count("a1"), is(4));
+    assertThat(actual.count("b2"), is(2));
+    assertThat(actual.count("c3"), is(2));
+  }
+
+  /** Unions the two sources after each has been keyed by first letter. */
+  @Test
+  public void testTableUnion() throws IOException {
+    PTable<String, String> union = byFirstLetter(words1).union(byFirstLetter(words2));
+
+    ImmutableMultiset<Pair<String, String>> actual = ImmutableMultiset.copyOf(union.materialize());
+
+    assertThat(actual.elementSet().size(), is(3));
+    assertThat(actual.count(Pair.of("a", "1")), is(4));
+    assertThat(actual.count(Pair.of("b", "2")), is(2));
+    assertThat(actual.count(Pair.of("c", "3")), is(2));
+  }
+
+  /** Unions the raw collections, then keys by first letter, groups and concatenates the values. */
+  @Test
+  public void testUnionThenGroupByKey() throws IOException {
+    PTable<String, String> keyed = byFirstLetter(words1.union(words2));
+    PTable<String, String> combined = keyed.groupByKey()
+        .combineValues(Aggregators.STRING_CONCAT("", true));
+
+    Map<String, String> actual = combined.materializeToMap();
+
+    Map<String, String> expected = ImmutableMap.of("a", "1111", "b", "22", "c", "33");
+    assertThat(actual, is(expected));
+  }
+
+  /** Keys each source by first letter, unions the tables, then groups and concatenates the values. */
+  @Test
+  public void testTableUnionThenGroupByKey() throws IOException {
+    PTable<String, String> union = byFirstLetter(words1).union(byFirstLetter(words2));
+    PTable<String, String> combined = union.groupByKey()
+        .combineValues(Aggregators.STRING_CONCAT("", true));
+
+    Map<String, String> actual = combined.materializeToMap();
+
+    Map<String, String> expected = ImmutableMap.of("a", "1111", "b", "22", "c", "33");
+    assertThat(actual, is(expected));
+  }
+
+  /** Builds an Avro-typed table that maps each value's first character to the rest of the value. */
+  private static PTable<String, String> byFirstLetter(PCollection<String> values) {
+    return values.parallelDo("byFirstLetter", new FirstLetterKeyFn(),
+        Avros.tableOf(Avros.strings(), Avros.strings()));
+  }
+
+  /**
+   * Emits (first character, remaining characters) for every input longer than one character;
+   * shorter inputs produce no output.
+   */
+  private static class FirstLetterKeyFn extends DoFn<String, Pair<String, String>> {
+    @Override
+    public void process(String input, Emitter<Pair<String, String>> emitter) {
+      if (input.length() <= 1) {
+        return;
+      }
+      String firstLetter = input.substring(0, 1);
+      String remainder = input.substring(1);
+      emitter.emit(Pair.of(firstLetter, remainder));
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/UnionResultsIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/UnionResultsIT.java b/crunch-core/src/it/java/org/apache/crunch/UnionResultsIT.java
new file mode 100644
index 0000000..df0511a
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/UnionResultsIT.java
@@ -0,0 +1,80 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.io.At;
+import org.apache.crunch.test.CrunchTestSupport;
+import org.apache.crunch.types.writable.Writables;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+
+/**
+ * Integration test for unioning the output of a grouped (count) job with the output of a
+ * map-only job.
+ */
+public class UnionResultsIT extends CrunchTestSupport implements Serializable {
+
+  /**
+   * Pairs every input string with the constant value {@code 10L}. Despite the class name,
+   * the length of the string is not computed.
+   */
+  static class StringLengthMapFn extends MapFn<String, Pair<String, Long>> {
+
+    @Override
+    public Pair<String, Long> map(String input) {
+      return new Pair<String, Long>(input, 10L);
+    }
+  }
+
+  /**
+   * Tests combining a GBK output with a map-only job output into a single
+   * unioned collection.
+   */
+  @Test
+  public void testUnionOfGroupedOutputAndNonGroupedOutput() throws IOException {
+    String inputPath = tempDir.copyResourceFileName("set1.txt");
+    String inputPath2 = tempDir.copyResourceFileName("set2.txt");
+
+    // Configure the pipeline from the test's temporary directory, consistent with the
+    // other integration tests in this change.
+    Pipeline pipeline = new MRPipeline(UnionResultsIT.class, tempDir.getDefaultConfiguration());
+
+    PCollection<String> set1Lines = pipeline.read(At.textFile(inputPath, Writables.strings()));
+    PCollection<Pair<String, Long>> set1Lengths = set1Lines.parallelDo(new StringLengthMapFn(),
+        Writables.pairs(Writables.strings(), Writables.longs()));
+    PCollection<Pair<String, Long>> set2Counts = pipeline.read(At.textFile(inputPath2, Writables.strings())).count();
+
+    PCollection<Pair<String, Long>> union = set1Lengths.union(set2Counts);
+
+    List<Pair<String, Long>> unionValues = Lists.newArrayList(union.materialize());
+    assertEquals(7, unionValues.size());
+
+    // The 10L entries come from StringLengthMapFn applied to set1; the 1L entries are the
+    // per-line counts computed over set2.
+    Set<Pair<String, Long>> expectedPairs = Sets.newHashSet();
+    expectedPairs.add(Pair.of("b", 10L));
+    expectedPairs.add(Pair.of("c", 10L));
+    expectedPairs.add(Pair.of("a", 10L));
+    expectedPairs.add(Pair.of("e", 10L));
+    expectedPairs.add(Pair.of("a", 1L));
+    expectedPairs.add(Pair.of("c", 1L));
+    expectedPairs.add(Pair.of("d", 1L));
+
+    assertEquals(expectedPairs, Sets.newHashSet(unionValues));
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/WordCountIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/WordCountIT.java b/crunch-core/src/it/java/org/apache/crunch/WordCountIT.java
new file mode 100644
index 0000000..c646663
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/WordCountIT.java
@@ -0,0 +1,171 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.List;
+
+import org.apache.crunch.fn.Aggregators;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.io.At;
+import org.apache.crunch.io.To;
+import org.apache.crunch.lib.Aggregate;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.crunch.types.avro.AvroTypeFamily;
+import org.apache.crunch.types.writable.WritableTypeFamily;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+import com.google.common.io.Files;
+
+/**
+ * Integration tests that run a word count over {@code shakes.txt} with the Writable and Avro
+ * type families, optionally followed by a second aggregation stage.
+ */
+public class WordCountIT {
+  @Rule
+  public TemporaryPath tmpDir = TemporaryPaths.create();
+
+  /** Counters incremented by the word-splitting function in {@link #wordCount}. */
+  enum WordCountStats {
+    /** Incremented once for every emitted word equal to {@code "and"}. */
+    ANDS
+  }
+
+  /**
+   * Splits every line on runs of whitespace and counts the occurrences of each token.
+   * Blank lines and lines with leading whitespace yield an empty-string token, which is
+   * counted like any other word.
+   *
+   * @param words the input lines
+   * @param typeFamily the type family that supplies the string type of the intermediate collection
+   * @return a table from token to number of occurrences
+   */
+  public static PTable<String, Long> wordCount(PCollection<String> words, PTypeFamily typeFamily) {
+    return Aggregate.count(words.parallelDo(new DoFn<String, String>() {
+
+      @Override
+      public void process(String line, Emitter<String> emitter) {
+        for (String word : line.split("\\s+")) {
+          emitter.emit(word);
+          if ("and".equals(word)) {
+            increment(WordCountStats.ANDS);
+          }
+        }
+      }
+    }, typeFamily.strings()));
+  }
+
+  /**
+   * Re-keys every (word, count) entry by the first character of the word. Entries whose word
+   * is empty are dropped.
+   *
+   * @param ptable the word-count table
+   * @return a table with the same table type, keyed by first character
+   */
+  public static PTable<String, Long> substr(PTable<String, Long> ptable) {
+    return ptable.parallelDo(new DoFn<Pair<String, Long>, Pair<String, Long>>() {
+
+      @Override
+      public void process(Pair<String, Long> input, Emitter<Pair<String, Long>> emitter) {
+        if (input.first().length() > 0) {
+          emitter.emit(Pair.of(input.first().substring(0, 1), input.second()));
+        }
+      }
+    }, ptable.getPTableType());
+  }
+
+  /** When set, {@link #run} adds a second stage that sums the counts by first character. */
+  private boolean runSecond = false;
+  /** When set, {@link #run} writes the word counts via {@code To.textFile} instead of the pipeline. */
+  private boolean useToOutput = false;
+
+  @Test
+  public void testWritables() throws IOException {
+    run(new MRPipeline(WordCountIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance());
+  }
+
+  @Test
+  public void testWritablesWithSecond() throws IOException {
+    runSecond = true;
+    run(new MRPipeline(WordCountIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance());
+  }
+
+  @Test
+  public void testWritablesWithSecondUseToOutput() throws IOException {
+    runSecond = true;
+    useToOutput = true;
+    run(new MRPipeline(WordCountIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance());
+  }
+
+  @Test
+  public void testAvro() throws IOException {
+    run(new MRPipeline(WordCountIT.class, tmpDir.getDefaultConfiguration()), AvroTypeFamily.getInstance());
+  }
+
+  @Test
+  public void testAvroWithSecond() throws IOException {
+    runSecond = true;
+    run(new MRPipeline(WordCountIT.class, tmpDir.getDefaultConfiguration()), AvroTypeFamily.getInstance());
+  }
+
+  @Test
+  public void testWithTopWritable() throws IOException {
+    runWithTop(WritableTypeFamily.getInstance());
+  }
+
+  @Test
+  public void testWithTopAvro() throws IOException {
+    runWithTop(AvroTypeFamily.getInstance());
+  }
+
+  /**
+   * Counts the words of {@code shakes.txt} and checks the five entries returned by
+   * {@code Aggregate.top(wordCount, 5, true)} against the expected list.
+   *
+   * @param tf the type family to read and count with
+   * @throws IOException if the input resource cannot be copied
+   */
+  public void runWithTop(PTypeFamily tf) throws IOException {
+    Pipeline pipeline = new MRPipeline(WordCountIT.class, tmpDir.getDefaultConfiguration());
+    String inputPath = tmpDir.copyResourceFileName("shakes.txt");
+
+    PCollection<String> shakespeare = pipeline.read(At.textFile(inputPath, tf.strings()));
+    PTable<String, Long> wordCount = wordCount(shakespeare, tf);
+    List<Pair<String, Long>> top5 = Lists.newArrayList(Aggregate.top(wordCount, 5, true).materialize());
+    assertEquals(
+        ImmutableList.of(Pair.of("", 1470L), Pair.of("the", 620L), Pair.of("and", 427L), Pair.of("of", 396L),
+            Pair.of("to", 367L)), top5);
+  }
+
+  /**
+   * Runs the word count over {@code shakes.txt}, optionally followed by the first-character
+   * aggregation, and verifies the pipeline result, the number of stages, the {@code ANDS}
+   * counter (single-stage runs only) and the presence of the Macbeth count in the output.
+   *
+   * @param pipeline the pipeline to build and execute
+   * @param typeFamily the type family to read and count with
+   * @throws IOException if an I/O step, such as reading the output file, fails
+   */
+  public void run(Pipeline pipeline, PTypeFamily typeFamily) throws IOException {
+    String inputPath = tmpDir.copyResourceFileName("shakes.txt");
+    String outputPath = tmpDir.getFileName("output");
+
+    PCollection<String> shakespeare = pipeline.read(At.textFile(inputPath, typeFamily.strings()));
+    PTable<String, Long> wordCount = wordCount(shakespeare, typeFamily);
+    if (useToOutput) {
+      wordCount.write(To.textFile(outputPath));
+    } else {
+      pipeline.writeTextFile(wordCount, outputPath);
+    }
+
+    if (runSecond) {
+      String substrPath = tmpDir.getFileName("substr");
+      PTable<String, Long> we = substr(wordCount).groupByKey().combineValues(Aggregators.SUM_LONGS());
+      pipeline.writeTextFile(we, substrPath);
+    }
+    PipelineResult res = pipeline.done();
+    assertTrue(res.succeeded());
+    List<PipelineResult.StageResult> stageResults = res.getStageResults();
+    if (runSecond) {
+      assertEquals(2, stageResults.size());
+    } else {
+      assertEquals(1, stageResults.size());
+      assertEquals(427, stageResults.get(0).getCounterValue(WordCountStats.ANDS));
+    }
+
+    // Accept either textual rendering of the (Macbeth, 28) entry in the word-count output.
+    File outputFile = new File(outputPath, "part-r-00000");
+    List<String> lines = Files.readLines(outputFile, Charset.defaultCharset());
+    boolean passed = false;
+    for (String line : lines) {
+      if (line.startsWith("Macbeth\t28") || line.startsWith("[Macbeth,28]")) {
+        passed = true;
+        break;
+      }
+    }
+    assertTrue(passed);
+  }
+}
[05/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/avro/Avros.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/avro/Avros.java b/crunch/src/main/java/org/apache/crunch/types/avro/Avros.java
deleted file mode 100644
index fc30eaf..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/avro/Avros.java
+++ /dev/null
@@ -1,709 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.avro;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.Collection;
-import java.util.List;
-import java.util.Map;
-import java.util.UUID;
-
-import org.apache.avro.Schema;
-import org.apache.avro.Schema.Type;
-import org.apache.avro.generic.GenericData;
-import org.apache.avro.generic.GenericRecord;
-import org.apache.avro.reflect.ReflectData;
-import org.apache.avro.specific.SpecificRecord;
-import org.apache.avro.util.Utf8;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.crunch.MapFn;
-import org.apache.crunch.Pair;
-import org.apache.crunch.Tuple;
-import org.apache.crunch.Tuple3;
-import org.apache.crunch.Tuple4;
-import org.apache.crunch.TupleN;
-import org.apache.crunch.fn.CompositeMapFn;
-import org.apache.crunch.fn.IdentityFn;
-import org.apache.crunch.types.CollectionDeepCopier;
-import org.apache.crunch.types.DeepCopier;
-import org.apache.crunch.types.MapDeepCopier;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.PTypes;
-import org.apache.crunch.types.TupleDeepCopier;
-import org.apache.crunch.types.TupleFactory;
-import org.apache.crunch.types.writable.WritableDeepCopier;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.mapreduce.TaskInputOutputContext;
-import org.apache.hadoop.util.ReflectionUtils;
-
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.ImmutableMap;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-
-/**
- * Defines static methods that are analogous to the methods defined in
- * {@link AvroTypeFamily} for convenient static importing.
- *
- */
-public class Avros {
-
- /**
- * Older versions of Avro (i.e., before 1.7.0) do not support schemas that are
- * composed of a mix of specific and reflection-based schemas. This bit
- * controls whether or not we allow Crunch jobs to be created that involve
- * mixing specific and reflection-based schemas and can be overridden by the
- * client developer.
- */
- public static final boolean CAN_COMBINE_SPECIFIC_AND_REFLECT_SCHEMAS;
-
- static {
- CAN_COMBINE_SPECIFIC_AND_REFLECT_SCHEMAS = AvroCapabilities.canDecodeSpecificSchemaWithReflectDatumReader();
- }
-
- /**
- * The instance we use for generating reflected schemas. May be modified by
- * clients (e.g., Scrunch.)
- */
- public static ReflectDataFactory REFLECT_DATA_FACTORY = new ReflectDataFactory();
-
- /**
- * The name of the configuration parameter that tracks which reflection
- * factory to use.
- */
- public static final String REFLECT_DATA_FACTORY_CLASS = "crunch.reflectdatafactory";
-
- public static void configureReflectDataFactory(Configuration conf) {
- conf.setClass(REFLECT_DATA_FACTORY_CLASS, REFLECT_DATA_FACTORY.getClass(), ReflectDataFactory.class);
- }
-
- public static ReflectDataFactory getReflectDataFactory(Configuration conf) {
- return (ReflectDataFactory) ReflectionUtils.newInstance(
- conf.getClass(REFLECT_DATA_FACTORY_CLASS, ReflectDataFactory.class), conf);
- }
-
- public static void checkCombiningSpecificAndReflectionSchemas() {
- if (!CAN_COMBINE_SPECIFIC_AND_REFLECT_SCHEMAS) {
- throw new IllegalStateException("Crunch does not support running jobs that"
- + " contain a mixture of reflection-based and avro-generated data types."
- + " Please consider turning your reflection-based type into an avro-generated"
- + " type and using that generated type instead."
- + " If the version of Avro you are using is 1.7.0 or greater, you can enable"
- + " combined schemas by setting the Avros.CAN_COMBINE_SPECIFIC_AND_REFLECT_SCHEMAS" + " field to 'true'.");
- }
- }
-
- public static MapFn<CharSequence, String> UTF8_TO_STRING = new MapFn<CharSequence, String>() {
- @Override
- public String map(CharSequence input) {
- return input.toString();
- }
- };
-
- public static MapFn<String, Utf8> STRING_TO_UTF8 = new MapFn<String, Utf8>() {
- @Override
- public Utf8 map(String input) {
- return new Utf8(input);
- }
- };
-
- public static MapFn<Object, ByteBuffer> BYTES_IN = new MapFn<Object, ByteBuffer>() {
- @Override
- public ByteBuffer map(Object input) {
- if (input instanceof ByteBuffer) {
- return (ByteBuffer) input;
- }
- return ByteBuffer.wrap((byte[]) input);
- }
- };
-
- private static final AvroType<String> strings = new AvroType<String>(String.class, Schema.create(Schema.Type.STRING),
- UTF8_TO_STRING, STRING_TO_UTF8, new DeepCopier.NoOpDeepCopier<String>());
- private static final AvroType<Void> nulls = create(Void.class, Schema.Type.NULL);
- private static final AvroType<Long> longs = create(Long.class, Schema.Type.LONG);
- private static final AvroType<Integer> ints = create(Integer.class, Schema.Type.INT);
- private static final AvroType<Float> floats = create(Float.class, Schema.Type.FLOAT);
- private static final AvroType<Double> doubles = create(Double.class, Schema.Type.DOUBLE);
- private static final AvroType<Boolean> booleans = create(Boolean.class, Schema.Type.BOOLEAN);
- private static final AvroType<ByteBuffer> bytes = new AvroType<ByteBuffer>(ByteBuffer.class,
- Schema.create(Schema.Type.BYTES), BYTES_IN, IdentityFn.getInstance(), new DeepCopier.NoOpDeepCopier<ByteBuffer>());
-
- private static final Map<Class<?>, PType<?>> PRIMITIVES = ImmutableMap.<Class<?>, PType<?>> builder()
- .put(String.class, strings).put(Long.class, longs).put(Integer.class, ints).put(Float.class, floats)
- .put(Double.class, doubles).put(Boolean.class, booleans).put(ByteBuffer.class, bytes).build();
-
- private static final Map<Class<?>, AvroType<?>> EXTENSIONS = Maps.newHashMap();
-
- public static <T> void register(Class<T> clazz, AvroType<T> ptype) {
- EXTENSIONS.put(clazz, ptype);
- }
-
- public static <T> PType<T> getPrimitiveType(Class<T> clazz) {
- return (PType<T>) PRIMITIVES.get(clazz);
- }
-
- static <T> boolean isPrimitive(AvroType<T> avroType) {
- return avroType.getTypeClass().isPrimitive() || PRIMITIVES.containsKey(avroType.getTypeClass());
- }
-
- private static <T> AvroType<T> create(Class<T> clazz, Schema.Type schemaType) {
- return new AvroType<T>(clazz, Schema.create(schemaType), new DeepCopier.NoOpDeepCopier<T>());
- }
-
- public static final AvroType<Void> nulls() {
- return nulls;
- }
-
- public static final AvroType<String> strings() {
- return strings;
- }
-
- public static final AvroType<Long> longs() {
- return longs;
- }
-
- public static final AvroType<Integer> ints() {
- return ints;
- }
-
- public static final AvroType<Float> floats() {
- return floats;
- }
-
- public static final AvroType<Double> doubles() {
- return doubles;
- }
-
- public static final AvroType<Boolean> booleans() {
- return booleans;
- }
-
- public static final AvroType<ByteBuffer> bytes() {
- return bytes;
- }
-
- public static final <T> AvroType<T> records(Class<T> clazz) {
- if (EXTENSIONS.containsKey(clazz)) {
- return (AvroType<T>) EXTENSIONS.get(clazz);
- }
- return containers(clazz);
- }
-
- public static final AvroType<GenericData.Record> generics(Schema schema) {
- return new AvroType<GenericData.Record>(GenericData.Record.class, schema, new AvroDeepCopier.AvroGenericDeepCopier(
- schema));
- }
-
- public static final <T> AvroType<T> containers(Class<T> clazz) {
- if (SpecificRecord.class.isAssignableFrom(clazz)) {
- return (AvroType<T>) specifics((Class<SpecificRecord>) clazz);
- }
- return reflects(clazz);
- }
-
- public static final <T extends SpecificRecord> AvroType<T> specifics(Class<T> clazz) {
- T t = ReflectionUtils.newInstance(clazz, null);
- Schema schema = t.getSchema();
- return new AvroType<T>(clazz, schema, new AvroDeepCopier.AvroSpecificDeepCopier<T>(clazz, schema));
- }
-
- public static final <T> AvroType<T> reflects(Class<T> clazz) {
- Schema schema = REFLECT_DATA_FACTORY.getReflectData().getSchema(clazz);
- return new AvroType<T>(clazz, schema, new AvroDeepCopier.AvroReflectDeepCopier<T>(clazz, schema));
- }
-
- private static class BytesToWritableMapFn<T extends Writable> extends MapFn<Object, T> {
- private static final Log LOG = LogFactory.getLog(BytesToWritableMapFn.class);
-
- private final Class<T> writableClazz;
-
- public BytesToWritableMapFn(Class<T> writableClazz) {
- this.writableClazz = writableClazz;
- }
-
- @Override
- public T map(Object input) {
- ByteBuffer byteBuffer = BYTES_IN.map(input);
- T instance = ReflectionUtils.newInstance(writableClazz, null);
- try {
- instance.readFields(new DataInputStream(new ByteArrayInputStream(byteBuffer.array(),
- byteBuffer.arrayOffset(), byteBuffer.limit())));
- } catch (IOException e) {
- LOG.error("Exception thrown reading instance of: " + writableClazz, e);
- }
- return instance;
- }
- }
-
- private static class WritableToBytesMapFn<T extends Writable> extends MapFn<T, ByteBuffer> {
- private static final Log LOG = LogFactory.getLog(WritableToBytesMapFn.class);
-
- @Override
- public ByteBuffer map(T input) {
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- DataOutputStream das = new DataOutputStream(baos);
- try {
- input.write(das);
- } catch (IOException e) {
- LOG.error("Exception thrown converting Writable to bytes", e);
- }
- return ByteBuffer.wrap(baos.toByteArray());
- }
- }
-
- public static final <T extends Writable> AvroType<T> writables(Class<T> clazz) {
- return new AvroType<T>(clazz, Schema.create(Schema.Type.BYTES), new BytesToWritableMapFn<T>(clazz),
- new WritableToBytesMapFn<T>(), new WritableDeepCopier<T>(clazz));
- }
-
- private static class GenericDataArrayToCollection<T> extends MapFn<Object, Collection<T>> {
-
- private final MapFn<Object, T> mapFn;
-
- public GenericDataArrayToCollection(MapFn<Object, T> mapFn) {
- this.mapFn = mapFn;
- }
-
- @Override
- public void configure(Configuration conf) {
- mapFn.configure(conf);
- }
-
- @Override
- public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
- mapFn.setContext(context);
- }
-
- @Override
- public void initialize() {
- mapFn.initialize();
- }
-
- @Override
- public Collection<T> map(Object input) {
- Collection<T> ret = Lists.newArrayList();
- if (input instanceof Collection) {
- for (Object in : (Collection<Object>) input) {
- ret.add(mapFn.map(in));
- }
- } else {
- // Assume it is an array
- Object[] arr = (Object[]) input;
- for (Object in : arr) {
- ret.add(mapFn.map(in));
- }
- }
- return ret;
- }
- }
-
- private static class CollectionToGenericDataArray extends MapFn<Collection<?>, GenericData.Array<?>> {
-
- private final MapFn mapFn;
- private final String jsonSchema;
- private transient Schema schema;
-
- public CollectionToGenericDataArray(Schema schema, MapFn mapFn) {
- this.mapFn = mapFn;
- this.jsonSchema = schema.toString();
- }
-
- @Override
- public void configure(Configuration conf) {
- mapFn.configure(conf);
- }
-
- @Override
- public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
- mapFn.setContext(context);
- }
-
- @Override
- public void initialize() {
- mapFn.initialize();
- }
-
- @Override
- public GenericData.Array<?> map(Collection<?> input) {
- if (schema == null) {
- schema = new Schema.Parser().parse(jsonSchema);
- }
- GenericData.Array array = new GenericData.Array(input.size(), schema);
- for (Object in : input) {
- array.add(mapFn.map(in));
- }
- return array;
- }
- }
-
- public static final <T> AvroType<Collection<T>> collections(PType<T> ptype) {
- AvroType<T> avroType = (AvroType<T>) ptype;
- Schema collectionSchema = Schema.createArray(allowNulls(avroType.getSchema()));
- GenericDataArrayToCollection<T> input = new GenericDataArrayToCollection<T>(avroType.getInputMapFn());
- CollectionToGenericDataArray output = new CollectionToGenericDataArray(collectionSchema, avroType.getOutputMapFn());
- return new AvroType(Collection.class, collectionSchema, input, output, new CollectionDeepCopier<T>(ptype), ptype);
- }
-
- private static class AvroMapToMap<T> extends MapFn<Map<CharSequence, Object>, Map<String, T>> {
- private final MapFn<Object, T> mapFn;
-
- public AvroMapToMap(MapFn<Object, T> mapFn) {
- this.mapFn = mapFn;
- }
-
- @Override
- public void configure(Configuration conf) {
- mapFn.configure(conf);
- }
-
- @Override
- public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
- mapFn.setContext(context);
- }
-
- @Override
- public void initialize() {
- mapFn.initialize();
- }
-
- @Override
- public Map<String, T> map(Map<CharSequence, Object> input) {
- Map<String, T> out = Maps.newHashMap();
- for (Map.Entry<CharSequence, Object> e : input.entrySet()) {
- out.put(e.getKey().toString(), mapFn.map(e.getValue()));
- }
- return out;
- }
- }
-
- private static class MapToAvroMap<T> extends MapFn<Map<String, T>, Map<Utf8, Object>> {
- private final MapFn<T, Object> mapFn;
-
- public MapToAvroMap(MapFn<T, Object> mapFn) {
- this.mapFn = mapFn;
- }
-
- @Override
- public void configure(Configuration conf) {
- mapFn.configure(conf);
- }
-
- @Override
- public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
- mapFn.setContext(context);
- }
-
- @Override
- public void initialize() {
- this.mapFn.initialize();
- }
-
- @Override
- public Map<Utf8, Object> map(Map<String, T> input) {
- Map<Utf8, Object> out = Maps.newHashMap();
- for (Map.Entry<String, T> e : input.entrySet()) {
- out.put(new Utf8(e.getKey()), mapFn.map(e.getValue()));
- }
- return out;
- }
- }
-
- public static final <T> AvroType<Map<String, T>> maps(PType<T> ptype) {
- AvroType<T> avroType = (AvroType<T>) ptype;
- Schema mapSchema = Schema.createMap(allowNulls(avroType.getSchema()));
- AvroMapToMap<T> inputFn = new AvroMapToMap<T>(avroType.getInputMapFn());
- MapToAvroMap<T> outputFn = new MapToAvroMap<T>(avroType.getOutputMapFn());
- return new AvroType(Map.class, mapSchema, inputFn, outputFn, new MapDeepCopier<T>(ptype), ptype);
- }
-
- private static class GenericRecordToTuple extends MapFn<GenericRecord, Tuple> {
- private final TupleFactory<?> tupleFactory;
- private final List<MapFn> fns;
-
- private transient Object[] values;
-
- public GenericRecordToTuple(TupleFactory<?> tupleFactory, PType<?>... ptypes) {
- this.tupleFactory = tupleFactory;
- this.fns = Lists.newArrayList();
- for (PType<?> ptype : ptypes) {
- AvroType atype = (AvroType) ptype;
- fns.add(atype.getInputMapFn());
- }
- }
-
- @Override
- public void configure(Configuration conf) {
- for (MapFn fn : fns) {
- fn.configure(conf);
- }
- }
-
- @Override
- public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
- for (MapFn fn : fns) {
- fn.setContext(context);
- }
- }
-
- @Override
- public void initialize() {
- for (MapFn fn : fns) {
- fn.initialize();
- }
- this.values = new Object[fns.size()];
- tupleFactory.initialize();
- }
-
- @Override
- public Tuple map(GenericRecord input) {
- for (int i = 0; i < values.length; i++) {
- Object v = input.get(i);
- if (v == null) {
- values[i] = null;
- } else {
- values[i] = fns.get(i).map(v);
- }
- }
- return tupleFactory.makeTuple(values);
- }
- }
-
- private static class TupleToGenericRecord extends MapFn<Tuple, GenericRecord> {
- private final List<MapFn> fns;
- private final List<AvroType> avroTypes;
- private final String jsonSchema;
- private final boolean isReflect;
- private transient Schema schema;
-
- public TupleToGenericRecord(Schema schema, PType<?>... ptypes) {
- this.fns = Lists.newArrayList();
- this.avroTypes = Lists.newArrayList();
- this.jsonSchema = schema.toString();
- boolean reflectFound = false;
- boolean specificFound = false;
- for (PType ptype : ptypes) {
- AvroType atype = (AvroType) ptype;
- fns.add(atype.getOutputMapFn());
- avroTypes.add(atype);
- if (atype.hasReflect()) {
- reflectFound = true;
- }
- if (atype.hasSpecific()) {
- specificFound = true;
- }
- }
- if (specificFound && reflectFound) {
- checkCombiningSpecificAndReflectionSchemas();
- }
- this.isReflect = reflectFound;
- }
-
- @Override
- public void configure(Configuration conf) {
- for (MapFn fn : fns) {
- fn.configure(conf);
- }
- }
-
- @Override
- public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
- for (MapFn fn : fns) {
- fn.setContext(getContext());
- }
- }
-
- @Override
- public void initialize() {
- this.schema = new Schema.Parser().parse(jsonSchema);
- for (MapFn fn : fns) {
- fn.initialize();
- }
- }
-
- private GenericRecord createRecord() {
- if (isReflect) {
- return new ReflectGenericRecord(schema);
- } else {
- return new GenericData.Record(schema);
- }
- }
-
- @Override
- public GenericRecord map(Tuple input) {
- GenericRecord record = createRecord();
- for (int i = 0; i < input.size(); i++) {
- Object v = input.get(i);
- if (v == null) {
- record.put(i, null);
- } else {
- record.put(i, fns.get(i).map(v));
- }
- }
- return record;
- }
- }
-
- public static final <V1, V2> AvroType<Pair<V1, V2>> pairs(PType<V1> p1, PType<V2> p2) {
- Schema schema = createTupleSchema(p1, p2);
- GenericRecordToTuple input = new GenericRecordToTuple(TupleFactory.PAIR, p1, p2);
- TupleToGenericRecord output = new TupleToGenericRecord(schema, p1, p2);
- return new AvroType(Pair.class, schema, input, output, new TupleDeepCopier(Pair.class, p1, p2), p1, p2);
- }
-
- public static final <V1, V2, V3> AvroType<Tuple3<V1, V2, V3>> triples(PType<V1> p1, PType<V2> p2, PType<V3> p3) {
- Schema schema = createTupleSchema(p1, p2, p3);
- return new AvroType(Tuple3.class, schema, new GenericRecordToTuple(TupleFactory.TUPLE3, p1, p2, p3),
- new TupleToGenericRecord(schema, p1, p2, p3), new TupleDeepCopier(Tuple3.class, p1, p2, p3), p1, p2, p3);
- }
-
- public static final <V1, V2, V3, V4> AvroType<Tuple4<V1, V2, V3, V4>> quads(PType<V1> p1, PType<V2> p2, PType<V3> p3,
- PType<V4> p4) {
- Schema schema = createTupleSchema(p1, p2, p3, p4);
- return new AvroType(Tuple4.class, schema, new GenericRecordToTuple(TupleFactory.TUPLE4, p1, p2, p3, p4),
- new TupleToGenericRecord(schema, p1, p2, p3, p4), new TupleDeepCopier(Tuple4.class, p1, p2, p3, p4), p1, p2,
- p3, p4);
- }
-
- public static final AvroType<TupleN> tuples(PType... ptypes) {
- Schema schema = createTupleSchema(ptypes);
- return new AvroType(TupleN.class, schema, new GenericRecordToTuple(TupleFactory.TUPLEN, ptypes),
- new TupleToGenericRecord(schema, ptypes), new TupleDeepCopier(TupleN.class, ptypes), ptypes);
- }
-
- public static <T extends Tuple> AvroType<T> tuples(Class<T> clazz, PType... ptypes) {
- Schema schema = createTupleSchema(ptypes);
- Class[] typeArgs = new Class[ptypes.length];
- for (int i = 0; i < typeArgs.length; i++) {
- typeArgs[i] = ptypes[i].getTypeClass();
- }
- TupleFactory<T> factory = TupleFactory.create(clazz, typeArgs);
- return new AvroType<T>(clazz, schema, new GenericRecordToTuple(factory, ptypes), new TupleToGenericRecord(schema,
- ptypes), new TupleDeepCopier(clazz, ptypes), ptypes);
- }
-
- private static Schema createTupleSchema(PType<?>... ptypes) {
- // Guarantee each tuple schema has a globally unique name
- String tupleName = "tuple" + UUID.randomUUID().toString().replace('-', 'x');
- Schema schema = Schema.createRecord(tupleName, "", "crunch", false);
- List<Schema.Field> fields = Lists.newArrayList();
- for (int i = 0; i < ptypes.length; i++) {
- AvroType atype = (AvroType) ptypes[i];
- Schema fieldSchema = allowNulls(atype.getSchema());
- fields.add(new Schema.Field("v" + i, fieldSchema, "", null));
- }
- schema.setFields(fields);
- return schema;
- }
-
- public static final <S, T> AvroType<T> derived(Class<T> clazz, MapFn<S, T> inputFn, MapFn<T, S> outputFn,
- PType<S> base) {
- AvroType<S> abase = (AvroType<S>) base;
- return new AvroType<T>(clazz, abase.getSchema(), new CompositeMapFn(abase.getInputMapFn(), inputFn),
- new CompositeMapFn(outputFn, abase.getOutputMapFn()), new DeepCopier.NoOpDeepCopier<T>(), base.getSubTypes()
- .toArray(new PType[0]));
- }
-
- public static <T> PType<T> jsons(Class<T> clazz) {
- return PTypes.jsonString(clazz, AvroTypeFamily.getInstance());
- }
-
- public static final <K, V> AvroTableType<K, V> tableOf(PType<K> key, PType<V> value) {
- if (key instanceof PTableType) {
- PTableType ptt = (PTableType) key;
- key = Avros.pairs(ptt.getKeyType(), ptt.getValueType());
- }
- if (value instanceof PTableType) {
- PTableType ptt = (PTableType) value;
- value = Avros.pairs(ptt.getKeyType(), ptt.getValueType());
- }
- AvroType<K> avroKey = (AvroType<K>) key;
- AvroType<V> avroValue = (AvroType<V>) value;
- return new AvroTableType(avroKey, avroValue, Pair.class);
- }
-
- private static final Schema NULL_SCHEMA = Schema.create(Type.NULL);
-
- private static Schema allowNulls(Schema base) {
- if (NULL_SCHEMA.equals(base)) {
- return base;
- }
- return Schema.createUnion(ImmutableList.of(base, NULL_SCHEMA));
- }
-
- private static class ReflectGenericRecord extends GenericData.Record {
-
- public ReflectGenericRecord(Schema schema) {
- super(schema);
- }
-
- @Override
- public int hashCode() {
- return reflectAwareHashCode(this, getSchema());
- }
- }
-
- /*
- * TODO: Remove this once we no longer have to support 1.5.4.
- */
- private static int reflectAwareHashCode(Object o, Schema s) {
- if (o == null)
- return 0; // incomplete datum
- int hashCode = 1;
- switch (s.getType()) {
- case RECORD:
- for (Schema.Field f : s.getFields()) {
- if (f.order() == Schema.Field.Order.IGNORE)
- continue;
- hashCode = hashCodeAdd(hashCode, ReflectData.get().getField(o, f.name(), f.pos()), f.schema());
- }
- return hashCode;
- case ARRAY:
- Collection<?> a = (Collection<?>) o;
- Schema elementType = s.getElementType();
- for (Object e : a)
- hashCode = hashCodeAdd(hashCode, e, elementType);
- return hashCode;
- case UNION:
- return reflectAwareHashCode(o, s.getTypes().get(ReflectData.get().resolveUnion(s, o)));
- case ENUM:
- return s.getEnumOrdinal(o.toString());
- case NULL:
- return 0;
- case STRING:
- return (o instanceof Utf8 ? o : new Utf8(o.toString())).hashCode();
- default:
- return o.hashCode();
- }
- }
-
- /** Add the hash code for an object into an accumulated hash code. */
- private static int hashCodeAdd(int hashCode, Object o, Schema s) {
- return 31 * hashCode + reflectAwareHashCode(o, s);
- }
-
- private Avros() {
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/avro/ReflectDataFactory.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/avro/ReflectDataFactory.java b/crunch/src/main/java/org/apache/crunch/types/avro/ReflectDataFactory.java
deleted file mode 100644
index e973cca..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/avro/ReflectDataFactory.java
+++ /dev/null
@@ -1,41 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.avro;
-
-import org.apache.avro.Schema;
-import org.apache.avro.reflect.ReflectData;
-import org.apache.avro.reflect.ReflectDatumReader;
-import org.apache.avro.reflect.ReflectDatumWriter;
-
-/**
- * A Factory class for constructing Avro reflection-related objects.
- */
-public class ReflectDataFactory {
-
- public ReflectData getReflectData() {
- return ReflectData.AllowNull.get();
- }
-
- public <T> ReflectDatumReader<T> getReader(Schema schema) {
- return new ReflectDatumReader<T>(schema);
- }
-
- public <T> ReflectDatumWriter<T> getWriter(Schema schema) {
- return new ReflectDatumWriter<T>(schema);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/avro/SafeAvroSerialization.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/avro/SafeAvroSerialization.java b/crunch/src/main/java/org/apache/crunch/types/avro/SafeAvroSerialization.java
deleted file mode 100644
index 8bd18b0..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/avro/SafeAvroSerialization.java
+++ /dev/null
@@ -1,145 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.avro;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-import org.apache.avro.Schema;
-import org.apache.avro.io.BinaryDecoder;
-import org.apache.avro.io.BinaryEncoder;
-import org.apache.avro.io.DatumReader;
-import org.apache.avro.io.DatumWriter;
-import org.apache.avro.io.DecoderFactory;
-import org.apache.avro.io.EncoderFactory;
-import org.apache.avro.mapred.AvroJob;
-import org.apache.avro.mapred.AvroKey;
-import org.apache.avro.mapred.AvroValue;
-import org.apache.avro.mapred.AvroWrapper;
-import org.apache.avro.mapred.Pair;
-import org.apache.avro.reflect.ReflectDatumWriter;
-import org.apache.avro.specific.SpecificDatumReader;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.io.serializer.Deserializer;
-import org.apache.hadoop.io.serializer.Serialization;
-import org.apache.hadoop.io.serializer.Serializer;
-import org.apache.hadoop.util.ReflectionUtils;
-
-/** The {@link Serialization} used by jobs configured with {@link AvroJob}. */
-class SafeAvroSerialization<T> extends Configured implements Serialization<AvroWrapper<T>> {
-
- public boolean accept(Class<?> c) {
- return AvroWrapper.class.isAssignableFrom(c);
- }
-
- /**
- * Returns the specified map output deserializer. Defaults to the final output
- * deserializer if no map output schema was specified.
- */
- public Deserializer<AvroWrapper<T>> getDeserializer(Class<AvroWrapper<T>> c) {
- boolean isKey = AvroKey.class.isAssignableFrom(c);
- Configuration conf = getConf();
- Schema schema = isKey ? Pair.getKeySchema(AvroJob.getMapOutputSchema(conf)) : Pair.getValueSchema(AvroJob
- .getMapOutputSchema(conf));
-
- DatumReader<T> datumReader = null;
- if (conf.getBoolean(AvroJob.MAP_OUTPUT_IS_REFLECT, false)) {
- ReflectDataFactory factory = (ReflectDataFactory) ReflectionUtils.newInstance(
- conf.getClass("crunch.reflectdatafactory", ReflectDataFactory.class), conf);
- datumReader = factory.getReader(schema);
- } else {
- datumReader = new SpecificDatumReader<T>(schema);
- }
- return new AvroWrapperDeserializer(datumReader, isKey);
- }
-
- private static final DecoderFactory FACTORY = DecoderFactory.get();
-
- private class AvroWrapperDeserializer implements Deserializer<AvroWrapper<T>> {
-
- private DatumReader<T> reader;
- private BinaryDecoder decoder;
- private boolean isKey;
-
- public AvroWrapperDeserializer(DatumReader<T> reader, boolean isKey) {
- this.reader = reader;
- this.isKey = isKey;
- }
-
- public void open(InputStream in) {
- this.decoder = FACTORY.directBinaryDecoder(in, decoder);
- }
-
- public AvroWrapper<T> deserialize(AvroWrapper<T> wrapper) throws IOException {
- T datum = reader.read(wrapper == null ? null : wrapper.datum(), decoder);
- if (wrapper == null) {
- wrapper = isKey ? new AvroKey<T>(datum) : new AvroValue<T>(datum);
- } else {
- wrapper.datum(datum);
- }
- return wrapper;
- }
-
- public void close() throws IOException {
- decoder.inputStream().close();
- }
- }
-
- /** Returns the specified output serializer. */
- public Serializer<AvroWrapper<T>> getSerializer(Class<AvroWrapper<T>> c) {
- // AvroWrapper used for final output, AvroKey or AvroValue for map output
- boolean isFinalOutput = c.equals(AvroWrapper.class);
- Configuration conf = getConf();
- Schema schema = isFinalOutput ? AvroJob.getOutputSchema(conf) : (AvroKey.class.isAssignableFrom(c) ? Pair
- .getKeySchema(AvroJob.getMapOutputSchema(conf)) : Pair.getValueSchema(AvroJob.getMapOutputSchema(conf)));
-
- ReflectDataFactory factory = Avros.getReflectDataFactory(conf);
- ReflectDatumWriter<T> writer = factory.getWriter(schema);
- return new AvroWrapperSerializer(writer);
- }
-
- private class AvroWrapperSerializer implements Serializer<AvroWrapper<T>> {
- private DatumWriter<T> writer;
- private OutputStream out;
- private BinaryEncoder encoder;
-
- public AvroWrapperSerializer(DatumWriter<T> writer) {
- this.writer = writer;
- }
-
- public void open(OutputStream out) {
- this.out = out;
- this.encoder = new EncoderFactory().configureBlockSize(512).binaryEncoder(out, null);
- }
-
- public void serialize(AvroWrapper<T> wrapper) throws IOException {
- writer.write(wrapper.datum(), encoder);
- // would be a lot faster if the Serializer interface had a flush()
- // method and the Hadoop framework called it when needed rather
- // than for every record.
- encoder.flush();
- }
-
- public void close() throws IOException {
- out.close();
- }
- }
-
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/avro/package-info.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/avro/package-info.java b/crunch/src/main/java/org/apache/crunch/types/avro/package-info.java
deleted file mode 100644
index abaf60f..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/avro/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Business object serialization using Apache Avro.
- */
-package org.apache.crunch.types.avro;
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/package-info.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/package-info.java b/crunch/src/main/java/org/apache/crunch/types/package-info.java
deleted file mode 100644
index b420b03..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Common functionality for business object serialization.
- */
-package org.apache.crunch.types;
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/writable/GenericArrayWritable.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/writable/GenericArrayWritable.java b/crunch/src/main/java/org/apache/crunch/types/writable/GenericArrayWritable.java
deleted file mode 100644
index 8b54008..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/writable/GenericArrayWritable.java
+++ /dev/null
@@ -1,135 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.writable;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-import java.util.Arrays;
-
-import org.apache.commons.lang.builder.HashCodeBuilder;
-import org.apache.crunch.CrunchRuntimeException;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.io.WritableFactories;
-import org.apache.hadoop.io.WritableUtils;
-
-/**
- * A {@link Writable} for marshalling/unmarshalling Collections. Note that
- * element order is <em>undefined</em>!
- *
- * @param <T> The value type
- */
-class GenericArrayWritable<T> implements Writable {
- private Writable[] values;
- private Class<? extends Writable> valueClass;
-
- public GenericArrayWritable(Class<? extends Writable> valueClass) {
- this.valueClass = valueClass;
- }
-
- public GenericArrayWritable() {
- // for deserialization
- }
-
- public void set(Writable[] values) {
- this.values = values;
- }
-
- public Writable[] get() {
- return values;
- }
-
- public void readFields(DataInput in) throws IOException {
- values = new Writable[WritableUtils.readVInt(in)]; // construct values
- if (values.length > 0) {
- int nulls = WritableUtils.readVInt(in);
- if (nulls == values.length) {
- return;
- }
- String valueType = Text.readString(in);
- setValueType(valueType);
- for (int i = 0; i < values.length - nulls; i++) {
- Writable value = WritableFactories.newInstance(valueClass);
- value.readFields(in); // read a value
- values[i] = value; // store it in values
- }
- }
- }
-
- protected void setValueType(String valueType) {
- if (valueClass == null) {
- try {
- valueClass = Class.forName(valueType).asSubclass(Writable.class);
- } catch (ClassNotFoundException e) {
- throw new CrunchRuntimeException(e);
- }
- } else if (!valueType.equals(valueClass.getName())) {
- throw new IllegalStateException("Incoming " + valueType + " is not " + valueClass);
- }
- }
-
- public void write(DataOutput out) throws IOException {
- WritableUtils.writeVInt(out, values.length);
- if (values.length > 0) {
- int nulls = 0;
- for (int i = 0; i < values.length; i++) {
- if (values[i] == null) {
- nulls++;
- }
- }
- WritableUtils.writeVInt(out, nulls);
- if (values.length - nulls > 0) {
- if (valueClass == null) {
- throw new IllegalStateException("Value class not set by constructor or read");
- }
- Text.writeString(out, valueClass.getName());
- for (int i = 0; i < values.length; i++) {
- if (values[i] != null) {
- values[i].write(out);
- }
- }
- }
- }
- }
-
- @Override
- public int hashCode() {
- HashCodeBuilder hcb = new HashCodeBuilder();
- return hcb.append(values).toHashCode();
- }
-
- @Override
- public boolean equals(Object obj) {
- if (this == obj)
- return true;
- if (obj == null)
- return false;
- if (getClass() != obj.getClass())
- return false;
- GenericArrayWritable other = (GenericArrayWritable) obj;
- if (!Arrays.equals(values, other.values))
- return false;
- return true;
- }
-
- @Override
- public String toString() {
- return Arrays.toString(values);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/writable/TextMapWritable.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/writable/TextMapWritable.java b/crunch/src/main/java/org/apache/crunch/types/writable/TextMapWritable.java
deleted file mode 100644
index 1ab51df..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/writable/TextMapWritable.java
+++ /dev/null
@@ -1,88 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.writable;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.io.WritableUtils;
-
-import com.google.common.collect.Maps;
-
-class TextMapWritable<T extends Writable> implements Writable {
-
- private Class<T> valueClazz;
- private final Map<Text, T> instance;
-
- public TextMapWritable() {
- this.instance = Maps.newHashMap();
- }
-
- public TextMapWritable(Class<T> valueClazz) {
- this.valueClazz = valueClazz;
- this.instance = Maps.newHashMap();
- }
-
- public void put(Text txt, T value) {
- instance.put(txt, value);
- }
-
- public Set<Map.Entry<Text, T>> entrySet() {
- return instance.entrySet();
- }
-
- @Override
- public void readFields(DataInput in) throws IOException {
- instance.clear();
- try {
- this.valueClazz = (Class<T>) Class.forName(Text.readString(in));
- } catch (ClassNotFoundException e) {
- throw (IOException) new IOException("Failed map init").initCause(e);
- }
- int entries = WritableUtils.readVInt(in);
- try {
- for (int i = 0; i < entries; i++) {
- Text txt = new Text();
- txt.readFields(in);
- T value = valueClazz.newInstance();
- value.readFields(in);
- instance.put(txt, value);
- }
- } catch (IllegalAccessException e) {
- throw (IOException) new IOException("Failed map init").initCause(e);
- } catch (InstantiationException e) {
- throw (IOException) new IOException("Failed map init").initCause(e);
- }
- }
-
- @Override
- public void write(DataOutput out) throws IOException {
- Text.writeString(out, valueClazz.getName());
- WritableUtils.writeVInt(out, instance.size());
- for (Map.Entry<Text, T> e : instance.entrySet()) {
- e.getKey().write(out);
- e.getValue().write(out);
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/writable/TupleWritable.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/writable/TupleWritable.java b/crunch/src/main/java/org/apache/crunch/types/writable/TupleWritable.java
deleted file mode 100644
index 1c3536b..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/writable/TupleWritable.java
+++ /dev/null
@@ -1,224 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.writable;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-
-import org.apache.commons.lang.builder.HashCodeBuilder;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.io.WritableComparable;
-import org.apache.hadoop.io.WritableUtils;
-
-/**
- * A straight copy of the TupleWritable implementation in the join package,
- * added here because of its package visibility restrictions.
- *
- */
-public class TupleWritable implements WritableComparable<TupleWritable> {
-
- private long written;
- private Writable[] values;
-
- /**
- * Create an empty tuple with no allocated storage for writables.
- */
- public TupleWritable() {
- }
-
- /**
- * Initialize tuple with storage; unknown whether any of them contain
- * "written" values.
- */
- public TupleWritable(Writable[] vals) {
- written = 0L;
- values = vals;
- }
-
- /**
- * Return true if tuple has an element at the position provided.
- */
- public boolean has(int i) {
- return 0 != ((1 << i) & written);
- }
-
- /**
- * Get ith Writable from Tuple.
- */
- public Writable get(int i) {
- return values[i];
- }
-
- /**
- * The number of children in this Tuple.
- */
- public int size() {
- return values.length;
- }
-
- /**
- * {@inheritDoc}
- */
- public boolean equals(Object other) {
- if (other instanceof TupleWritable) {
- TupleWritable that = (TupleWritable) other;
- if (this.size() != that.size() || this.written != that.written) {
- return false;
- }
- for (int i = 0; i < values.length; ++i) {
- if (!has(i))
- continue;
- if (!values[i].equals(that.get(i))) {
- return false;
- }
- }
- return true;
- }
- return false;
- }
-
- public int hashCode() {
- HashCodeBuilder builder = new HashCodeBuilder();
- builder.append(written);
- for (Writable v : values) {
- builder.append(v);
- }
- return builder.toHashCode();
- }
-
- /**
- * Convert Tuple to String as in the following.
- * <tt>[<child1>,<child2>,...,<childn>]</tt>
- */
- public String toString() {
- StringBuffer buf = new StringBuffer("[");
- for (int i = 0; i < values.length; ++i) {
- buf.append(has(i) ? values[i].toString() : "");
- buf.append(",");
- }
- if (values.length != 0)
- buf.setCharAt(buf.length() - 1, ']');
- else
- buf.append(']');
- return buf.toString();
- }
-
- /**
- * Writes each Writable to <code>out</code>. TupleWritable format:
- * {@code
- * <count><type1><type2>...<typen><obj1><obj2>...<objn>
- * }
- */
- public void write(DataOutput out) throws IOException {
- WritableUtils.writeVInt(out, values.length);
- WritableUtils.writeVLong(out, written);
- for (int i = 0; i < values.length; ++i) {
- if (has(i)) {
- Text.writeString(out, values[i].getClass().getName());
- }
- }
- for (int i = 0; i < values.length; ++i) {
- if (has(i)) {
- values[i].write(out);
- }
- }
- }
-
- /**
- * {@inheritDoc}
- */
- @SuppressWarnings("unchecked")
- // No static typeinfo on Tuples
- public void readFields(DataInput in) throws IOException {
- int card = WritableUtils.readVInt(in);
- values = new Writable[card];
- written = WritableUtils.readVLong(in);
- Class<? extends Writable>[] cls = new Class[card];
- try {
- for (int i = 0; i < card; ++i) {
- if (has(i)) {
- cls[i] = Class.forName(Text.readString(in)).asSubclass(Writable.class);
- }
- }
- for (int i = 0; i < card; ++i) {
- if (has(i)) {
- values[i] = cls[i].newInstance();
- values[i].readFields(in);
- }
- }
- } catch (ClassNotFoundException e) {
- throw (IOException) new IOException("Failed tuple init").initCause(e);
- } catch (IllegalAccessException e) {
- throw (IOException) new IOException("Failed tuple init").initCause(e);
- } catch (InstantiationException e) {
- throw (IOException) new IOException("Failed tuple init").initCause(e);
- }
- }
-
- /**
- * Record that the tuple contains an element at the position provided.
- */
- public void setWritten(int i) {
- written |= 1 << i;
- }
-
- /**
- * Record that the tuple does not contain an element at the position provided.
- */
- public void clearWritten(int i) {
- written &= -1 ^ (1 << i);
- }
-
- /**
- * Clear any record of which writables have been written to, without releasing
- * storage.
- */
- public void clearWritten() {
- written = 0L;
- }
-
- @Override
- public int compareTo(TupleWritable o) {
- for (int i = 0; i < values.length; ++i) {
- if (has(i) && !o.has(i)) {
- return 1;
- } else if (!has(i) && o.has(i)) {
- return -1;
- } else {
- Writable v1 = values[i];
- Writable v2 = o.values[i];
- if (v1 != v2 && (v1 != null && !v1.equals(v2))) {
- if (v1 instanceof WritableComparable && v2 instanceof WritableComparable) {
- int cmp = ((WritableComparable) v1).compareTo((WritableComparable) v2);
- if (cmp != 0) {
- return cmp;
- }
- } else {
- int cmp = v1.hashCode() - v2.hashCode();
- if (cmp != 0) {
- return cmp;
- }
- }
- }
- }
- }
- return values.length - o.values.length;
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/writable/WritableDeepCopier.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/writable/WritableDeepCopier.java b/crunch/src/main/java/org/apache/crunch/types/writable/WritableDeepCopier.java
deleted file mode 100644
index 7b6e11b..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/writable/WritableDeepCopier.java
+++ /dev/null
@@ -1,70 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.writable;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.DataInput;
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-
-import org.apache.crunch.CrunchRuntimeException;
-import org.apache.crunch.types.DeepCopier;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Writable;
-
-/**
- * Performs deep copies of Writable values.
- *
- * @param <T> The type of Writable that can be copied
- */
-public class WritableDeepCopier<T extends Writable> implements DeepCopier<T> {
-
- private Class<T> writableClass;
-
- public WritableDeepCopier(Class<T> writableClass) {
- this.writableClass = writableClass;
- }
-
- @Override
- public void initialize(Configuration conf) {
- }
-
- @Override
- public T deepCopy(T source) {
-
- if (source == null) {
- return null;
- }
-
- ByteArrayOutputStream byteOutStream = new ByteArrayOutputStream();
- DataOutputStream dataOut = new DataOutputStream(byteOutStream);
- T copiedValue = null;
- try {
- source.write(dataOut);
- dataOut.flush();
- ByteArrayInputStream byteInStream = new ByteArrayInputStream(byteOutStream.toByteArray());
- DataInput dataInput = new DataInputStream(byteInStream);
- copiedValue = writableClass.newInstance();
- copiedValue.readFields(dataInput);
- } catch (Exception e) {
- throw new CrunchRuntimeException("Error while deep copying " + source, e);
- }
- return copiedValue;
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/writable/WritableGroupedTableType.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/writable/WritableGroupedTableType.java b/crunch/src/main/java/org/apache/crunch/types/writable/WritableGroupedTableType.java
deleted file mode 100644
index 84318d3..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/writable/WritableGroupedTableType.java
+++ /dev/null
@@ -1,85 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.writable;
-
-import org.apache.crunch.GroupingOptions;
-import org.apache.crunch.MapFn;
-import org.apache.crunch.Pair;
-import org.apache.crunch.lib.PTables;
-import org.apache.crunch.types.Converter;
-import org.apache.crunch.types.PGroupedTableType;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.mapreduce.Job;
-
-class WritableGroupedTableType<K, V> extends PGroupedTableType<K, V> {
-
- private final MapFn inputFn;
- private final MapFn outputFn;
- private final Converter converter;
-
- public WritableGroupedTableType(WritableTableType<K, V> tableType) {
- super(tableType);
- WritableType keyType = (WritableType) tableType.getKeyType();
- WritableType valueType = (WritableType) tableType.getValueType();
- this.inputFn = new PairIterableMapFn(keyType.getInputMapFn(), valueType.getInputMapFn());
- this.outputFn = tableType.getOutputMapFn();
- this.converter = new WritablePairConverter(keyType.getSerializationClass(),
- valueType.getSerializationClass());
- }
-
- @Override
- public Class<Pair<K, Iterable<V>>> getTypeClass() {
- return (Class<Pair<K, Iterable<V>>>) Pair.of(null, null).getClass();
- }
-
- @Override
- public Converter getGroupingConverter() {
- return converter;
- }
-
- @Override
- public MapFn getInputMapFn() {
- return inputFn;
- }
-
- @Override
- public MapFn getOutputMapFn() {
- return outputFn;
- }
-
- @Override
- public void initialize(Configuration conf) {
- this.tableType.initialize(conf);
- }
-
- @Override
- public Pair<K, Iterable<V>> getDetachedValue(Pair<K, Iterable<V>> value) {
- return PTables.getGroupedDetachedValue(this, value);
- }
-
- @Override
- public void configureShuffle(Job job, GroupingOptions options) {
- if (options != null) {
- options.configure(job);
- }
- WritableType keyType = (WritableType) tableType.getKeyType();
- WritableType valueType = (WritableType) tableType.getValueType();
- job.setMapOutputKeyClass(keyType.getSerializationClass());
- job.setMapOutputValueClass(valueType.getSerializationClass());
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/writable/WritablePairConverter.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/writable/WritablePairConverter.java b/crunch/src/main/java/org/apache/crunch/types/writable/WritablePairConverter.java
deleted file mode 100644
index 2db0238..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/writable/WritablePairConverter.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.writable;
-
-import org.apache.crunch.Pair;
-import org.apache.crunch.types.Converter;
-
-class WritablePairConverter<K, V> implements Converter<K, V, Pair<K, V>, Pair<K, Iterable<V>>> {
-
- private final Class<K> keyClass;
- private final Class<V> valueClass;
-
- public WritablePairConverter(Class<K> keyClass, Class<V> valueClass) {
- this.keyClass = keyClass;
- this.valueClass = valueClass;
- }
-
- @Override
- public Pair<K, V> convertInput(K key, V value) {
- return Pair.of(key, value);
- }
-
- @Override
- public K outputKey(Pair<K, V> value) {
- return value.first();
- }
-
- @Override
- public V outputValue(Pair<K, V> value) {
- return value.second();
- }
-
- @Override
- public Class<K> getKeyClass() {
- return keyClass;
- }
-
- @Override
- public Class<V> getValueClass() {
- return valueClass;
- }
-
- @Override
- public Pair<K, Iterable<V>> convertIterableInput(K key, Iterable<V> value) {
- return Pair.of(key, value);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/writable/WritableTableType.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/writable/WritableTableType.java b/crunch/src/main/java/org/apache/crunch/types/writable/WritableTableType.java
deleted file mode 100644
index 93e0fd6..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/writable/WritableTableType.java
+++ /dev/null
@@ -1,130 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.writable;
-
-import java.util.List;
-
-import org.apache.commons.lang.builder.HashCodeBuilder;
-import org.apache.crunch.MapFn;
-import org.apache.crunch.Pair;
-import org.apache.crunch.fn.PairMapFn;
-import org.apache.crunch.io.ReadableSourceTarget;
-import org.apache.crunch.io.seq.SeqFileTableSourceTarget;
-import org.apache.crunch.lib.PTables;
-import org.apache.crunch.types.Converter;
-import org.apache.crunch.types.PGroupedTableType;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Writable;
-
-import com.google.common.collect.ImmutableList;
-
-class WritableTableType<K, V> implements PTableType<K, V> {
-
- private final WritableType<K, Writable> keyType;
- private final WritableType<V, Writable> valueType;
- private final MapFn inputFn;
- private final MapFn outputFn;
- private final Converter converter;
-
- public WritableTableType(WritableType<K, Writable> keyType, WritableType<V, Writable> valueType) {
- this.keyType = keyType;
- this.valueType = valueType;
- this.inputFn = new PairMapFn(keyType.getInputMapFn(), valueType.getInputMapFn());
- this.outputFn = new PairMapFn(keyType.getOutputMapFn(), valueType.getOutputMapFn());
- this.converter = new WritablePairConverter(keyType.getSerializationClass(),
- valueType.getSerializationClass());
- }
-
- @Override
- public Class<Pair<K, V>> getTypeClass() {
- return (Class<Pair<K, V>>) Pair.of(null, null).getClass();
- }
-
- @Override
- public List<PType> getSubTypes() {
- return ImmutableList.<PType> of(keyType, valueType);
- }
-
- @Override
- public MapFn getInputMapFn() {
- return inputFn;
- }
-
- @Override
- public MapFn getOutputMapFn() {
- return outputFn;
- }
-
- @Override
- public Converter getConverter() {
- return converter;
- }
-
- @Override
- public PTypeFamily getFamily() {
- return WritableTypeFamily.getInstance();
- }
-
- public PType<K> getKeyType() {
- return keyType;
- }
-
- public PType<V> getValueType() {
- return valueType;
- }
-
- @Override
- public PGroupedTableType<K, V> getGroupedTableType() {
- return new WritableGroupedTableType<K, V>(this);
- }
-
- @Override
- public ReadableSourceTarget<Pair<K, V>> getDefaultFileSource(Path path) {
- return new SeqFileTableSourceTarget<K, V>(path, this);
- }
-
- @Override
- public void initialize(Configuration conf) {
- keyType.initialize(conf);
- valueType.initialize(conf);
- }
-
- @Override
- public Pair<K, V> getDetachedValue(Pair<K, V> value) {
- return PTables.getDetachedValue(this, value);
- }
-
- @Override
- public boolean equals(Object obj) {
- if (obj == null || !(obj instanceof WritableTableType)) {
- return false;
- }
- WritableTableType that = (WritableTableType) obj;
- return keyType.equals(that.keyType) && valueType.equals(that.valueType);
- }
-
- @Override
- public int hashCode() {
- HashCodeBuilder hcb = new HashCodeBuilder();
- return hcb.append(keyType).append(valueType).toHashCode();
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/writable/WritableType.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/writable/WritableType.java b/crunch/src/main/java/org/apache/crunch/types/writable/WritableType.java
deleted file mode 100644
index 734946c..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/writable/WritableType.java
+++ /dev/null
@@ -1,133 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.writable;
-
-import java.util.List;
-
-import org.apache.commons.lang.builder.HashCodeBuilder;
-import org.apache.crunch.MapFn;
-import org.apache.crunch.io.ReadableSourceTarget;
-import org.apache.crunch.io.seq.SeqFileSourceTarget;
-import org.apache.crunch.types.Converter;
-import org.apache.crunch.types.DeepCopier;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Writable;
-
-import com.google.common.collect.ImmutableList;
-
-public class WritableType<T, W extends Writable> implements PType<T> {
-
- private final Class<T> typeClass;
- private final Class<W> writableClass;
- private final Converter converter;
- private final MapFn<W, T> inputFn;
- private final MapFn<T, W> outputFn;
- private final DeepCopier<W> deepCopier;
- private final List<PType> subTypes;
- private boolean initialized = false;
-
- public WritableType(Class<T> typeClass, Class<W> writableClass, MapFn<W, T> inputDoFn,
- MapFn<T, W> outputDoFn, PType... subTypes) {
- this.typeClass = typeClass;
- this.writableClass = writableClass;
- this.inputFn = inputDoFn;
- this.outputFn = outputDoFn;
- this.converter = new WritableValueConverter(writableClass);
- this.deepCopier = new WritableDeepCopier<W>(writableClass);
- this.subTypes = ImmutableList.<PType> builder().add(subTypes).build();
- }
-
- @Override
- public PTypeFamily getFamily() {
- return WritableTypeFamily.getInstance();
- }
-
- @Override
- public Class<T> getTypeClass() {
- return typeClass;
- }
-
- @Override
- public Converter getConverter() {
- return converter;
- }
-
- @Override
- public MapFn getInputMapFn() {
- return inputFn;
- }
-
- @Override
- public MapFn getOutputMapFn() {
- return outputFn;
- }
-
- @Override
- public List<PType> getSubTypes() {
- return subTypes;
- }
-
- public Class<W> getSerializationClass() {
- return writableClass;
- }
-
- @Override
- public ReadableSourceTarget<T> getDefaultFileSource(Path path) {
- return new SeqFileSourceTarget<T>(path, this);
- }
-
- @Override
- public boolean equals(Object obj) {
- if (obj == null || !(obj instanceof WritableType)) {
- return false;
- }
- WritableType wt = (WritableType) obj;
- return (typeClass.equals(wt.typeClass) && writableClass.equals(wt.writableClass) && subTypes
- .equals(wt.subTypes));
- }
-
- @Override
- public void initialize(Configuration conf) {
- this.inputFn.initialize();
- this.outputFn.initialize();
- for (PType subType : subTypes) {
- subType.initialize(conf);
- }
- this.initialized = true;
- }
-
- @Override
- public T getDetachedValue(T value) {
- if (!initialized) {
- throw new IllegalStateException("Cannot call getDetachedValue on an uninitialized PType");
- }
- W writableValue = outputFn.map(value);
- W deepCopy = this.deepCopier.deepCopy(writableValue);
- return inputFn.map(deepCopy);
- }
-
- @Override
- public int hashCode() {
- HashCodeBuilder hcb = new HashCodeBuilder();
- hcb.append(typeClass).append(writableClass).append(subTypes);
- return hcb.toHashCode();
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/writable/WritableTypeFamily.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/writable/WritableTypeFamily.java b/crunch/src/main/java/org/apache/crunch/types/writable/WritableTypeFamily.java
deleted file mode 100644
index a94db96..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/writable/WritableTypeFamily.java
+++ /dev/null
@@ -1,147 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.writable;
-
-import java.nio.ByteBuffer;
-import java.util.Collection;
-import java.util.Map;
-
-import org.apache.crunch.MapFn;
-import org.apache.crunch.Pair;
-import org.apache.crunch.Tuple;
-import org.apache.crunch.Tuple3;
-import org.apache.crunch.Tuple4;
-import org.apache.crunch.TupleN;
-import org.apache.crunch.types.PGroupedTableType;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.crunch.types.PTypeUtils;
-import org.apache.hadoop.io.Writable;
-
-/**
- * The {@link Writable}-based implementation of the
- * {@link org.apache.crunch.types.PTypeFamily} interface.
- */
-public class WritableTypeFamily implements PTypeFamily {
-
- private static final WritableTypeFamily INSTANCE = new WritableTypeFamily();
-
- public static WritableTypeFamily getInstance() {
- return INSTANCE;
- }
-
- // Disallow construction
- private WritableTypeFamily() {
- }
-
- public PType<Void> nulls() {
- return Writables.nulls();
- }
-
- public PType<String> strings() {
- return Writables.strings();
- }
-
- public PType<Long> longs() {
- return Writables.longs();
- }
-
- public PType<Integer> ints() {
- return Writables.ints();
- }
-
- public PType<Float> floats() {
- return Writables.floats();
- }
-
- public PType<Double> doubles() {
- return Writables.doubles();
- }
-
- public PType<Boolean> booleans() {
- return Writables.booleans();
- }
-
- public PType<ByteBuffer> bytes() {
- return Writables.bytes();
- }
-
- public <T> PType<T> records(Class<T> clazz) {
- return Writables.records(clazz);
- }
-
- public <W extends Writable> PType<W> writables(Class<W> clazz) {
- return Writables.writables(clazz);
- }
-
- public <K, V> PTableType<K, V> tableOf(PType<K> key, PType<V> value) {
- return Writables.tableOf(key, value);
- }
-
- public <V1, V2> PType<Pair<V1, V2>> pairs(PType<V1> p1, PType<V2> p2) {
- return Writables.pairs(p1, p2);
- }
-
- public <V1, V2, V3> PType<Tuple3<V1, V2, V3>> triples(PType<V1> p1, PType<V2> p2, PType<V3> p3) {
- return Writables.triples(p1, p2, p3);
- }
-
- public <V1, V2, V3, V4> PType<Tuple4<V1, V2, V3, V4>> quads(PType<V1> p1, PType<V2> p2, PType<V3> p3, PType<V4> p4) {
- return Writables.quads(p1, p2, p3, p4);
- }
-
- public PType<TupleN> tuples(PType<?>... ptypes) {
- return Writables.tuples(ptypes);
- }
-
- public <T> PType<Collection<T>> collections(PType<T> ptype) {
- return Writables.collections(ptype);
- }
-
- public <T> PType<Map<String, T>> maps(PType<T> ptype) {
- return Writables.maps(ptype);
- }
-
- @Override
- public <T> PType<T> as(PType<T> ptype) {
- if (ptype instanceof WritableType || ptype instanceof WritableTableType
- || ptype instanceof WritableGroupedTableType) {
- return ptype;
- }
- if (ptype instanceof PGroupedTableType) {
- PTableType ptt = ((PGroupedTableType) ptype).getTableType();
- return new WritableGroupedTableType((WritableTableType) as(ptt));
- }
- PType<T> prim = Writables.getPrimitiveType(ptype.getTypeClass());
- if (prim != null) {
- return prim;
- }
- return PTypeUtils.convert(ptype, this);
- }
-
- @Override
- public <T extends Tuple> PType<T> tuples(Class<T> clazz, PType<?>... ptypes) {
- return Writables.tuples(clazz, ptypes);
- }
-
- @Override
- public <S, T> PType<T> derived(Class<T> clazz, MapFn<S, T> inputFn, MapFn<T, S> outputFn, PType<S> base) {
- return Writables.derived(clazz, inputFn, outputFn, base);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/writable/WritableValueConverter.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/writable/WritableValueConverter.java b/crunch/src/main/java/org/apache/crunch/types/writable/WritableValueConverter.java
deleted file mode 100644
index 3670b90..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/writable/WritableValueConverter.java
+++ /dev/null
@@ -1,60 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.writable;
-
-import org.apache.crunch.types.Converter;
-import org.apache.hadoop.io.NullWritable;
-
-class WritableValueConverter<W> implements Converter<Object, W, W, Iterable<W>> {
-
- private final Class<W> serializationClass;
-
- public WritableValueConverter(Class<W> serializationClass) {
- this.serializationClass = serializationClass;
- }
-
- @Override
- public W convertInput(Object key, W value) {
- return value;
- }
-
- @Override
- public Object outputKey(W value) {
- return NullWritable.get();
- }
-
- @Override
- public W outputValue(W value) {
- return value;
- }
-
- @Override
- public Class<Object> getKeyClass() {
- return (Class<Object>) (Class<?>) NullWritable.class;
- }
-
- @Override
- public Class<W> getValueClass() {
- return serializationClass;
- }
-
- @Override
- public Iterable<W> convertIterableInput(Object key, Iterable<W> value) {
- return value;
- }
-}
\ No newline at end of file
[03/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/fn/AggregatorsTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/fn/AggregatorsTest.java b/crunch/src/test/java/org/apache/crunch/fn/AggregatorsTest.java
deleted file mode 100644
index 6ee1972..0000000
--- a/crunch/src/test/java/org/apache/crunch/fn/AggregatorsTest.java
+++ /dev/null
@@ -1,239 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.fn;
-
-import static org.apache.crunch.fn.Aggregators.MAX_BIGINTS;
-import static org.apache.crunch.fn.Aggregators.MAX_DOUBLES;
-import static org.apache.crunch.fn.Aggregators.MAX_FLOATS;
-import static org.apache.crunch.fn.Aggregators.MAX_INTS;
-import static org.apache.crunch.fn.Aggregators.MAX_LONGS;
-import static org.apache.crunch.fn.Aggregators.MAX_N;
-import static org.apache.crunch.fn.Aggregators.MIN_BIGINTS;
-import static org.apache.crunch.fn.Aggregators.MIN_DOUBLES;
-import static org.apache.crunch.fn.Aggregators.MIN_FLOATS;
-import static org.apache.crunch.fn.Aggregators.MIN_INTS;
-import static org.apache.crunch.fn.Aggregators.MIN_LONGS;
-import static org.apache.crunch.fn.Aggregators.MIN_N;
-import static org.apache.crunch.fn.Aggregators.STRING_CONCAT;
-import static org.apache.crunch.fn.Aggregators.SUM_BIGINTS;
-import static org.apache.crunch.fn.Aggregators.SUM_DOUBLES;
-import static org.apache.crunch.fn.Aggregators.SUM_FLOATS;
-import static org.apache.crunch.fn.Aggregators.SUM_INTS;
-import static org.apache.crunch.fn.Aggregators.SUM_LONGS;
-import static org.hamcrest.Matchers.closeTo;
-import static org.hamcrest.Matchers.is;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertThat;
-
-import java.math.BigInteger;
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.crunch.Aggregator;
-import org.apache.crunch.CombineFn;
-import org.apache.crunch.Pair;
-import org.apache.crunch.Tuple3;
-import org.apache.crunch.Tuple4;
-import org.apache.crunch.TupleN;
-import org.apache.crunch.impl.mem.emit.InMemoryEmitter;
-import org.junit.Test;
-
-import com.google.common.base.Function;
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.ImmutableSet;
-import com.google.common.collect.Iterables;
-
-
-public class AggregatorsTest {
-
- @Test
- public void testSums2() {
- assertThat(sapply(SUM_INTS(), 1, 2, 3, -4), is(2));
- assertThat(sapply(SUM_LONGS(), 1L, 2L, 3L, -4L, 5000000000L), is(5000000002L));
- assertThat(sapply(SUM_FLOATS(), 1f, 2f, 3f, -4f), is(2f));
- assertThat(sapply(SUM_DOUBLES(), 0.1, 0.2, 0.3), is(closeTo(0.6, 0.00001)));
- assertThat(sapply(SUM_BIGINTS(), bigInt("7"), bigInt("3")), is(bigInt("10")));
- }
-
- @Test
- public void testSums() {
- assertThat(sapply(SUM_LONGS(), 29L, 17L, 1729L), is(1775L));
- assertThat(sapply(SUM_LONGS(), 29L, 7L, 1729L), is(1765L));
- assertThat(sapply(SUM_INTS(), 29, 17, 1729), is(1775));
- assertThat(sapply(SUM_FLOATS(), 29f, 17f, 1729f), is(1775.0f));
- assertThat(sapply(SUM_DOUBLES(), 29.0, 17.0, 1729.0), is(1775.0));
- assertThat(sapply(SUM_BIGINTS(), bigInt("29"), bigInt("17"), bigInt("1729")), is(bigInt("1775")));
- }
-
- @Test
- public void testMax() {
- assertThat(sapply(MAX_LONGS(), 29L, 17L, 1729L), is(1729L));
- assertThat(sapply(MAX_INTS(), 29, 17, 1729), is(1729));
- assertThat(sapply(MAX_FLOATS(), 29f, 17f, 1729f), is(1729.0f));
- assertThat(sapply(MAX_DOUBLES(), 29.0, 17.0, 1729.0), is(1729.0));
- assertThat(sapply(MAX_FLOATS(), 29f, 1745f, 17f, 1729f), is(1745.0f));
- assertThat(sapply(MAX_BIGINTS(), bigInt("29"), bigInt("17"), bigInt("1729")), is(bigInt("1729")));
- }
-
- @Test
- public void testMin() {
- assertThat(sapply(MIN_LONGS(), 29L, 17L, 1729L), is(17L));
- assertThat(sapply(MIN_INTS(), 29, 17, 1729), is(17));
- assertThat(sapply(MIN_FLOATS(), 29f, 17f, 1729f), is(17.0f));
- assertThat(sapply(MIN_DOUBLES(), 29.0, 17.0, 1729.0), is(17.0));
- assertThat(sapply(MIN_INTS(), 29, 170, 1729), is(29));
- assertThat(sapply(MIN_BIGINTS(), bigInt("29"), bigInt("17"), bigInt("1729")), is(bigInt("17")));
- }
-
- @Test
- public void testMaxN() {
- assertThat(apply(MAX_INTS(2), 17, 34, 98, 29, 1009), is(ImmutableList.of(98, 1009)));
- assertThat(apply(MAX_N(1, String.class), "b", "a"), is(ImmutableList.of("b")));
- assertThat(apply(MAX_N(3, String.class), "b", "a", "d", "c"), is(ImmutableList.of("b", "c", "d")));
- }
-
- @Test
- public void testMinN() {
- assertThat(apply(MIN_INTS(2), 17, 34, 98, 29, 1009), is(ImmutableList.of(17, 29)));
- assertThat(apply(MIN_N(1, String.class), "b", "a"), is(ImmutableList.of("a")));
- assertThat(apply(MIN_N(3, String.class), "b", "a", "d", "c"), is(ImmutableList.of("a", "b", "c")));
- }
-
- @Test
- public void testFirstN() {
- assertThat(apply(Aggregators.<Integer>FIRST_N(2), 17, 34, 98, 29, 1009), is(ImmutableList.of(17, 34)));
- }
-
- @Test
- public void testLastN() {
- assertThat(apply(Aggregators.<Integer>LAST_N(2), 17, 34, 98, 29, 1009), is(ImmutableList.of(29, 1009)));
- }
-
- @Test
- public void testUniqueElements() {
- assertThat(ImmutableSet.copyOf(apply(Aggregators.<Integer>UNIQUE_ELEMENTS(), 17, 29, 29, 16, 17)),
- is(ImmutableSet.of(17, 29, 16)));
-
- Iterable<Integer> samp = apply(Aggregators.<Integer>SAMPLE_UNIQUE_ELEMENTS(2), 17, 29, 16, 17, 29, 16);
- assertThat(Iterables.size(samp), is(2));
- assertThat(ImmutableSet.copyOf(samp).size(), is(2)); // check that the two elements are unique
- }
-
- @Test
- public void testPairs() {
- List<Pair<Long, Double>> input = ImmutableList.of(Pair.of(1720L, 17.29), Pair.of(9L, -3.14));
- Aggregator<Pair<Long, Double>> a = Aggregators.pairAggregator(SUM_LONGS(), MIN_DOUBLES());
-
- assertThat(sapply(a, input), is(Pair.of(1729L, -3.14)));
- }
-
- @Test
- public void testPairsTwoLongs() {
- List<Pair<Long, Long>> input = ImmutableList.of(Pair.of(1720L, 1L), Pair.of(9L, 19L));
- Aggregator<Pair<Long, Long>> a = Aggregators.pairAggregator(SUM_LONGS(), SUM_LONGS());
-
- assertThat(sapply(a, input), is(Pair.of(1729L, 20L)));
- }
-
- @Test
- public void testTrips() {
- List<Tuple3<Float, Double, Double>> input = ImmutableList.of(Tuple3.of(17.29f, 12.2, 0.1),
- Tuple3.of(3.0f, 1.2, 3.14), Tuple3.of(-1.0f, 14.5, -0.98));
- Aggregator<Tuple3<Float, Double, Double>> a = Aggregators.tripAggregator(
- MAX_FLOATS(), MAX_DOUBLES(), MIN_DOUBLES());
-
- assertThat(sapply(a, input), is(Tuple3.of(17.29f, 14.5, -0.98)));
- }
-
- @Test
- public void testQuads() {
- List<Tuple4<Float, Double, Double, Integer>> input = ImmutableList.of(Tuple4.of(17.29f, 12.2, 0.1, 1),
- Tuple4.of(3.0f, 1.2, 3.14, 2), Tuple4.of(-1.0f, 14.5, -0.98, 3));
- Aggregator<Tuple4<Float, Double, Double, Integer>> a = Aggregators.quadAggregator(
- MAX_FLOATS(), MAX_DOUBLES(), MIN_DOUBLES(), SUM_INTS());
-
- assertThat(sapply(a, input), is(Tuple4.of(17.29f, 14.5, -0.98, 6)));
- }
-
- @Test
- public void testTupleN() {
- List<TupleN> input = ImmutableList.of(new TupleN(1, 3.0, 1, 2.0, 4L), new TupleN(4, 17.0, 1, 9.7, 12L));
- Aggregator<TupleN> a = Aggregators.tupleAggregator(
- MIN_INTS(), SUM_DOUBLES(), MAX_INTS(), MIN_DOUBLES(), MAX_LONGS());
-
- assertThat(sapply(a, input), is(new TupleN(1, 20.0, 1, 2.0, 12L)));
- }
-
- @Test
- public void testConcatenation() {
- assertThat(sapply(STRING_CONCAT("", true), "foo", "foobar", "bar"), is("foofoobarbar"));
- assertThat(sapply(STRING_CONCAT("/", false), "foo", "foobar", "bar"), is("foo/foobar/bar"));
- assertThat(sapply(STRING_CONCAT(" ", true), " ", ""), is(" "));
- assertThat(sapply(STRING_CONCAT(" ", true), Arrays.asList(null, "")), is(""));
- assertThat(sapply(STRING_CONCAT(" ", true, 20, 3), "foo", "foobar", "bar"), is("foo bar"));
- assertThat(sapply(STRING_CONCAT(" ", true, 10, 6), "foo", "foobar", "bar"), is("foo foobar"));
- assertThat(sapply(STRING_CONCAT(" ", true, 9, 6), "foo", "foobar", "bar"), is("foo bar"));
- }
-
- @Test(expected = NullPointerException.class)
- public void testConcatenationNullException() {
- sapply(STRING_CONCAT(" ", false), Arrays.asList(null, "" ));
- }
-
-
- private static <T> T sapply(Aggregator<T> a, T... values) {
- return sapply(a, ImmutableList.copyOf(values));
- }
-
- private static <T> T sapply(Aggregator<T> a, Iterable<T> values) {
- return Iterables.getOnlyElement(apply(a, values));
- }
-
- private static <T> ImmutableList<T> apply(Aggregator<T> a, T... values) {
- return apply(a, ImmutableList.copyOf(values));
- }
-
- private static <T> ImmutableList<T> apply(Aggregator<T> a, Iterable<T> values) {
- CombineFn<String, T> fn = Aggregators.toCombineFn(a);
-
- InMemoryEmitter<Pair<String, T>> e1 = new InMemoryEmitter<Pair<String,T>>();
- fn.process(Pair.of("", values), e1);
-
- // and a second time to make sure Aggregator.reset() works
- InMemoryEmitter<Pair<String, T>> e2 = new InMemoryEmitter<Pair<String,T>>();
- fn.process(Pair.of("", values), e2);
-
- assertEquals(getValues(e1), getValues(e2));
-
- return getValues(e1);
- }
-
- private static <K, V> ImmutableList<V> getValues(InMemoryEmitter<Pair<K, V>> emitter) {
- return ImmutableList.copyOf(
- Iterables.transform(emitter.getOutput(), new Function<Pair<K, V>, V>() {
- @Override
- public V apply(Pair<K, V> input) {
- return input.second();
- }
- }));
- }
-
- private static BigInteger bigInt(String value) {
- return new BigInteger(value);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/fn/ExtractKeyFnTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/fn/ExtractKeyFnTest.java b/crunch/src/test/java/org/apache/crunch/fn/ExtractKeyFnTest.java
deleted file mode 100644
index b5b2a1b..0000000
--- a/crunch/src/test/java/org/apache/crunch/fn/ExtractKeyFnTest.java
+++ /dev/null
@@ -1,44 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.fn;
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.crunch.MapFn;
-import org.apache.crunch.Pair;
-import org.junit.Test;
-
-@SuppressWarnings("serial")
-public class ExtractKeyFnTest {
-
- protected static final MapFn<String, Integer> mapFn = new MapFn<String, Integer>() {
- @Override
- public Integer map(String input) {
- return input.hashCode();
- }
- };
-
- protected static final ExtractKeyFn<Integer, String> one = new ExtractKeyFn<Integer, String>(mapFn);
-
- @Test
- public void test() {
- StoreLastEmitter<Pair<Integer, String>> emitter = StoreLastEmitter.create();
- one.process("boza", emitter);
- assertEquals(Pair.of("boza".hashCode(), "boza"), emitter.getLast());
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/fn/FilterFnTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/fn/FilterFnTest.java b/crunch/src/test/java/org/apache/crunch/fn/FilterFnTest.java
deleted file mode 100644
index a649f99..0000000
--- a/crunch/src/test/java/org/apache/crunch/fn/FilterFnTest.java
+++ /dev/null
@@ -1,85 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.fn;
-
-import static org.hamcrest.Matchers.is;
-import static org.junit.Assert.assertThat;
-
-import org.apache.crunch.FilterFn;
-import org.junit.Test;
-
-import com.google.common.base.Predicates;
-
-
-public class FilterFnTest {
-
- private static final FilterFn<String> TRUE = FilterFns.<String>ACCEPT_ALL();
- private static final FilterFn<String> FALSE = FilterFns.<String>REJECT_ALL();
-
- @Test
- public void testAcceptAll() {
- assertThat(TRUE.accept(""), is(true));
- assertThat(TRUE.accept("foo"), is(true));
- }
-
- @Test
- public void testRejectAll() {
- assertThat(FALSE.accept(""), is(false));
- assertThat(FALSE.accept("foo"), is(false));
-
- Predicates.or(Predicates.alwaysFalse(), Predicates.alwaysTrue());
- }
-
- @Test
- public void testAnd() {
- assertThat(FilterFns.and(TRUE, TRUE).accept("foo"), is(true));
- assertThat(FilterFns.and(TRUE, FALSE).accept("foo"), is(false));
- }
-
- @Test
- @SuppressWarnings("unchecked")
- public void testGeneric() {
- assertThat(FilterFns.and(TRUE).accept("foo"), is(true));
- assertThat(FilterFns.and(FALSE).accept("foo"), is(false));
- assertThat(FilterFns.and(FALSE, FALSE, FALSE).accept("foo"), is(false));
- assertThat(FilterFns.and(TRUE, TRUE, FALSE).accept("foo"), is(false));
- assertThat(FilterFns.and(FALSE, FALSE, FALSE, FALSE).accept("foo"), is(false));
- }
-
- @Test
- public void testOr() {
- assertThat(FilterFns.or(FALSE, TRUE).accept("foo"), is(true));
- assertThat(FilterFns.or(TRUE, FALSE).accept("foo"), is(true));
- }
-
- @Test
- @SuppressWarnings("unchecked")
- public void testOrGeneric() {
- assertThat(FilterFns.or(TRUE).accept("foo"), is(true));
- assertThat(FilterFns.or(FALSE).accept("foo"), is(false));
- assertThat(FilterFns.or(TRUE, FALSE, TRUE).accept("foo"), is(true));
- assertThat(FilterFns.or(FALSE, FALSE, TRUE).accept("foo"), is(true));
- assertThat(FilterFns.or(FALSE, FALSE, FALSE).accept("foo"), is(false));
- }
-
- @Test
- public void testNot() {
- assertThat(FilterFns.not(TRUE).accept("foo"), is(false));
- assertThat(FilterFns.not(FALSE).accept("foo"), is(true));
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/fn/MapKeysTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/fn/MapKeysTest.java b/crunch/src/test/java/org/apache/crunch/fn/MapKeysTest.java
deleted file mode 100644
index 6b73700..0000000
--- a/crunch/src/test/java/org/apache/crunch/fn/MapKeysTest.java
+++ /dev/null
@@ -1,51 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.fn;
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.crunch.Pair;
-import org.junit.Test;
-
-@SuppressWarnings("serial")
-public class MapKeysTest {
-
- protected static final MapKeysFn<String, Integer, Integer> one = new MapKeysFn<String, Integer, Integer>() {
- @Override
- public Integer map(String input) {
- return 1;
- }
- };
-
- protected static final MapKeysFn<String, Integer, Integer> two = new MapKeysFn<String, Integer, Integer>() {
- @Override
- public Integer map(String input) {
- return 2;
- }
- };
-
- @Test
- public void test() {
- StoreLastEmitter<Pair<Integer, Integer>> emitter = StoreLastEmitter.create();
- one.process(Pair.of("k", Integer.MAX_VALUE), emitter);
- assertEquals(Pair.of(1, Integer.MAX_VALUE), emitter.getLast());
- two.process(Pair.of("k", Integer.MAX_VALUE), emitter);
- assertEquals(Pair.of(2, Integer.MAX_VALUE), emitter.getLast());
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/fn/MapValuesTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/fn/MapValuesTest.java b/crunch/src/test/java/org/apache/crunch/fn/MapValuesTest.java
deleted file mode 100644
index 097b008..0000000
--- a/crunch/src/test/java/org/apache/crunch/fn/MapValuesTest.java
+++ /dev/null
@@ -1,50 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.fn;
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.crunch.Pair;
-import org.junit.Test;
-
-@SuppressWarnings("serial")
-public class MapValuesTest {
-
- static final MapValuesFn<String, String, Integer> one = new MapValuesFn<String, String, Integer>() {
- @Override
- public Integer map(String input) {
- return 1;
- }
- };
-
- static final MapValuesFn<String, String, Integer> two = new MapValuesFn<String, String, Integer>() {
- @Override
- public Integer map(String input) {
- return 2;
- }
- };
-
- @Test
- public void test() {
- StoreLastEmitter<Pair<String, Integer>> emitter = StoreLastEmitter.create();
- one.process(Pair.of("k", "v"), emitter);
- assertEquals(Pair.of("k", 1), emitter.getLast());
- two.process(Pair.of("k", "v"), emitter);
- assertEquals(Pair.of("k", 2), emitter.getLast());
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/fn/PairMapTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/fn/PairMapTest.java b/crunch/src/test/java/org/apache/crunch/fn/PairMapTest.java
deleted file mode 100644
index bef6c85..0000000
--- a/crunch/src/test/java/org/apache/crunch/fn/PairMapTest.java
+++ /dev/null
@@ -1,52 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.fn;
-
-import static org.junit.Assert.assertTrue;
-
-import org.apache.crunch.MapFn;
-import org.apache.crunch.Pair;
-import org.junit.Test;
-
-@SuppressWarnings("serial")
-public class PairMapTest {
-
- static final MapFn<String, Integer> one = new MapFn<String, Integer>() {
- @Override
- public Integer map(String input) {
- return 1;
- }
- };
-
- static final MapFn<String, Integer> two = new MapFn<String, Integer>() {
- @Override
- public Integer map(String input) {
- return 2;
- }
- };
-
- @Test
- public void testPairMap() {
- StoreLastEmitter<Pair<Integer, Integer>> emitter = StoreLastEmitter.create();
- PairMapFn<String, String, Integer, Integer> fn = new PairMapFn<String, String, Integer, Integer>(one, two);
- fn.process(Pair.of("a", "b"), emitter);
- Pair<Integer, Integer> pair = emitter.getLast();
- assertTrue(pair.first() == 1);
- assertTrue(pair.second() == 2);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/fn/StoreLastEmitter.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/fn/StoreLastEmitter.java b/crunch/src/test/java/org/apache/crunch/fn/StoreLastEmitter.java
deleted file mode 100644
index cdd8754..0000000
--- a/crunch/src/test/java/org/apache/crunch/fn/StoreLastEmitter.java
+++ /dev/null
@@ -1,41 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.fn;
-
-import org.apache.crunch.Emitter;
-
-class StoreLastEmitter<T> implements Emitter<T> {
- private T last;
-
- @Override
- public void emit(T emitted) {
- last = emitted;
- }
-
- public T getLast() {
- return last;
- }
-
- @Override
- public void flush() {
- }
-
- public static <T> StoreLastEmitter<T> create() {
- return new StoreLastEmitter<T>();
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/impl/SingleUseIterableTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/impl/SingleUseIterableTest.java b/crunch/src/test/java/org/apache/crunch/impl/SingleUseIterableTest.java
deleted file mode 100644
index 811a0a3..0000000
--- a/crunch/src/test/java/org/apache/crunch/impl/SingleUseIterableTest.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl;
-
-import static org.junit.Assert.assertEquals;
-
-import java.util.List;
-
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-
-public class SingleUseIterableTest {
-
- @Test
- public void testIterator() {
- List<Integer> values = Lists.newArrayList(1,2,3);
-
- SingleUseIterable<Integer> iterable = new SingleUseIterable<Integer>(values);
-
- List<Integer> retrievedValues = Lists.newArrayList(iterable);
-
- assertEquals(values, retrievedValues);
- }
-
- @Test(expected=IllegalStateException.class)
- public void testIterator_MultipleCalls() {
- List<Integer> values = Lists.newArrayList(1,2,3);
-
- SingleUseIterable<Integer> iterable = new SingleUseIterable<Integer>(values);
-
- List<Integer> retrievedValues = Lists.newArrayList(iterable);
-
- for (Integer n : iterable) {
-
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/impl/mr/MRPipelineTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/impl/mr/MRPipelineTest.java b/crunch/src/test/java/org/apache/crunch/impl/mr/MRPipelineTest.java
deleted file mode 100644
index 9ed7a46..0000000
--- a/crunch/src/test/java/org/apache/crunch/impl/mr/MRPipelineTest.java
+++ /dev/null
@@ -1,86 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr;
-
-import static org.junit.Assert.assertEquals;
-import static org.mockito.Mockito.doReturn;
-import static org.mockito.Mockito.spy;
-import static org.mockito.Mockito.when;
-
-import java.io.IOException;
-
-import org.apache.crunch.SourceTarget;
-import org.apache.crunch.impl.mr.collect.PCollectionImpl;
-import org.apache.crunch.impl.mr.run.RuntimeParameters;
-import org.apache.crunch.io.ReadableSourceTarget;
-import org.apache.crunch.types.avro.Avros;
-import org.apache.hadoop.conf.Configuration;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TemporaryFolder;
-import org.junit.runner.RunWith;
-import org.mockito.Mock;
-import org.mockito.runners.MockitoJUnitRunner;
-
-
-@RunWith(MockitoJUnitRunner.class)
-public class MRPipelineTest {
- @Rule
- public TemporaryFolder tempDir = new TemporaryFolder();
- @Mock
- private PCollectionImpl<String> pcollection;
- @Mock
- private ReadableSourceTarget<String> readableSourceTarget;
- @Mock
- private SourceTarget<String> nonReadableSourceTarget;
- private MRPipeline pipeline;
-
- @Before
- public void setUp() throws IOException {
- Configuration conf = new Configuration();
- conf.set(RuntimeParameters.TMP_DIR, tempDir.getRoot().getAbsolutePath());
- pipeline = spy(new MRPipeline(MRPipelineTest.class, conf));
- }
-
- @Test
- public void testGetMaterializeSourceTarget_AlreadyMaterialized() {
- when(pcollection.getMaterializedAt()).thenReturn(readableSourceTarget);
-
- assertEquals(readableSourceTarget, pipeline.getMaterializeSourceTarget(pcollection));
- }
-
- @Test
- public void testGetMaterializeSourceTarget_NotMaterialized_HasOutput() {
- when(pcollection.getPType()).thenReturn(Avros.strings());
- doReturn(readableSourceTarget).when(pipeline).createIntermediateOutput(Avros.strings());
- when(pcollection.getMaterializedAt()).thenReturn(null);
-
- assertEquals(readableSourceTarget, pipeline.getMaterializeSourceTarget(pcollection));
- }
-
- @Test(expected = IllegalArgumentException.class)
- public void testGetMaterializeSourceTarget_NotMaterialized_NotReadableSourceTarget() {
- when(pcollection.getPType()).thenReturn(Avros.strings());
- doReturn(nonReadableSourceTarget).when(pipeline).createIntermediateOutput(Avros.strings());
- when(pcollection.getMaterializedAt()).thenReturn(null);
-
- pipeline.getMaterializeSourceTarget(pcollection);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/impl/mr/collect/DoCollectionImplTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/impl/mr/collect/DoCollectionImplTest.java b/crunch/src/test/java/org/apache/crunch/impl/mr/collect/DoCollectionImplTest.java
deleted file mode 100644
index fd582bc..0000000
--- a/crunch/src/test/java/org/apache/crunch/impl/mr/collect/DoCollectionImplTest.java
+++ /dev/null
@@ -1,112 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.collect;
-
-import static org.junit.Assert.assertEquals;
-
-import java.util.List;
-
-import org.apache.crunch.DoFn;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.impl.mr.plan.DoNode;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.writable.Writables;
-import org.junit.Test;
-
-public class DoCollectionImplTest {
-
- @Test
- public void testGetSizeInternal_NoScaleFactor() {
- runScaleTest(100L, 1.0f, 100L);
- }
-
- @Test
- public void testGetSizeInternal_ScaleFactorBelowZero() {
- runScaleTest(100L, 0.5f, 50L);
- }
-
- @Test
- public void testGetSizeInternal_ScaleFactorAboveZero() {
- runScaleTest(100L, 1.5f, 150L);
- }
-
- private void runScaleTest(long inputSize, float scaleFactor, long expectedScaledSize) {
- PCollectionImpl<String> parentCollection = new SizedPCollectionImpl("Sized collection", inputSize);
-
- DoCollectionImpl<String> doCollectionImpl = new DoCollectionImpl<String>("Scaled collection", parentCollection,
- new ScaledFunction(scaleFactor), Writables.strings());
-
- assertEquals(expectedScaledSize, doCollectionImpl.getSizeInternal());
- }
-
- static class ScaledFunction extends DoFn<String, String> {
-
- private float scaleFactor;
-
- public ScaledFunction(float scaleFactor) {
- this.scaleFactor = scaleFactor;
- }
-
- @Override
- public void process(String input, Emitter<String> emitter) {
- emitter.emit(input);
- }
-
- @Override
- public float scaleFactor() {
- return scaleFactor;
- }
-
- }
-
- static class SizedPCollectionImpl extends PCollectionImpl<String> {
-
- private long internalSize;
-
- public SizedPCollectionImpl(String name, long internalSize) {
- super(name);
- this.internalSize = internalSize;
- }
-
- @Override
- public PType getPType() {
- return null;
- }
-
- @Override
- public DoNode createDoNode() {
- return null;
- }
-
- @Override
- public List getParents() {
- return null;
- }
-
- @Override
- protected void acceptInternal(Visitor visitor) {
- }
-
- @Override
- protected long getSizeInternal() {
- return internalSize;
- }
-
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/impl/mr/collect/DoTableImplTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/impl/mr/collect/DoTableImplTest.java b/crunch/src/test/java/org/apache/crunch/impl/mr/collect/DoTableImplTest.java
deleted file mode 100644
index 89b9944..0000000
--- a/crunch/src/test/java/org/apache/crunch/impl/mr/collect/DoTableImplTest.java
+++ /dev/null
@@ -1,86 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.collect;
-
-import static org.apache.crunch.types.writable.Writables.strings;
-import static org.apache.crunch.types.writable.Writables.tableOf;
-import static org.junit.Assert.assertEquals;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.verify;
-import static org.mockito.Mockito.verifyNoMoreInteractions;
-import static org.mockito.Mockito.when;
-
-import org.apache.crunch.DoFn;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.Pair;
-import org.junit.Test;
-
-public class DoTableImplTest {
-
- @Test
- public void testGetSizeInternal_NoScaleFactor() {
- runScaleTest(100L, 1.0f, 100L);
- }
-
- @Test
- public void testGetSizeInternal_ScaleFactorBelowZero() {
- runScaleTest(100L, 0.5f, 50L);
- }
-
- @Test
- public void testGetSizeInternal_ScaleFactorAboveZero() {
- runScaleTest(100L, 1.5f, 150L);
- }
-
- private void runScaleTest(long inputSize, float scaleFactor, long expectedScaledSize) {
-
- @SuppressWarnings("unchecked")
- PCollectionImpl<String> parentCollection = (PCollectionImpl<String>) mock(PCollectionImpl.class);
-
- when(parentCollection.getSize()).thenReturn(inputSize);
-
- DoTableImpl<String, String> doTableImpl = new DoTableImpl<String, String>("Scalled table collection",
- parentCollection, new TableScaledFunction(scaleFactor), tableOf(strings(), strings()));
-
- assertEquals(expectedScaledSize, doTableImpl.getSizeInternal());
-
- verify(parentCollection).getSize();
-
- verifyNoMoreInteractions(parentCollection);
- }
-
- static class TableScaledFunction extends DoFn<String, Pair<String, String>> {
-
- private float scaleFactor;
-
- public TableScaledFunction(float scaleFactor) {
- this.scaleFactor = scaleFactor;
- }
-
- @Override
- public float scaleFactor() {
- return scaleFactor;
- }
-
- @Override
- public void process(String input, Emitter<Pair<String, String>> emitter) {
- emitter.emit(Pair.of(input, input));
-
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/impl/mr/emit/IntermediateEmitterTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/impl/mr/emit/IntermediateEmitterTest.java b/crunch/src/test/java/org/apache/crunch/impl/mr/emit/IntermediateEmitterTest.java
deleted file mode 100644
index dd72364..0000000
--- a/crunch/src/test/java/org/apache/crunch/impl/mr/emit/IntermediateEmitterTest.java
+++ /dev/null
@@ -1,83 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.emit;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotSame;
-import static org.junit.Assert.assertSame;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.spy;
-import static org.mockito.Mockito.verify;
-
-import org.apache.crunch.impl.mr.run.RTNode;
-import org.apache.crunch.test.StringWrapper;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.avro.Avros;
-import org.apache.hadoop.conf.Configuration;
-import org.junit.Before;
-import org.junit.Test;
-import org.mockito.ArgumentCaptor;
-
-import com.google.common.collect.Lists;
-
-public class IntermediateEmitterTest {
-
- private StringWrapper stringWrapper;
- private PType ptype;
-
- @Before
- public void setUp() {
- stringWrapper = new StringWrapper("test");
- ptype = spy(Avros.reflects(StringWrapper.class));
- }
-
- @Test
- public void testEmit_SingleChild() {
- RTNode singleChild = mock(RTNode.class);
- IntermediateEmitter emitter = new IntermediateEmitter(ptype, Lists.newArrayList(singleChild),
- new Configuration());
- emitter.emit(stringWrapper);
-
- ArgumentCaptor<StringWrapper> argumentCaptor = ArgumentCaptor.forClass(StringWrapper.class);
- verify(singleChild).process(argumentCaptor.capture());
- assertSame(stringWrapper, argumentCaptor.getValue());
- }
-
- @Test
- public void testEmit_MultipleChildren() {
- RTNode childA = mock(RTNode.class);
- RTNode childB = mock(RTNode.class);
- IntermediateEmitter emitter = new IntermediateEmitter(ptype, Lists.newArrayList(childA, childB),
- new Configuration());
- emitter.emit(stringWrapper);
-
- ArgumentCaptor<StringWrapper> argumentCaptorA = ArgumentCaptor.forClass(StringWrapper.class);
- ArgumentCaptor<StringWrapper> argumentCaptorB = ArgumentCaptor.forClass(StringWrapper.class);
-
- verify(childA).process(argumentCaptorA.capture());
- verify(childB).process(argumentCaptorB.capture());
-
- assertEquals(stringWrapper, argumentCaptorA.getValue());
- assertEquals(stringWrapper, argumentCaptorB.getValue());
-
- // Make sure that multiple children means deep copies are performed
- assertNotSame(stringWrapper, argumentCaptorA.getValue());
- assertNotSame(stringWrapper, argumentCaptorB.getValue());
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/impl/mr/exec/CappedExponentialCounterTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/impl/mr/exec/CappedExponentialCounterTest.java b/crunch/src/test/java/org/apache/crunch/impl/mr/exec/CappedExponentialCounterTest.java
deleted file mode 100644
index 958df12..0000000
--- a/crunch/src/test/java/org/apache/crunch/impl/mr/exec/CappedExponentialCounterTest.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.exec;
-
-import static org.junit.Assert.assertEquals;
-
-import org.junit.Test;
-
-public class CappedExponentialCounterTest {
-
- @Test
- public void testGet() {
- CappedExponentialCounter c = new CappedExponentialCounter(1L, Long.MAX_VALUE);
- assertEquals(1L, c.get());
- assertEquals(2L, c.get());
- assertEquals(4L, c.get());
- assertEquals(8L, c.get());
- }
-
- @Test
- public void testCap() {
- CappedExponentialCounter c = new CappedExponentialCounter(1L, 2);
- assertEquals(1L, c.get());
- assertEquals(2L, c.get());
- assertEquals(2L, c.get());
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/impl/mr/exec/CrunchJobHooksTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/impl/mr/exec/CrunchJobHooksTest.java b/crunch/src/test/java/org/apache/crunch/impl/mr/exec/CrunchJobHooksTest.java
deleted file mode 100644
index f03c3e2..0000000
--- a/crunch/src/test/java/org/apache/crunch/impl/mr/exec/CrunchJobHooksTest.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.exec;
-
-import static org.junit.Assert.assertEquals;
-
-import org.junit.Test;
-
-public class CrunchJobHooksTest {
-
- @Test
- public void testExtractPartitionNumber() {
- assertEquals(0, CrunchJobHooks.extractPartitionNumber("out1-r-00000"));
- assertEquals(10, CrunchJobHooks.extractPartitionNumber("out2-r-00010"));
- assertEquals(99999, CrunchJobHooks.extractPartitionNumber("out3-r-99999"));
- }
-
- @Test
- public void testExtractPartitionNumber_WithSuffix() {
- assertEquals(10, CrunchJobHooks.extractPartitionNumber("out2-r-00010.avro"));
- }
-
- @Test(expected = IllegalArgumentException.class)
- public void testExtractPartitionNumber_MapOutputFile() {
- CrunchJobHooks.extractPartitionNumber("out1-m-00000");
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/impl/mr/plan/DotfileWriterTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/impl/mr/plan/DotfileWriterTest.java b/crunch/src/test/java/org/apache/crunch/impl/mr/plan/DotfileWriterTest.java
deleted file mode 100644
index 562238d..0000000
--- a/crunch/src/test/java/org/apache/crunch/impl/mr/plan/DotfileWriterTest.java
+++ /dev/null
@@ -1,132 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.plan;
-
-import static org.junit.Assert.assertEquals;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.when;
-
-import java.util.List;
-
-import org.apache.crunch.Source;
-import org.apache.crunch.Target;
-import org.apache.crunch.impl.mr.collect.InputCollection;
-import org.apache.crunch.impl.mr.collect.PCollectionImpl;
-import org.apache.crunch.impl.mr.plan.DotfileWriter.MRTaskType;
-import org.junit.Before;
-import org.junit.Test;
-import org.mockito.Mockito;
-
-import com.google.common.collect.Lists;
-
-public class DotfileWriterTest {
-
- private DotfileWriter dotfileWriter;
-
- @Before
- public void setUp() {
- dotfileWriter = new DotfileWriter();
- }
-
- @Test
- public void testFormatPCollectionNodeDeclaration() {
- PCollectionImpl<?> pcollectionImpl = mock(PCollectionImpl.class);
- JobPrototype jobPrototype = mock(JobPrototype.class);
- when(pcollectionImpl.getName()).thenReturn("collection");
-
- assertEquals("\"collection@" + pcollectionImpl.hashCode() + "@" + jobPrototype.hashCode()
- + "\" [label=\"collection\" shape=box];",
- dotfileWriter.formatPCollectionNodeDeclaration(pcollectionImpl, jobPrototype));
- }
-
- @Test
- public void testFormatPCollectionNodeDeclaration_InputPCollection() {
- InputCollection<?> inputCollection = mock(InputCollection.class, Mockito.RETURNS_DEEP_STUBS);
- JobPrototype jobPrototype = mock(JobPrototype.class);
- when(inputCollection.getName()).thenReturn("input");
- when(inputCollection.getSource().toString()).thenReturn("source");
-
- assertEquals("\"source\" [label=\"input\" shape=folder];",
- dotfileWriter.formatPCollectionNodeDeclaration(inputCollection, jobPrototype));
- }
-
- @Test
- public void testFormatTargetNodeDeclaration() {
- Target target = mock(Target.class);
- when(target.toString()).thenReturn("target/path");
-
- assertEquals("\"target/path\" [label=\"target/path\" shape=folder];",
- dotfileWriter.formatTargetNodeDeclaration(target));
- }
-
- @Test
- public void testFormatPCollection() {
- PCollectionImpl<?> pcollectionImpl = mock(PCollectionImpl.class);
- JobPrototype jobPrototype = mock(JobPrototype.class);
- when(pcollectionImpl.getName()).thenReturn("collection");
-
- assertEquals("\"collection@" + pcollectionImpl.hashCode() + "@" + jobPrototype.hashCode() + "\"",
- dotfileWriter.formatPCollection(pcollectionImpl, jobPrototype));
- }
-
- @Test
- public void testFormatPCollection_InputCollection() {
- InputCollection<Object> inputCollection = mock(InputCollection.class);
- Source<Object> source = mock(Source.class);
- JobPrototype jobPrototype = mock(JobPrototype.class);
- when(source.toString()).thenReturn("mocksource");
- when(inputCollection.getSource()).thenReturn(source);
-
- assertEquals("\"mocksource\"", dotfileWriter.formatPCollection(inputCollection, jobPrototype));
- }
-
- @Test
- public void testFormatNodeCollection() {
- List<String> nodeCollection = Lists.newArrayList("one", "two", "three");
- assertEquals("one -> two -> three;", dotfileWriter.formatNodeCollection(nodeCollection));
- }
-
- @Test
- public void testFormatNodePath() {
- PCollectionImpl<?> tail = mock(PCollectionImpl.class);
- PCollectionImpl<?> head = mock(PCollectionImpl.class);
- JobPrototype jobPrototype = mock(JobPrototype.class);
-
- when(tail.getName()).thenReturn("tail");
- when(head.getName()).thenReturn("head");
-
- NodePath nodePath = new NodePath(tail);
- nodePath.close(head);
-
- assertEquals(
- Lists.newArrayList("\"head@" + head.hashCode() + "@" + jobPrototype.hashCode() + "\" -> \"tail@"
- + tail.hashCode() + "@" + jobPrototype.hashCode() + "\";"),
- dotfileWriter.formatNodePath(nodePath, jobPrototype));
- }
-
- @Test
- public void testGetTaskGraphAttributes_Map() {
- assertEquals("label = Map; color = blue;", dotfileWriter.getTaskGraphAttributes(MRTaskType.MAP));
- }
-
- @Test
- public void testGetTaskGraphAttributes_Reduce() {
- assertEquals("label = Reduce; color = red;", dotfileWriter.getTaskGraphAttributes(MRTaskType.REDUCE));
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/impl/mr/plan/JobNameBuilderTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/impl/mr/plan/JobNameBuilderTest.java b/crunch/src/test/java/org/apache/crunch/impl/mr/plan/JobNameBuilderTest.java
deleted file mode 100644
index 7963c83..0000000
--- a/crunch/src/test/java/org/apache/crunch/impl/mr/plan/JobNameBuilderTest.java
+++ /dev/null
@@ -1,41 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.plan;
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.crunch.types.writable.Writables;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-
-public class JobNameBuilderTest {
-
- @Test
- public void testBuild() {
- final String pipelineName = "PipelineName";
- final String nodeName = "outputNode";
- DoNode doNode = DoNode.createOutputNode(nodeName, Writables.strings());
- JobNameBuilder jobNameBuilder = new JobNameBuilder(pipelineName);
- jobNameBuilder.visit(Lists.newArrayList(doNode));
- String jobName = jobNameBuilder.build();
-
- assertEquals(String.format("%s: %s", pipelineName, nodeName), jobName);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/io/SequentialFileNamingSchemeTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/io/SequentialFileNamingSchemeTest.java b/crunch/src/test/java/org/apache/crunch/io/SequentialFileNamingSchemeTest.java
deleted file mode 100644
index 467da15..0000000
--- a/crunch/src/test/java/org/apache/crunch/io/SequentialFileNamingSchemeTest.java
+++ /dev/null
@@ -1,84 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.File;
-import java.io.IOException;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.junit.Before;
-import org.junit.Rule;
-import org.junit.Test;
-import org.junit.rules.TemporaryFolder;
-
-public class SequentialFileNamingSchemeTest {
-
- // The partition id used for testing. This partition id should be ignored by
- // the SequentialFileNamingScheme.
- private static final int PARTITION_ID = 42;
-
- private SequentialFileNamingScheme namingScheme;
- private Configuration configuration;
-
- @Rule
- public TemporaryFolder tmpOutputDir = new TemporaryFolder();
-
- @Before
- public void setUp() throws IOException {
- configuration = new Configuration();
- namingScheme = new SequentialFileNamingScheme();
- }
-
- @Test
- public void testGetMapOutputName_EmptyDirectory() throws IOException {
- assertEquals("part-m-00000",
- namingScheme.getMapOutputName(configuration, new Path(tmpOutputDir.getRoot().getAbsolutePath())));
- }
-
- @Test
- public void testGetMapOutputName_NonEmptyDirectory() throws IOException {
- File outputDirectory = tmpOutputDir.getRoot();
-
- new File(outputDirectory, "existing-1").createNewFile();
- new File(outputDirectory, "existing-2").createNewFile();
-
- assertEquals("part-m-00002",
- namingScheme.getMapOutputName(configuration, new Path(outputDirectory.getAbsolutePath())));
- }
-
- @Test
- public void testGetReduceOutputName_EmptyDirectory() throws IOException {
- assertEquals("part-r-00000", namingScheme.getReduceOutputName(configuration, new Path(tmpOutputDir.getRoot()
- .getAbsolutePath()), PARTITION_ID));
- }
-
- @Test
- public void testGetReduceOutputName_NonEmptyDirectory() throws IOException {
- File outputDirectory = tmpOutputDir.getRoot();
-
- new File(outputDirectory, "existing-1").createNewFile();
- new File(outputDirectory, "existing-2").createNewFile();
-
- assertEquals("part-r-00002",
- namingScheme.getReduceOutputName(configuration, new Path(outputDirectory.getAbsolutePath()), PARTITION_ID));
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/io/SourceTargetHelperTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/io/SourceTargetHelperTest.java b/crunch/src/test/java/org/apache/crunch/io/SourceTargetHelperTest.java
deleted file mode 100644
index 5b0ea55..0000000
--- a/crunch/src/test/java/org/apache/crunch/io/SourceTargetHelperTest.java
+++ /dev/null
@@ -1,59 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.File;
-import java.io.IOException;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.LocalFileSystem;
-import org.apache.hadoop.fs.Path;
-import org.junit.Test;
-
-public class SourceTargetHelperTest {
-
- @Test
- public void testGetNonexistentPathSize() throws Exception {
- File tmp = File.createTempFile("pathsize", "");
- Path tmpPath = new Path(tmp.getAbsolutePath());
- tmp.delete();
- FileSystem fs = FileSystem.getLocal(new Configuration());
- assertEquals(-1L, SourceTargetHelper.getPathSize(fs, tmpPath));
- }
-
- @Test
- public void testGetNonExistentPathSize_NonExistantPath() throws IOException {
- FileSystem mockFs = new MockFileSystem();
- assertEquals(-1L, SourceTargetHelper.getPathSize(mockFs, new Path("does/not/exist")));
- }
-
- /**
- * Mock FileSystem that returns null for {@link FileSystem#listStatus(Path)}.
- */
- static class MockFileSystem extends LocalFileSystem {
-
- @Override
- public FileStatus[] listStatus(Path f) throws IOException {
- return null;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/io/avro/AvroFileReaderFactoryTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/io/avro/AvroFileReaderFactoryTest.java b/crunch/src/test/java/org/apache/crunch/io/avro/AvroFileReaderFactoryTest.java
deleted file mode 100644
index 62085f8..0000000
--- a/crunch/src/test/java/org/apache/crunch/io/avro/AvroFileReaderFactoryTest.java
+++ /dev/null
@@ -1,184 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.avro;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.util.Iterator;
-import java.util.List;
-
-import org.apache.avro.Schema;
-import org.apache.avro.file.DataFileWriter;
-import org.apache.avro.generic.GenericData;
-import org.apache.avro.generic.GenericData.Record;
-import org.apache.avro.generic.GenericDatumReader;
-import org.apache.avro.generic.GenericDatumWriter;
-import org.apache.avro.generic.GenericRecord;
-import org.apache.avro.io.DatumReader;
-import org.apache.avro.reflect.ReflectData;
-import org.apache.avro.reflect.ReflectDatumReader;
-import org.apache.avro.specific.SpecificDatumReader;
-import org.apache.crunch.Pair;
-import org.apache.crunch.test.Person;
-import org.apache.crunch.test.StringWrapper;
-import org.apache.crunch.types.avro.AvroType;
-import org.apache.crunch.types.avro.Avros;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.junit.After;
-import org.junit.Assume;
-import org.junit.Before;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-
-public class AvroFileReaderFactoryTest {
-
- private File avroFile;
-
- @Before
- public void setUp() throws IOException {
- avroFile = File.createTempFile("test", ".av");
- }
-
- @After
- public void tearDown() {
- avroFile.delete();
- }
-
- private void populateGenericFile(List<GenericRecord> genericRecords, Schema outputSchema) throws IOException {
- FileOutputStream outputStream = new FileOutputStream(this.avroFile);
- GenericDatumWriter<GenericRecord> genericDatumWriter = new GenericDatumWriter<GenericRecord>(outputSchema);
-
- DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(genericDatumWriter);
- dataFileWriter.create(outputSchema, outputStream);
-
- for (GenericRecord record : genericRecords) {
- dataFileWriter.append(record);
- }
-
- dataFileWriter.close();
- outputStream.close();
-
- }
-
- private <T> AvroFileReaderFactory<T> createFileReaderFactory(AvroType<T> avroType) {
- return new AvroFileReaderFactory<T>(avroType);
- }
-
- @Test
- public void testRead_GenericReader() throws IOException {
- GenericRecord savedRecord = new GenericData.Record(Person.SCHEMA$);
- savedRecord.put("name", "John Doe");
- savedRecord.put("age", 42);
- savedRecord.put("siblingnames", Lists.newArrayList("Jimmy", "Jane"));
- populateGenericFile(Lists.newArrayList(savedRecord), Person.SCHEMA$);
-
- AvroFileReaderFactory<GenericData.Record> genericReader = createFileReaderFactory(Avros.generics(Person.SCHEMA$));
- Iterator<GenericData.Record> recordIterator = genericReader.read(FileSystem.getLocal(new Configuration()),
- new Path(this.avroFile.getAbsolutePath()));
-
- GenericRecord genericRecord = recordIterator.next();
- assertEquals(savedRecord, genericRecord);
- assertFalse(recordIterator.hasNext());
- }
-
- @Test
- public void testRead_SpecificReader() throws IOException {
- GenericRecord savedRecord = new GenericData.Record(Person.SCHEMA$);
- savedRecord.put("name", "John Doe");
- savedRecord.put("age", 42);
- savedRecord.put("siblingnames", Lists.newArrayList("Jimmy", "Jane"));
- populateGenericFile(Lists.newArrayList(savedRecord), Person.SCHEMA$);
-
- AvroFileReaderFactory<Person> genericReader = createFileReaderFactory(Avros.records(Person.class));
- Iterator<Person> recordIterator = genericReader.read(FileSystem.getLocal(new Configuration()), new Path(
- this.avroFile.getAbsolutePath()));
-
- Person expectedPerson = new Person();
- expectedPerson.age = 42;
- expectedPerson.name = "John Doe";
- List<CharSequence> siblingNames = Lists.newArrayList();
- siblingNames.add("Jimmy");
- siblingNames.add("Jane");
- expectedPerson.siblingnames = siblingNames;
-
- Person person = recordIterator.next();
-
- assertEquals(expectedPerson, person);
- assertFalse(recordIterator.hasNext());
- }
-
- @Test
- public void testRead_ReflectReader() throws IOException {
- Schema reflectSchema = ReflectData.get().getSchema(StringWrapper.class);
- GenericRecord savedRecord = new GenericData.Record(reflectSchema);
- savedRecord.put("value", "stringvalue");
- populateGenericFile(Lists.newArrayList(savedRecord), reflectSchema);
-
- AvroFileReaderFactory<StringWrapper> genericReader = createFileReaderFactory(Avros.reflects(StringWrapper.class));
- Iterator<StringWrapper> recordIterator = genericReader.read(FileSystem.getLocal(new Configuration()), new Path(
- this.avroFile.getAbsolutePath()));
-
- StringWrapper stringWrapper = recordIterator.next();
-
- assertEquals("stringvalue", stringWrapper.getValue());
- assertFalse(recordIterator.hasNext());
- }
-
- @Test
- public void testCreateDatumReader_Generic() {
- DatumReader<Record> datumReader = AvroFileReaderFactory.createDatumReader(Avros.generics(Person.SCHEMA$));
- assertEquals(GenericDatumReader.class, datumReader.getClass());
- }
-
- @Test
- public void testCreateDatumReader_Reflect() {
- DatumReader<StringWrapper> datumReader = AvroFileReaderFactory.createDatumReader(Avros
- .reflects(StringWrapper.class));
- assertEquals(ReflectDatumReader.class, datumReader.getClass());
- }
-
- @Test
- public void testCreateDatumReader_Specific() {
- DatumReader<Person> datumReader = AvroFileReaderFactory.createDatumReader(Avros.records(Person.class));
- assertEquals(SpecificDatumReader.class, datumReader.getClass());
- }
-
- @Test
- public void testCreateDatumReader_ReflectAndSpecific() {
- Assume.assumeTrue(Avros.CAN_COMBINE_SPECIFIC_AND_REFLECT_SCHEMAS);
-
- DatumReader<Pair<Person, StringWrapper>> datumReader = AvroFileReaderFactory.createDatumReader(Avros.pairs(
- Avros.records(Person.class), Avros.reflects(StringWrapper.class)));
- assertEquals(ReflectDatumReader.class, datumReader.getClass());
- }
-
- @Test(expected = IllegalStateException.class)
- public void testCreateDatumReader_ReflectAndSpecific_NotSupported() {
- Assume.assumeTrue(!Avros.CAN_COMBINE_SPECIFIC_AND_REFLECT_SCHEMAS);
- AvroFileReaderFactory.createDatumReader(Avros.pairs(Avros.records(Person.class),
- Avros.reflects(StringWrapper.class)));
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/io/avro/AvroFileSourceTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/io/avro/AvroFileSourceTest.java b/crunch/src/test/java/org/apache/crunch/io/avro/AvroFileSourceTest.java
deleted file mode 100644
index ceef2b2..0000000
--- a/crunch/src/test/java/org/apache/crunch/io/avro/AvroFileSourceTest.java
+++ /dev/null
@@ -1,91 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.avro;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
-import java.io.File;
-import java.io.IOException;
-
-import org.apache.avro.generic.GenericData.Record;
-import org.apache.avro.mapred.AvroJob;
-import org.apache.crunch.test.Person;
-import org.apache.crunch.test.StringWrapper;
-import org.apache.crunch.types.avro.AvroType;
-import org.apache.crunch.types.avro.Avros;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.Job;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Test;
-
-public class AvroFileSourceTest {
-
- private Job job;
- File tempFile;
-
- @Before
- public void setUp() throws IOException {
- job = new Job();
- tempFile = File.createTempFile("test", ".avr");
- }
-
- @After
- public void tearDown() {
- tempFile.delete();
- }
-
- @Test
- public void testConfigureJob_SpecificData() throws IOException {
- AvroType<Person> avroSpecificType = Avros.records(Person.class);
- AvroFileSource<Person> personFileSource = new AvroFileSource<Person>(new Path(tempFile.getAbsolutePath()),
- avroSpecificType);
-
- personFileSource.configureSource(job, -1);
-
- assertFalse(job.getConfiguration().getBoolean(AvroJob.INPUT_IS_REFLECT, true));
- assertEquals(Person.SCHEMA$.toString(), job.getConfiguration().get(AvroJob.INPUT_SCHEMA));
- }
-
- @Test
- public void testConfigureJob_GenericData() throws IOException {
- AvroType<Record> avroGenericType = Avros.generics(Person.SCHEMA$);
- AvroFileSource<Record> personFileSource = new AvroFileSource<Record>(new Path(tempFile.getAbsolutePath()),
- avroGenericType);
-
- personFileSource.configureSource(job, -1);
-
- assertFalse(job.getConfiguration().getBoolean(AvroJob.INPUT_IS_REFLECT, true));
-
- }
-
- @Test
- public void testConfigureJob_ReflectData() throws IOException {
- AvroType<StringWrapper> avroReflectType = Avros.reflects(StringWrapper.class);
- AvroFileSource<StringWrapper> personFileSource = new AvroFileSource<StringWrapper>(new Path(
- tempFile.getAbsolutePath()), avroReflectType);
-
- personFileSource.configureSource(job, -1);
-
- assertTrue(job.getConfiguration().getBoolean(AvroJob.INPUT_IS_REFLECT, false));
-
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/lib/AvroIndexedRecordPartitionerTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/lib/AvroIndexedRecordPartitionerTest.java b/crunch/src/test/java/org/apache/crunch/lib/AvroIndexedRecordPartitionerTest.java
deleted file mode 100644
index 0dfed32..0000000
--- a/crunch/src/test/java/org/apache/crunch/lib/AvroIndexedRecordPartitionerTest.java
+++ /dev/null
@@ -1,98 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib;
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.avro.Schema;
-import org.apache.avro.generic.IndexedRecord;
-import org.apache.avro.mapred.AvroKey;
-import org.apache.avro.mapred.AvroValue;
-import org.apache.crunch.lib.join.JoinUtils.AvroIndexedRecordPartitioner;
-import org.junit.Before;
-import org.junit.Test;
-
-public class AvroIndexedRecordPartitionerTest {
-
- private AvroIndexedRecordPartitioner avroPartitioner;
-
- @Before
- public void setUp() {
- avroPartitioner = new AvroIndexedRecordPartitioner();
- }
-
- @Test
- public void testGetPartition() {
- IndexedRecord indexedRecord = new MockIndexedRecord(3);
- AvroKey<IndexedRecord> avroKey = new AvroKey<IndexedRecord>(indexedRecord);
-
- assertEquals(3, avroPartitioner.getPartition(avroKey, new AvroValue<Object>(), 5));
- assertEquals(1, avroPartitioner.getPartition(avroKey, new AvroValue<Object>(), 2));
- }
-
- @Test
- public void testGetPartition_NegativeHashValue() {
- IndexedRecord indexedRecord = new MockIndexedRecord(-3);
- AvroKey<IndexedRecord> avroKey = new AvroKey<IndexedRecord>(indexedRecord);
-
- assertEquals(3, avroPartitioner.getPartition(avroKey, new AvroValue<Object>(), 5));
- assertEquals(1, avroPartitioner.getPartition(avroKey, new AvroValue<Object>(), 2));
- }
-
- @Test
- public void testGetPartition_IntegerMinValue() {
- IndexedRecord indexedRecord = new MockIndexedRecord(Integer.MIN_VALUE);
- AvroKey<IndexedRecord> avroKey = new AvroKey<IndexedRecord>(indexedRecord);
-
- assertEquals(0, avroPartitioner.getPartition(avroKey, new AvroValue<Object>(), Integer.MAX_VALUE));
- }
-
- /**
- * Mock implementation of IndexedRecord to give us control over the hashCode.
- */
- static class MockIndexedRecord implements IndexedRecord {
-
- private Integer value;
-
- public MockIndexedRecord(Integer value) {
- this.value = value;
- }
-
- @Override
- public int hashCode() {
- return value.hashCode();
- }
-
- @Override
- public Schema getSchema() {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public Object get(int arg0) {
- return this.value;
- }
-
- @Override
- public void put(int arg0, Object arg1) {
- throw new UnsupportedOperationException();
- }
-
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/lib/CartesianTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/lib/CartesianTest.java b/crunch/src/test/java/org/apache/crunch/lib/CartesianTest.java
deleted file mode 100644
index b19097c..0000000
--- a/crunch/src/test/java/org/apache/crunch/lib/CartesianTest.java
+++ /dev/null
@@ -1,77 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib;
-
-import static org.junit.Assert.assertEquals;
-
-import java.util.Collections;
-import java.util.List;
-
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.impl.mem.MemPipeline;
-import org.apache.crunch.types.writable.Writables;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-
-public class CartesianTest {
-
- @Test
- public void testCartesianCollection_SingleValues() {
-
- PCollection<String> letters = MemPipeline.typedCollectionOf(Writables.strings(), "a", "b");
- PCollection<Integer> ints = MemPipeline.typedCollectionOf(Writables.ints(), 1, 2);
-
- PCollection<Pair<String, Integer>> cartesianProduct = Cartesian.cross(letters, ints);
-
- @SuppressWarnings("unchecked")
- List<Pair<String, Integer>> expectedResults = Lists.newArrayList(Pair.of("a", 1), Pair.of("a", 2), Pair.of("b", 1),
- Pair.of("b", 2));
- List<Pair<String, Integer>> actualResults = Lists.newArrayList(cartesianProduct.materialize());
- Collections.sort(actualResults);
-
- assertEquals(expectedResults, actualResults);
- }
-
- @Test
- public void testCartesianCollection_Tables() {
-
- PTable<String, Integer> leftTable = MemPipeline.typedTableOf(
- Writables.tableOf(Writables.strings(), Writables.ints()), "a", 1, "b", 2);
- PTable<String, Float> rightTable = MemPipeline.typedTableOf(
- Writables.tableOf(Writables.strings(), Writables.floats()), "A", 1.0f, "B", 2.0f);
-
- PTable<Pair<String, String>, Pair<Integer, Float>> cartesianProduct = Cartesian.cross(leftTable, rightTable);
-
- List<Pair<Pair<String, String>, Pair<Integer, Float>>> expectedResults = Lists.newArrayList();
- expectedResults.add(Pair.of(Pair.of("a", "A"), Pair.of(1, 1.0f)));
- expectedResults.add(Pair.of(Pair.of("a", "B"), Pair.of(1, 2.0f)));
- expectedResults.add(Pair.of(Pair.of("b", "A"), Pair.of(2, 1.0f)));
- expectedResults.add(Pair.of(Pair.of("b", "B"), Pair.of(2, 2.0f)));
-
- List<Pair<Pair<String, String>, Pair<Integer, Float>>> actualResults = Lists.newArrayList(cartesianProduct
- .materialize());
- Collections.sort(actualResults);
-
- assertEquals(expectedResults, actualResults);
-
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/lib/DistinctTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/lib/DistinctTest.java b/crunch/src/test/java/org/apache/crunch/lib/DistinctTest.java
deleted file mode 100644
index 8c0b3bf..0000000
--- a/crunch/src/test/java/org/apache/crunch/lib/DistinctTest.java
+++ /dev/null
@@ -1,52 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib;
-
-import static org.junit.Assert.assertEquals;
-
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.crunch.PCollection;
-import org.apache.crunch.impl.mem.MemPipeline;
-import org.apache.crunch.types.avro.Avros;
-import org.junit.Test;
-
-import com.google.common.collect.ImmutableSet;
-
-public class DistinctTest {
- private static final List<Integer> DATA = Arrays.asList(
- 17, 29, 17, 29, 17, 29, 36, 45, 17, 45, 36, 29
- );
-
- @Test
- public void testDistinct() {
- PCollection<Integer> input = MemPipeline.typedCollectionOf(Avros.ints(), DATA);
- Iterable<Integer> unique = Distinct.distinct(input).materialize();
-
- assertEquals(ImmutableSet.copyOf(DATA), ImmutableSet.copyOf(unique));
- }
-
- @Test
- public void testDistinctFlush() {
- PCollection<Integer> input = MemPipeline.typedCollectionOf(Avros.ints(), DATA);
- Iterable<Integer> unique = Distinct.distinct(input, 2).materialize();
-
- assertEquals(ImmutableSet.copyOf(DATA), ImmutableSet.copyOf(unique));
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/lib/SampleTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/lib/SampleTest.java b/crunch/src/test/java/org/apache/crunch/lib/SampleTest.java
deleted file mode 100644
index bd6fd81..0000000
--- a/crunch/src/test/java/org/apache/crunch/lib/SampleTest.java
+++ /dev/null
@@ -1,71 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib;
-
-import static org.junit.Assert.assertEquals;
-
-import java.util.List;
-import java.util.Map;
-
-import org.apache.crunch.PCollection;
-import org.apache.crunch.Pair;
-import org.apache.crunch.impl.mem.MemPipeline;
-import org.apache.crunch.types.writable.Writables;
-import org.junit.Test;
-
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.ImmutableMap;
-import com.google.common.collect.Maps;
-
-public class SampleTest {
- private PCollection<Pair<String, Double>> values = MemPipeline.typedCollectionOf(
- Writables.pairs(Writables.strings(), Writables.doubles()),
- ImmutableList.of(
- Pair.of("foo", 200.0),
- Pair.of("bar", 400.0),
- Pair.of("baz", 100.0),
- Pair.of("biz", 100.0)));
-
- @Test
- public void testWRS() throws Exception {
- Map<String, Integer> histogram = Maps.newHashMap();
-
- for (int i = 0; i < 100; i++) {
- PCollection<String> sample = Sample.weightedReservoirSample(values, 1, 1729L + i);
- for (String s : sample.materialize()) {
- if (!histogram.containsKey(s)) {
- histogram.put(s, 1);
- } else {
- histogram.put(s, 1 + histogram.get(s));
- }
- }
- }
-
- Map<String, Integer> expected = ImmutableMap.of(
- "foo", 24, "bar", 51, "baz", 13, "biz", 12);
- assertEquals(expected, histogram);
- }
-
- @Test
- public void testSample() {
- PCollection<Integer> pcollect = MemPipeline.collectionOf(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
- Iterable<Integer> sample = Sample.sample(pcollect, 123998L, 0.2).materialize();
- List<Integer> sampleValues = ImmutableList.copyOf(sample);
- assertEquals(ImmutableList.of(6, 7), sampleValues);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/lib/SecondarySortTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/lib/SecondarySortTest.java b/crunch/src/test/java/org/apache/crunch/lib/SecondarySortTest.java
deleted file mode 100644
index 933b986..0000000
--- a/crunch/src/test/java/org/apache/crunch/lib/SecondarySortTest.java
+++ /dev/null
@@ -1,53 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib;
-
-import static org.apache.crunch.types.avro.Avros.*;
-import static org.junit.Assert.assertEquals;
-
-import org.apache.crunch.DoFn;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.impl.mem.MemPipeline;
-import org.junit.Test;
-
-import com.google.common.collect.ImmutableList;
-
-
-public class SecondarySortTest {
- @Test
- public void testInMemory() throws Exception {
- PTable<Long, Pair<Long, String>> input = MemPipeline.typedTableOf(tableOf(longs(), pairs(longs(), strings())),
- 1729L, Pair.of(17L, "a"), 100L, Pair.of(29L, "b"), 1729L, Pair.of(29L, "c"));
- PCollection<String> letters = SecondarySort.sortAndApply(input, new StringifyFn(), strings());
- assertEquals(ImmutableList.of("b", "ac"), letters.materialize());
- }
-
- private static class StringifyFn extends DoFn<Pair<Long, Iterable<Pair<Long, String>>>, String> {
- @Override
- public void process(Pair<Long, Iterable<Pair<Long, String>>> input, Emitter<String> emitter) {
- StringBuilder sb = new StringBuilder();
- for (Pair<Long, String> p : input.second()) {
- sb.append(p.second());
- }
- emitter.emit(sb.toString());
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/lib/TupleWritablePartitionerTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/lib/TupleWritablePartitionerTest.java b/crunch/src/test/java/org/apache/crunch/lib/TupleWritablePartitionerTest.java
deleted file mode 100644
index 35ccc11..0000000
--- a/crunch/src/test/java/org/apache/crunch/lib/TupleWritablePartitionerTest.java
+++ /dev/null
@@ -1,68 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib;
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.crunch.lib.join.JoinUtils.TupleWritablePartitioner;
-import org.apache.crunch.types.writable.TupleWritable;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.io.Writable;
-import org.junit.Before;
-import org.junit.Test;
-
-public class TupleWritablePartitionerTest {
-
- private TupleWritablePartitioner tupleWritableParitioner;
-
- @Before
- public void setUp() {
- tupleWritableParitioner = new TupleWritablePartitioner();
- }
-
- @Test
- public void testGetPartition() {
- IntWritable intWritable = new IntWritable(3);
- TupleWritable key = new TupleWritable(new Writable[] { intWritable });
- assertEquals(3, tupleWritableParitioner.getPartition(key, NullWritable.get(), 5));
- assertEquals(1, tupleWritableParitioner.getPartition(key, NullWritable.get(), 2));
- }
-
- @Test
- public void testGetPartition_NegativeHashValue() {
- IntWritable intWritable = new IntWritable(-3);
- // Sanity check, if this doesn't work then the premise of this test is wrong
- assertEquals(-3, intWritable.hashCode());
-
- TupleWritable key = new TupleWritable(new Writable[] { intWritable });
- assertEquals(3, tupleWritableParitioner.getPartition(key, NullWritable.get(), 5));
- assertEquals(1, tupleWritableParitioner.getPartition(key, NullWritable.get(), 2));
- }
-
- @Test
- public void testGetPartition_IntegerMinValue() {
- IntWritable intWritable = new IntWritable(Integer.MIN_VALUE);
- // Sanity check, if this doesn't work then the premise of this test is wrong
- assertEquals(Integer.MIN_VALUE, intWritable.hashCode());
-
- TupleWritable key = new TupleWritable(new Writable[] { intWritable });
- assertEquals(0, tupleWritableParitioner.getPartition(key, NullWritable.get(), Integer.MAX_VALUE));
- }
-
-}
[02/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/lib/join/BrokenLeftAndOuterJoinTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/lib/join/BrokenLeftAndOuterJoinTest.java b/crunch/src/test/java/org/apache/crunch/lib/join/BrokenLeftAndOuterJoinTest.java
deleted file mode 100644
index 7e2e444..0000000
--- a/crunch/src/test/java/org/apache/crunch/lib/join/BrokenLeftAndOuterJoinTest.java
+++ /dev/null
@@ -1,90 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib.join;
-
-import static org.apache.crunch.test.StringWrapper.wrap;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.verify;
-import static org.mockito.Mockito.verifyNoMoreInteractions;
-
-import java.util.List;
-
-import org.apache.crunch.Emitter;
-import org.apache.crunch.Pair;
-import org.apache.crunch.test.CrunchTestSupport;
-import org.apache.crunch.test.StringWrapper;
-import org.apache.crunch.types.avro.Avros;
-import org.apache.hadoop.conf.Configuration;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-
-public class BrokenLeftAndOuterJoinTest {
-
- List<Pair<StringWrapper, String>> createValuePairList(StringWrapper leftValue, String rightValue) {
- Pair<StringWrapper, String> valuePair = Pair.of(leftValue, rightValue);
- List<Pair<StringWrapper, String>> valuePairList = Lists.newArrayList();
- valuePairList.add(valuePair);
- return valuePairList;
- }
-
- @Test
- public void testOuterJoin() {
- JoinFn<StringWrapper, StringWrapper, String> joinFn = new LeftOuterJoinFn<StringWrapper, StringWrapper, String>(
- Avros.reflects(StringWrapper.class),
- Avros.reflects(StringWrapper.class));
- joinFn.setContext(CrunchTestSupport.getTestContext(new Configuration()));
- joinFn.initialize();
- Emitter<Pair<StringWrapper, Pair<StringWrapper, String>>> emitter = mock(Emitter.class);
-
- StringWrapper key = new StringWrapper();
- StringWrapper leftValue = new StringWrapper();
- key.setValue("left-only");
- leftValue.setValue("left-only-left");
- joinFn.join(key, 0, createValuePairList(leftValue, null), emitter);
-
- key.setValue("right-only");
- joinFn.join(key, 1, createValuePairList(null, "right-only-right"), emitter);
-
- verify(emitter).emit(Pair.of(wrap("left-only"), Pair.of(wrap("left-only-left"), (String) null)));
- verifyNoMoreInteractions(emitter);
- }
-
- @Test
- public void testFullJoin() {
- JoinFn<StringWrapper, StringWrapper, String> joinFn = new FullOuterJoinFn<StringWrapper, StringWrapper, String>(
- Avros.reflects(StringWrapper.class),
- Avros.reflects(StringWrapper.class));
- joinFn.setContext(CrunchTestSupport.getTestContext(new Configuration()));
- joinFn.initialize();
- Emitter<Pair<StringWrapper, Pair<StringWrapper, String>>> emitter = mock(Emitter.class);
-
- StringWrapper key = new StringWrapper();
- StringWrapper leftValue = new StringWrapper();
- key.setValue("left-only");
- leftValue.setValue("left-only-left");
- joinFn.join(key, 0, createValuePairList(leftValue, null), emitter);
-
- key.setValue("right-only");
- joinFn.join(key, 1, createValuePairList(null, "right-only-right"), emitter);
-
- verify(emitter).emit(Pair.of(wrap("left-only"), Pair.of(wrap("left-only-left"), (String) null)));
- verify(emitter).emit(Pair.of(wrap("right-only"), Pair.of((StringWrapper)null, "right-only-right")));
- verifyNoMoreInteractions(emitter);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/lib/join/FullOuterJoinFnTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/lib/join/FullOuterJoinFnTest.java b/crunch/src/test/java/org/apache/crunch/lib/join/FullOuterJoinFnTest.java
deleted file mode 100644
index 5cf4f51..0000000
--- a/crunch/src/test/java/org/apache/crunch/lib/join/FullOuterJoinFnTest.java
+++ /dev/null
@@ -1,48 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib.join;
-
-import static org.apache.crunch.test.StringWrapper.wrap;
-import static org.mockito.Mockito.verify;
-import static org.mockito.Mockito.verifyNoMoreInteractions;
-
-import org.apache.crunch.Emitter;
-import org.apache.crunch.Pair;
-import org.apache.crunch.test.StringWrapper;
-import org.apache.crunch.types.avro.Avros;
-
-public class FullOuterJoinFnTest extends JoinFnTestBase {
-
- @Override
- protected void checkOutput(Emitter<Pair<StringWrapper, Pair<StringWrapper, String>>> emitter) {
- verify(emitter)
- .emit(Pair.of(wrap("left-only"), Pair.of(wrap("left-only-left"), (String) null)));
- verify(emitter).emit(Pair.of(wrap("both"), Pair.of(wrap("both-left"), "both-right")));
- verify(emitter).emit(
- Pair.of(wrap("right-only"), Pair.of((StringWrapper) null, "right-only-right")));
- verifyNoMoreInteractions(emitter);
- }
-
- @Override
- protected JoinFn<StringWrapper, StringWrapper, String> getJoinFn() {
- return new FullOuterJoinFn<StringWrapper, StringWrapper, String>(
- Avros.reflects(StringWrapper.class),
- Avros.reflects(StringWrapper.class));
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/lib/join/InnerJoinFnTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/lib/join/InnerJoinFnTest.java b/crunch/src/test/java/org/apache/crunch/lib/join/InnerJoinFnTest.java
deleted file mode 100644
index d2347de..0000000
--- a/crunch/src/test/java/org/apache/crunch/lib/join/InnerJoinFnTest.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib.join;
-
-import static org.apache.crunch.test.StringWrapper.wrap;
-import static org.mockito.Mockito.verify;
-import static org.mockito.Mockito.verifyNoMoreInteractions;
-
-import org.apache.crunch.Emitter;
-import org.apache.crunch.Pair;
-import org.apache.crunch.test.StringWrapper;
-import org.apache.crunch.types.avro.Avros;
-
-public class InnerJoinFnTest extends JoinFnTestBase {
-
- protected void checkOutput(Emitter<Pair<StringWrapper, Pair<StringWrapper, String>>> joinEmitter) {
- verify(joinEmitter).emit(Pair.of(wrap("both"), Pair.of(wrap("both-left"), "both-right")));
- verifyNoMoreInteractions(joinEmitter);
- }
-
- @Override
- protected JoinFn<StringWrapper, StringWrapper, String> getJoinFn() {
- return new InnerJoinFn<StringWrapper, StringWrapper, String>(
- Avros.reflects(StringWrapper.class),
- Avros.reflects(StringWrapper.class));
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/lib/join/JoinFnTestBase.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/lib/join/JoinFnTestBase.java b/crunch/src/test/java/org/apache/crunch/lib/join/JoinFnTestBase.java
deleted file mode 100644
index 9e4337f..0000000
--- a/crunch/src/test/java/org/apache/crunch/lib/join/JoinFnTestBase.java
+++ /dev/null
@@ -1,82 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib.join;
-
-import static org.mockito.Mockito.mock;
-
-import java.util.List;
-
-import org.apache.crunch.Emitter;
-import org.apache.crunch.Pair;
-import org.apache.crunch.test.CrunchTestSupport;
-import org.apache.crunch.test.StringWrapper;
-import org.apache.hadoop.conf.Configuration;
-import org.junit.Before;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-
-public abstract class JoinFnTestBase {
-
- private JoinFn<StringWrapper, StringWrapper, String> joinFn;
-
- private Emitter<Pair<StringWrapper, Pair<StringWrapper, String>>> emitter;
-
- // Avoid warnings on generic Emitter mock
- @SuppressWarnings("unchecked")
- @Before
- public void setUp() {
- joinFn = getJoinFn();
- joinFn.setContext(CrunchTestSupport.getTestContext(new Configuration()));
- joinFn.initialize();
- emitter = mock(Emitter.class);
- }
-
- @Test
- public void testJoin() {
-
- StringWrapper key = new StringWrapper();
- StringWrapper leftValue = new StringWrapper();
- key.setValue("left-only");
- leftValue.setValue("left-only-left");
- joinFn.join(key, 0, createValuePairList(leftValue, null), emitter);
-
- key.setValue("both");
- leftValue.setValue("both-left");
- joinFn.join(key, 0, createValuePairList(leftValue, null), emitter);
- joinFn.join(key, 1, createValuePairList(null, "both-right"), emitter);
-
- key.setValue("right-only");
- joinFn.join(key, 1, createValuePairList(null, "right-only-right"), emitter);
-
- checkOutput(emitter);
-
- }
-
- protected abstract void checkOutput(Emitter<Pair<StringWrapper, Pair<StringWrapper, String>>> emitter);
-
- protected abstract JoinFn<StringWrapper, StringWrapper, String> getJoinFn();
-
- protected List<Pair<StringWrapper, String>> createValuePairList(StringWrapper leftValue, String rightValue) {
- Pair<StringWrapper, String> valuePair = Pair.of(leftValue, rightValue);
- List<Pair<StringWrapper, String>> valuePairList = Lists.newArrayList();
- valuePairList.add(valuePair);
- return valuePairList;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/lib/join/LeftOuterJoinTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/lib/join/LeftOuterJoinTest.java b/crunch/src/test/java/org/apache/crunch/lib/join/LeftOuterJoinTest.java
deleted file mode 100644
index a90457e..0000000
--- a/crunch/src/test/java/org/apache/crunch/lib/join/LeftOuterJoinTest.java
+++ /dev/null
@@ -1,46 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib.join;
-
-import static org.apache.crunch.test.StringWrapper.wrap;
-import static org.mockito.Mockito.verify;
-import static org.mockito.Mockito.verifyNoMoreInteractions;
-
-import org.apache.crunch.Emitter;
-import org.apache.crunch.Pair;
-import org.apache.crunch.test.StringWrapper;
-import org.apache.crunch.types.avro.Avros;
-
-public class LeftOuterJoinTest extends JoinFnTestBase {
-
- @Override
- protected void checkOutput(Emitter<Pair<StringWrapper, Pair<StringWrapper, String>>> emitter) {
- verify(emitter)
- .emit(Pair.of(wrap("left-only"), Pair.of(wrap("left-only-left"), (String) null)));
- verify(emitter).emit(Pair.of(wrap("both"), Pair.of(wrap("both-left"), "both-right")));
- verifyNoMoreInteractions(emitter);
- }
-
- @Override
- protected JoinFn<StringWrapper, StringWrapper, String> getJoinFn() {
- return new LeftOuterJoinFn<StringWrapper, StringWrapper, String>(
- Avros.reflects(StringWrapper.class),
- Avros.reflects(StringWrapper.class));
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/lib/join/RightOuterJoinFnTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/lib/join/RightOuterJoinFnTest.java b/crunch/src/test/java/org/apache/crunch/lib/join/RightOuterJoinFnTest.java
deleted file mode 100644
index 7e41284..0000000
--- a/crunch/src/test/java/org/apache/crunch/lib/join/RightOuterJoinFnTest.java
+++ /dev/null
@@ -1,46 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib.join;
-
-import static org.apache.crunch.test.StringWrapper.wrap;
-import static org.mockito.Mockito.verify;
-import static org.mockito.Mockito.verifyNoMoreInteractions;
-
-import org.apache.crunch.Emitter;
-import org.apache.crunch.Pair;
-import org.apache.crunch.test.StringWrapper;
-import org.apache.crunch.types.avro.Avros;
-
-public class RightOuterJoinFnTest extends JoinFnTestBase {
-
- @Override
- protected void checkOutput(Emitter<Pair<StringWrapper, Pair<StringWrapper, String>>> emitter) {
- verify(emitter).emit(Pair.of(wrap("both"), Pair.of(wrap("both-left"), "both-right")));
- verify(emitter).emit(
- Pair.of(wrap("right-only"), Pair.of((StringWrapper) null, "right-only-right")));
- verifyNoMoreInteractions(emitter);
- }
-
- @Override
- protected JoinFn<StringWrapper, StringWrapper, String> getJoinFn() {
- return new RightOuterJoinFn<StringWrapper, StringWrapper, String>(
- Avros.reflects(StringWrapper.class),
- Avros.reflects(StringWrapper.class));
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/test/CountersTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/test/CountersTest.java b/crunch/src/test/java/org/apache/crunch/test/CountersTest.java
deleted file mode 100644
index 66f854e..0000000
--- a/crunch/src/test/java/org/apache/crunch/test/CountersTest.java
+++ /dev/null
@@ -1,70 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.test;
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.crunch.DoFn;
-import org.apache.crunch.Emitter;
-import org.apache.hadoop.conf.Configuration;
-import org.junit.Test;
-
-/**
- * A test to verify using counters inside of a unit test works. :)
- */
-public class CountersTest {
-
- public enum CT {
- ONE,
- TWO,
- THREE
- };
-
- public static class CTFn extends DoFn<String, String> {
- CTFn() {
- setContext(CrunchTestSupport.getTestContext(new Configuration()));
- }
-
- @Override
- public void process(String input, Emitter<String> emitter) {
- getCounter(CT.ONE).increment(1);
- getCounter(CT.TWO).increment(4);
- getCounter(CT.THREE).increment(7);
- }
- }
-
- @Test
- public void test() {
- CTFn fn = new CTFn();
- fn.process("foo", null);
- fn.process("bar", null);
- assertEquals(2L, TestCounters.getCounter(CT.ONE).getValue());
- assertEquals(8L, TestCounters.getCounter(CT.TWO).getValue());
- assertEquals(14L, TestCounters.getCounter(CT.THREE).getValue());
- }
-
- @Test
- public void secondTest() {
- CTFn fn = new CTFn();
- fn.process("foo", null);
- fn.process("bar", null);
- assertEquals(2L, TestCounters.getCounter(CT.ONE).getValue());
- assertEquals(8L, TestCounters.getCounter(CT.TWO).getValue());
- assertEquals(14L, TestCounters.getCounter(CT.THREE).getValue());
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/test/StringWrapper.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/test/StringWrapper.java b/crunch/src/test/java/org/apache/crunch/test/StringWrapper.java
deleted file mode 100644
index 34302b5..0000000
--- a/crunch/src/test/java/org/apache/crunch/test/StringWrapper.java
+++ /dev/null
@@ -1,102 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.test;
-
-import org.apache.crunch.MapFn;
-
-/**
- * Simple String wrapper for testing with Avro reflection.
- */
-public class StringWrapper implements Comparable<StringWrapper> {
-
- public static class StringToStringWrapperMapFn extends MapFn<String, StringWrapper> {
-
- @Override
- public StringWrapper map(String input) {
- return wrap(input);
- }
-
- }
-
- public static class StringWrapperToStringMapFn extends MapFn<StringWrapper, String> {
-
- @Override
- public String map(StringWrapper input) {
- return input.getValue();
- }
-
- }
-
- private String value;
-
- public StringWrapper() {
- this("");
- }
-
- public StringWrapper(String value) {
- this.value = value;
- }
-
- @Override
- public int compareTo(StringWrapper o) {
- return this.value.compareTo(o.value);
- }
-
- public String getValue() {
- return value;
- }
-
- public void setValue(String value) {
- this.value = value;
- }
-
- @Override
- public int hashCode() {
- final int prime = 31;
- int result = 1;
- result = prime * result + ((value == null) ? 0 : value.hashCode());
- return result;
- }
-
- @Override
- public boolean equals(Object obj) {
- if (this == obj)
- return true;
- if (obj == null)
- return false;
- if (getClass() != obj.getClass())
- return false;
- StringWrapper other = (StringWrapper) obj;
- if (value == null) {
- if (other.value != null)
- return false;
- } else if (!value.equals(other.value))
- return false;
- return true;
- }
-
- @Override
- public String toString() {
- return "StringWrapper [value=" + value + "]";
- }
-
- public static StringWrapper wrap(String value) {
- return new StringWrapper(value);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/types/CollectionDeepCopierTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/types/CollectionDeepCopierTest.java b/crunch/src/test/java/org/apache/crunch/types/CollectionDeepCopierTest.java
deleted file mode 100644
index bd7fcd7..0000000
--- a/crunch/src/test/java/org/apache/crunch/types/CollectionDeepCopierTest.java
+++ /dev/null
@@ -1,61 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotSame;
-import static org.junit.Assert.assertNull;
-
-import java.util.Collection;
-
-import org.apache.crunch.test.Person;
-import org.apache.crunch.types.avro.Avros;
-import org.apache.hadoop.conf.Configuration;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-
-public class CollectionDeepCopierTest {
-
- @Test
- public void testDeepCopy() {
- Person person = new Person();
- person.age = 42;
- person.name = "John Smith";
- person.siblingnames = Lists.<CharSequence> newArrayList();
-
- Collection<Person> personCollection = Lists.newArrayList(person);
- CollectionDeepCopier<Person> collectionDeepCopier = new CollectionDeepCopier<Person>(
- Avros.records(Person.class));
- collectionDeepCopier.initialize(new Configuration());
-
- Collection<Person> deepCopyCollection = collectionDeepCopier.deepCopy(personCollection);
-
- assertEquals(personCollection, deepCopyCollection);
- assertNotSame(personCollection.iterator().next(), deepCopyCollection.iterator().next());
- }
-
- @Test
- public void testNullDeepCopy() {
- CollectionDeepCopier<Person> collectionDeepCopier = new CollectionDeepCopier<Person>(
- Avros.records(Person.class));
- collectionDeepCopier.initialize(new Configuration());
- Collection<Person> nullCollection = null;
- assertNull(collectionDeepCopier.deepCopy(nullCollection));
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/types/MapDeepCopierTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/types/MapDeepCopierTest.java b/crunch/src/test/java/org/apache/crunch/types/MapDeepCopierTest.java
deleted file mode 100644
index c13e4a2..0000000
--- a/crunch/src/test/java/org/apache/crunch/types/MapDeepCopierTest.java
+++ /dev/null
@@ -1,63 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotSame;
-import static org.junit.Assert.assertNull;
-
-import java.util.Map;
-
-import org.apache.crunch.test.StringWrapper;
-import org.apache.crunch.types.avro.Avros;
-import org.apache.hadoop.conf.Configuration;
-import org.junit.Test;
-
-import com.google.common.collect.Maps;
-
-public class MapDeepCopierTest {
-
- @Test
- public void testDeepCopy() {
- StringWrapper stringWrapper = new StringWrapper("value");
- String key = "key";
- Map<String, StringWrapper> map = Maps.newHashMap();
- map.put(key, stringWrapper);
-
- MapDeepCopier<StringWrapper> deepCopier = new MapDeepCopier<StringWrapper>(
- Avros.reflects(StringWrapper.class));
- deepCopier.initialize(new Configuration());
- Map<String, StringWrapper> deepCopy = deepCopier.deepCopy(map);
-
- assertEquals(map, deepCopy);
- assertNotSame(map.get(key), deepCopy.get(key));
- }
-
- @Test
- public void testDeepCopy_Null() {
- Map<String, StringWrapper> map = null;
-
- MapDeepCopier<StringWrapper> deepCopier = new MapDeepCopier<StringWrapper>(
- Avros.reflects(StringWrapper.class));
- deepCopier.initialize(new Configuration());
- Map<String, StringWrapper> deepCopy = deepCopier.deepCopy(map);
-
- assertNull(deepCopy);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/types/PTypeUtilsTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/types/PTypeUtilsTest.java b/crunch/src/test/java/org/apache/crunch/types/PTypeUtilsTest.java
deleted file mode 100644
index e6fd90c..0000000
--- a/crunch/src/test/java/org/apache/crunch/types/PTypeUtilsTest.java
+++ /dev/null
@@ -1,89 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-
-import java.util.Collection;
-
-import org.apache.avro.Schema;
-import org.apache.avro.util.Utf8;
-import org.apache.crunch.Tuple3;
-import org.apache.crunch.TupleN;
-import org.apache.crunch.types.avro.AvroType;
-import org.apache.crunch.types.avro.AvroTypeFamily;
-import org.apache.crunch.types.avro.Avros;
-import org.apache.crunch.types.writable.WritableTypeFamily;
-import org.apache.crunch.types.writable.Writables;
-import org.apache.hadoop.io.Text;
-import org.junit.Assert;
-import org.junit.Test;
-
-public class PTypeUtilsTest {
- @Test
- public void testPrimitives() {
- assertEquals(Avros.strings(), AvroTypeFamily.getInstance().as(Writables.strings()));
- Assert.assertEquals(Writables.doubles(), WritableTypeFamily.getInstance().as(Avros.doubles()));
- }
-
- @Test
- public void testTuple3() {
- PType<Tuple3<String, Float, Integer>> t = Writables.triples(Writables.strings(), Writables.floats(),
- Writables.ints());
- PType<Tuple3<String, Float, Integer>> at = AvroTypeFamily.getInstance().as(t);
- assertEquals(Avros.strings(), at.getSubTypes().get(0));
- assertEquals(Avros.floats(), at.getSubTypes().get(1));
- assertEquals(Avros.ints(), at.getSubTypes().get(2));
- }
-
- @Test
- public void testTupleN() {
- PType<TupleN> t = Avros.tuples(Avros.strings(), Avros.floats(), Avros.ints());
- PType<TupleN> wt = WritableTypeFamily.getInstance().as(t);
- assertEquals(Writables.strings(), wt.getSubTypes().get(0));
- assertEquals(Writables.floats(), wt.getSubTypes().get(1));
- assertEquals(Writables.ints(), wt.getSubTypes().get(2));
- }
-
- @Test
- public void testWritableCollections() {
- PType<Collection<String>> t = Avros.collections(Avros.strings());
- t = WritableTypeFamily.getInstance().as(t);
- assertEquals(Writables.strings(), t.getSubTypes().get(0));
- }
-
- @Test
- public void testAvroCollections() {
- PType<Collection<Double>> t = Writables.collections(Writables.doubles());
- t = AvroTypeFamily.getInstance().as(t);
- assertEquals(Avros.doubles(), t.getSubTypes().get(0));
- }
-
- @Test
- public void testAvroRegistered() {
- AvroType<Utf8> at = new AvroType<Utf8>(Utf8.class, Schema.create(Schema.Type.STRING), new DeepCopier.NoOpDeepCopier<Utf8>());
- Avros.register(Utf8.class, at);
- assertEquals(at, Avros.records(Utf8.class));
- }
-
- @Test
- public void testWritableBuiltin() {
- assertNotNull(Writables.records(Text.class));
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/types/PTypesTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/types/PTypesTest.java b/crunch/src/test/java/org/apache/crunch/types/PTypesTest.java
deleted file mode 100644
index d7c8811..0000000
--- a/crunch/src/test/java/org/apache/crunch/types/PTypesTest.java
+++ /dev/null
@@ -1,34 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types;
-
-import static org.junit.Assert.assertEquals;
-
-import java.util.UUID;
-
-import org.apache.crunch.types.avro.AvroTypeFamily;
-import org.junit.Test;
-
-public class PTypesTest {
- @Test
- public void testUUID() throws Exception {
- UUID uuid = UUID.randomUUID();
- PType<UUID> ptype = PTypes.uuid(AvroTypeFamily.getInstance());
- assertEquals(uuid, ptype.getInputMapFn().map(ptype.getOutputMapFn().map(uuid)));
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/types/TupleDeepCopierTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/types/TupleDeepCopierTest.java b/crunch/src/test/java/org/apache/crunch/types/TupleDeepCopierTest.java
deleted file mode 100644
index e46a680..0000000
--- a/crunch/src/test/java/org/apache/crunch/types/TupleDeepCopierTest.java
+++ /dev/null
@@ -1,77 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotSame;
-import static org.junit.Assert.assertNull;
-
-import org.apache.crunch.Pair;
-import org.apache.crunch.test.Person;
-import org.apache.crunch.types.avro.Avros;
-import org.apache.hadoop.conf.Configuration;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-
-public class TupleDeepCopierTest {
-
- @Test
- public void testDeepCopy_Pair() {
- Person person = new Person();
- person.name = "John Doe";
- person.age = 42;
- person.siblingnames = Lists.<CharSequence> newArrayList();
-
- Pair<Integer, Person> inputPair = Pair.of(1, person);
- DeepCopier<Pair> deepCopier = new TupleDeepCopier<Pair>(Pair.class, Avros.ints(),
- Avros.records(Person.class));
-
- deepCopier.initialize(new Configuration());
- Pair<Integer, Person> deepCopyPair = deepCopier.deepCopy(inputPair);
-
- assertEquals(inputPair, deepCopyPair);
- assertNotSame(inputPair.second(), deepCopyPair.second());
- }
-
- @Test
- public void testDeepCopy_PairContainingNull() {
-
- Pair<Integer, Person> inputPair = Pair.of(1, null);
- DeepCopier<Pair> deepCopier = new TupleDeepCopier<Pair>(Pair.class, Avros.ints(),
- Avros.records(Person.class));
-
- deepCopier.initialize(new Configuration());
- Pair<Integer, Person> deepCopyPair = deepCopier.deepCopy(inputPair);
-
- assertEquals(inputPair, deepCopyPair);
- }
-
- @Test
- public void testDeepCopy_NullPair() {
- Pair<Integer, Person> inputPair = null;
- DeepCopier<Pair> deepCopier = new TupleDeepCopier<Pair>(Pair.class, Avros.ints(),
- Avros.records(Person.class));
-
- deepCopier.initialize(new Configuration());
- Pair<Integer, Person> deepCopyPair = deepCopier.deepCopy(inputPair);
-
- assertNull(deepCopyPair);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/types/TupleFactoryTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/types/TupleFactoryTest.java b/crunch/src/test/java/org/apache/crunch/types/TupleFactoryTest.java
deleted file mode 100644
index 0726be2..0000000
--- a/crunch/src/test/java/org/apache/crunch/types/TupleFactoryTest.java
+++ /dev/null
@@ -1,69 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types;
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.crunch.Pair;
-import org.apache.crunch.Tuple;
-import org.apache.crunch.Tuple3;
-import org.apache.crunch.Tuple4;
-import org.apache.crunch.TupleN;
-import org.junit.Test;
-
-public class TupleFactoryTest {
-
- @Test
- public void testGetTupleFactory_Pair() {
- assertEquals(TupleFactory.PAIR, TupleFactory.getTupleFactory(Pair.class));
- }
-
- @Test
- public void testGetTupleFactory_Tuple3() {
- assertEquals(TupleFactory.TUPLE3, TupleFactory.getTupleFactory(Tuple3.class));
- }
-
- @Test
- public void testGetTupleFactory_Tuple4() {
- assertEquals(TupleFactory.TUPLE4, TupleFactory.getTupleFactory(Tuple4.class));
- }
-
- @Test
- public void testGetTupleFactory_TupleN() {
- assertEquals(TupleFactory.TUPLEN, TupleFactory.getTupleFactory(TupleN.class));
- }
-
- public void testGetTupleFactory_CustomTupleClass() {
- TupleFactory<CustomTupleImplementation> customTupleFactory = TupleFactory.create(CustomTupleImplementation.class);
- assertEquals(customTupleFactory, TupleFactory.getTupleFactory(CustomTupleImplementation.class));
- }
-
- private static class CustomTupleImplementation implements Tuple {
-
- @Override
- public Object get(int index) {
- return null;
- }
-
- @Override
- public int size() {
- return 0;
- }
-
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/types/avro/AvroDeepCopierTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/types/avro/AvroDeepCopierTest.java b/crunch/src/test/java/org/apache/crunch/types/avro/AvroDeepCopierTest.java
deleted file mode 100644
index 37c13c0..0000000
--- a/crunch/src/test/java/org/apache/crunch/types/avro/AvroDeepCopierTest.java
+++ /dev/null
@@ -1,107 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.avro;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotSame;
-import static org.junit.Assert.assertNull;
-
-import java.util.List;
-
-import org.apache.avro.generic.GenericData.Record;
-import org.apache.crunch.test.Person;
-import org.apache.crunch.types.avro.AvroDeepCopier.AvroSpecificDeepCopier;
-import org.apache.hadoop.conf.Configuration;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-
-public class AvroDeepCopierTest {
-
- @Test
- public void testDeepCopySpecific() {
- Person person = new Person();
- person.name = "John Doe";
- person.age = 42;
- person.siblingnames = Lists.<CharSequence> newArrayList();
-
- Person deepCopyPerson = new AvroSpecificDeepCopier<Person>(Person.class, Person.SCHEMA$)
- .deepCopy(person);
-
- assertEquals(person, deepCopyPerson);
- assertNotSame(person, deepCopyPerson);
- }
-
- @Test
- public void testDeepCopyGeneric() {
- Record record = new Record(Person.SCHEMA$);
- record.put("name", "John Doe");
- record.put("age", 42);
- record.put("siblingnames", Lists.newArrayList());
-
- Record deepCopyRecord = new AvroDeepCopier.AvroGenericDeepCopier(Person.SCHEMA$)
- .deepCopy(record);
-
- assertEquals(record, deepCopyRecord);
- assertNotSame(record, deepCopyRecord);
- }
-
- static class ReflectedPerson {
- String name;
- int age;
- List<String> siblingnames;
-
- @Override
- public boolean equals(Object other) {
- if (other == null || !(other instanceof ReflectedPerson)) {
- return false;
- }
- ReflectedPerson that = (ReflectedPerson) other;
- return name.equals(that.name) && age == that.age && siblingnames.equals(that.siblingnames);
- }
- }
-
- @Test
- public void testDeepCopyReflect() {
- ReflectedPerson person = new ReflectedPerson();
- person.name = "John Doe";
- person.age = 42;
- person.siblingnames = Lists.newArrayList();
-
- AvroDeepCopier<ReflectedPerson> avroDeepCopier = new AvroDeepCopier.AvroReflectDeepCopier<ReflectedPerson>(
- ReflectedPerson.class, Avros.reflects(ReflectedPerson.class).getSchema());
- avroDeepCopier.initialize(new Configuration());
-
- ReflectedPerson deepCopyPerson = avroDeepCopier.deepCopy(person);
-
- assertEquals(person, deepCopyPerson);
- assertNotSame(person, deepCopyPerson);
-
- }
-
- @Test
- public void testDeepCopy_Null() {
- Person person = null;
-
- Person deepCopyPerson = new AvroSpecificDeepCopier<Person>(Person.class, Person.SCHEMA$)
- .deepCopy(person);
-
- assertNull(deepCopyPerson);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/types/avro/AvroGroupedTableTypeTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/types/avro/AvroGroupedTableTypeTest.java b/crunch/src/test/java/org/apache/crunch/types/avro/AvroGroupedTableTypeTest.java
deleted file mode 100644
index db9ebdc..0000000
--- a/crunch/src/test/java/org/apache/crunch/types/avro/AvroGroupedTableTypeTest.java
+++ /dev/null
@@ -1,60 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.avro;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotSame;
-import static org.junit.Assert.assertSame;
-
-import java.util.List;
-
-import org.apache.crunch.Pair;
-import org.apache.crunch.test.Person;
-import org.apache.crunch.types.PGroupedTableType;
-import org.apache.hadoop.conf.Configuration;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-
-public class AvroGroupedTableTypeTest {
-
- @Test
- public void testGetDetachedValue() {
- Integer integerValue = 42;
- Person person = new Person();
- person.name = "John Doe";
- person.age = 42;
- person.siblingnames = Lists.<CharSequence> newArrayList();
-
- Iterable<Person> inputPersonIterable = Lists.newArrayList(person);
- Pair<Integer, Iterable<Person>> pair = Pair.of(integerValue, inputPersonIterable);
-
- PGroupedTableType<Integer, Person> groupedTableType = Avros.tableOf(Avros.ints(),
- Avros.specifics(Person.class)).getGroupedTableType();
- groupedTableType.initialize(new Configuration());
-
- Pair<Integer, Iterable<Person>> detachedPair = groupedTableType.getDetachedValue(pair);
-
- assertSame(integerValue, detachedPair.first());
- List<Person> personList = Lists.newArrayList(detachedPair.second());
- assertEquals(inputPersonIterable, personList);
- assertNotSame(person, personList.get(0));
-
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/types/avro/AvroTableTypeTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/types/avro/AvroTableTypeTest.java b/crunch/src/test/java/org/apache/crunch/types/avro/AvroTableTypeTest.java
deleted file mode 100644
index 35d4e5b..0000000
--- a/crunch/src/test/java/org/apache/crunch/types/avro/AvroTableTypeTest.java
+++ /dev/null
@@ -1,72 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.avro;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNotSame;
-import static org.junit.Assert.assertSame;
-import static org.junit.Assert.assertTrue;
-
-import org.apache.crunch.Pair;
-import org.apache.crunch.test.Person;
-import org.apache.crunch.test.StringWrapper;
-import org.apache.hadoop.conf.Configuration;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-
-public class AvroTableTypeTest {
-
- @Test
- public void testGetDetachedValue() {
- Integer integerValue = 42;
- Person person = new Person();
- person.name = "John Doe";
- person.age = 42;
- person.siblingnames = Lists.<CharSequence> newArrayList();
-
- Pair<Integer, Person> pair = Pair.of(integerValue, person);
-
- AvroTableType<Integer, Person> tableType = Avros.tableOf(Avros.ints(),
- Avros.specifics(Person.class));
- tableType.initialize(new Configuration());
-
- Pair<Integer, Person> detachedPair = tableType.getDetachedValue(pair);
-
- assertSame(integerValue, detachedPair.first());
- assertEquals(person, detachedPair.second());
- assertNotSame(person, detachedPair.second());
- }
-
- @Test
- public void testIsReflect_ContainsReflectKey() {
- assertTrue(Avros.tableOf(Avros.reflects(StringWrapper.class), Avros.ints()).hasReflect());
- }
-
- @Test
- public void testIsReflect_ContainsReflectValue() {
- assertTrue(Avros.tableOf(Avros.ints(), Avros.reflects(StringWrapper.class)).hasReflect());
- }
-
- @Test
- public void testReflect_NoReflectKeyOrValue() {
- assertFalse(Avros.tableOf(Avros.ints(), Avros.ints()).hasReflect());
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/types/avro/AvroTypeTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/types/avro/AvroTypeTest.java b/crunch/src/test/java/org/apache/crunch/types/avro/AvroTypeTest.java
deleted file mode 100644
index a874c63..0000000
--- a/crunch/src/test/java/org/apache/crunch/types/avro/AvroTypeTest.java
+++ /dev/null
@@ -1,279 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.avro;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNotSame;
-import static org.junit.Assert.assertSame;
-import static org.junit.Assert.assertTrue;
-
-import java.util.Collection;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.avro.generic.GenericData;
-import org.apache.avro.generic.GenericData.Record;
-import org.apache.crunch.Pair;
-import org.apache.crunch.TupleN;
-import org.apache.crunch.test.Person;
-import org.apache.crunch.test.StringWrapper;
-import org.apache.hadoop.conf.Configuration;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-
-public class AvroTypeTest {
-
- @Test
- public void testIsSpecific_SpecificData() {
- assertTrue(Avros.records(Person.class).hasSpecific());
- }
-
- @Test
- public void testIsGeneric_SpecificData() {
- assertFalse(Avros.records(Person.class).isGeneric());
- }
-
- @Test
- public void testIsSpecific_GenericData() {
- assertFalse(Avros.generics(Person.SCHEMA$).hasSpecific());
- }
-
- @Test
- public void testIsGeneric_GenericData() {
- assertTrue(Avros.generics(Person.SCHEMA$).isGeneric());
- }
-
- @Test
- public void testIsSpecific_NonAvroClass() {
- assertFalse(Avros.ints().hasSpecific());
- }
-
- @Test
- public void testIsGeneric_NonAvroClass() {
- assertFalse(Avros.ints().isGeneric());
- }
-
- @Test
- public void testIsSpecific_SpecificAvroTable() {
- assertTrue(Avros.tableOf(Avros.strings(), Avros.records(Person.class)).hasSpecific());
- }
-
- @Test
- public void testIsGeneric_SpecificAvroTable() {
- assertFalse(Avros.tableOf(Avros.strings(), Avros.records(Person.class)).isGeneric());
- }
-
- @Test
- public void testIsSpecific_GenericAvroTable() {
- assertFalse(Avros.tableOf(Avros.strings(), Avros.generics(Person.SCHEMA$)).hasSpecific());
- }
-
- @Test
- public void testIsGeneric_GenericAvroTable() {
- assertFalse(Avros.tableOf(Avros.strings(), Avros.generics(Person.SCHEMA$)).isGeneric());
- }
-
- @Test
- public void testIsReflect_GenericType() {
- assertFalse(Avros.generics(Person.SCHEMA$).hasReflect());
- }
-
- @Test
- public void testIsReflect_SpecificType() {
- assertFalse(Avros.records(Person.class).hasReflect());
- }
-
- @Test
- public void testIsReflect_ReflectSimpleType() {
- assertTrue(Avros.reflects(StringWrapper.class).hasReflect());
- }
-
- @Test
- public void testIsReflect_NonReflectSubType() {
- assertFalse(Avros.pairs(Avros.ints(), Avros.ints()).hasReflect());
- }
-
- @Test
- public void testIsReflect_ReflectSubType() {
- assertTrue(Avros.pairs(Avros.ints(), Avros.reflects(StringWrapper.class)).hasReflect());
- }
-
- @Test
- public void testIsReflect_TableOfNonReflectTypes() {
- assertFalse(Avros.tableOf(Avros.ints(), Avros.strings()).hasReflect());
- }
-
- @Test
- public void testIsReflect_TableWithReflectKey() {
- assertTrue(Avros.tableOf(Avros.reflects(StringWrapper.class), Avros.ints()).hasReflect());
- }
-
- @Test
- public void testIsReflect_TableWithReflectValue() {
- assertTrue(Avros.tableOf(Avros.ints(), Avros.reflects(StringWrapper.class)).hasReflect());
- }
-
- @Test
- public void testReflect_CollectionContainingReflectValue() {
- assertTrue(Avros.collections(Avros.reflects(StringWrapper.class)).hasReflect());
- }
-
- @Test
- public void testReflect_CollectionNotContainingReflectValue() {
- assertFalse(Avros.collections(Avros.generics(Person.SCHEMA$)).hasReflect());
- }
-
- @Test
- public void testGetDetachedValue_AlreadyMappedAvroType() {
- Integer value = 42;
- AvroType<Integer> intType = Avros.ints();
- intType.initialize(new Configuration());
- Integer detachedValue = intType.getDetachedValue(value);
- assertSame(value, detachedValue);
- }
-
- @Test
- public void testGetDetachedValue_GenericAvroType() {
- AvroType<Record> genericType = Avros.generics(Person.SCHEMA$);
- genericType.initialize(new Configuration());
- GenericData.Record record = new GenericData.Record(Person.SCHEMA$);
- record.put("name", "name value");
- record.put("age", 42);
- record.put("siblingnames", Lists.newArrayList());
-
- Record detachedRecord = genericType.getDetachedValue(record);
- assertEquals(record, detachedRecord);
- assertNotSame(record, detachedRecord);
- }
-
- private Person createPerson() {
- Person person = new Person();
- person.name = "name value";
- person.age = 42;
- person.siblingnames = Lists.<CharSequence> newArrayList();
- return person;
- }
-
- @Test
- public void testGetDetachedValue_SpecificAvroType() {
- AvroType<Person> specificType = Avros.specifics(Person.class);
- specificType.initialize(new Configuration());
- Person person = createPerson();
- Person detachedPerson = specificType.getDetachedValue(person);
- assertEquals(person, detachedPerson);
- assertNotSame(person, detachedPerson);
- }
-
- @Test(expected = IllegalStateException.class)
- public void testGetDetachedValue_NotInitialized() {
- AvroType<Person> specificType = Avros.specifics(Person.class);
- Person person = createPerson();
- specificType.getDetachedValue(person);
- }
-
- static class ReflectedPerson {
- String name;
- int age;
- List<String> siblingnames;
-
- @Override
- public boolean equals(Object other) {
- if (other == null || !(other instanceof ReflectedPerson)) {
- return false;
- }
- ReflectedPerson that = (ReflectedPerson) other;
- return name.equals(that.name) && age == that.age && siblingnames.equals(that.siblingnames);
- }
- }
-
- @Test
- public void testGetDetachedValue_ReflectAvroType() {
- AvroType<ReflectedPerson> reflectType = Avros.reflects(ReflectedPerson.class);
- reflectType.initialize(new Configuration());
- ReflectedPerson rp = new ReflectedPerson();
- rp.name = "josh";
- rp.age = 32;
- rp.siblingnames = Lists.newArrayList();
- ReflectedPerson detached = reflectType.getDetachedValue(rp);
- assertEquals(rp, detached);
- assertNotSame(rp, detached);
- }
-
- @Test
- public void testGetDetachedValue_Pair() {
- Person person = createPerson();
- AvroType<Pair<Integer, Person>> pairType = Avros.pairs(Avros.ints(),
- Avros.records(Person.class));
- pairType.initialize(new Configuration());
-
- Pair<Integer, Person> inputPair = Pair.of(1, person);
- Pair<Integer, Person> detachedPair = pairType.getDetachedValue(inputPair);
-
- assertEquals(inputPair, detachedPair);
- assertNotSame(inputPair.second(), detachedPair.second());
- }
-
- @Test
- public void testGetDetachedValue_Collection() {
- Person person = createPerson();
- List<Person> personList = Lists.newArrayList(person);
-
- AvroType<Collection<Person>> collectionType = Avros.collections(Avros.records(Person.class));
- collectionType.initialize(new Configuration());
-
- Collection<Person> detachedCollection = collectionType.getDetachedValue(personList);
-
- assertEquals(personList, detachedCollection);
- Person detachedPerson = detachedCollection.iterator().next();
-
- assertNotSame(person, detachedPerson);
- }
-
- @Test
- public void testGetDetachedValue_Map() {
- String key = "key";
- Person value = createPerson();
-
- Map<String, Person> stringPersonMap = Maps.newHashMap();
- stringPersonMap.put(key, value);
-
- AvroType<Map<String, Person>> mapType = Avros.maps(Avros.records(Person.class));
- mapType.initialize(new Configuration());
-
- Map<String, Person> detachedMap = mapType.getDetachedValue(stringPersonMap);
-
- assertEquals(stringPersonMap, detachedMap);
- assertNotSame(value, detachedMap.get(key));
- }
-
- @Test
- public void testGetDetachedValue_TupleN() {
- Person person = createPerson();
- AvroType<TupleN> ptype = Avros.tuples(Avros.records(Person.class));
- ptype.initialize(new Configuration());
- TupleN tuple = new TupleN(person);
- TupleN detachedTuple = ptype.getDetachedValue(tuple);
-
- assertEquals(tuple, detachedTuple);
- assertNotSame(person, detachedTuple.get(0));
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/types/avro/AvrosTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/types/avro/AvrosTest.java b/crunch/src/test/java/org/apache/crunch/types/avro/AvrosTest.java
deleted file mode 100644
index 5622a56..0000000
--- a/crunch/src/test/java/org/apache/crunch/types/avro/AvrosTest.java
+++ /dev/null
@@ -1,325 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.avro;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNotSame;
-import static org.junit.Assert.assertTrue;
-
-import java.nio.ByteBuffer;
-import java.util.Collection;
-import java.util.Collections;
-
-import org.apache.avro.Schema;
-import org.apache.avro.Schema.Type;
-import org.apache.avro.generic.GenericData;
-import org.apache.avro.generic.GenericData.Record;
-import org.apache.avro.reflect.ReflectData;
-import org.apache.avro.util.Utf8;
-import org.apache.crunch.Pair;
-import org.apache.crunch.Tuple3;
-import org.apache.crunch.Tuple4;
-import org.apache.crunch.TupleN;
-import org.apache.crunch.test.CrunchTestSupport;
-import org.apache.crunch.test.Person;
-import org.apache.crunch.test.StringWrapper;
-import org.apache.crunch.types.DeepCopier;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.mapreduce.TaskInputOutputContext;
-import org.junit.Test;
-
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Lists;
-
-/**
- * TODO test Avros.register and Avros.containers
- */
-public class AvrosTest {
-
- @Test
- public void testNulls() throws Exception {
- Void n = null;
- testInputOutputFn(Avros.nulls(), n, n);
- }
-
- @Test
- public void testStrings() throws Exception {
- String s = "abc";
- Utf8 w = new Utf8(s);
- testInputOutputFn(Avros.strings(), s, w);
- }
-
- @Test
- public void testInts() throws Exception {
- int j = 55;
- testInputOutputFn(Avros.ints(), j, j);
- }
-
- @Test
- public void testLongs() throws Exception {
- long j = Long.MAX_VALUE;
- testInputOutputFn(Avros.longs(), j, j);
- }
-
- @Test
- public void testFloats() throws Exception {
- float j = Float.MIN_VALUE;
- testInputOutputFn(Avros.floats(), j, j);
- }
-
- @Test
- public void testDoubles() throws Exception {
- double j = Double.MIN_VALUE;
- testInputOutputFn(Avros.doubles(), j, j);
- }
-
- @Test
- public void testBooleans() throws Exception {
- boolean j = true;
- testInputOutputFn(Avros.booleans(), j, j);
- }
-
- @Test
- public void testBytes() throws Exception {
- byte[] bytes = new byte[] { 17, 26, -98 };
- ByteBuffer bb = ByteBuffer.wrap(bytes);
- testInputOutputFn(Avros.bytes(), bb, bb);
- }
-
- @Test
- public void testCollections() throws Exception {
- Collection<String> j = Lists.newArrayList();
- j.add("a");
- j.add("b");
- Schema collectionSchema = Schema.createArray(Schema.createUnion(ImmutableList.of(Avros.strings().getSchema(),
- Schema.create(Type.NULL))));
- GenericData.Array<Utf8> w = new GenericData.Array<Utf8>(2, collectionSchema);
- w.add(new Utf8("a"));
- w.add(new Utf8("b"));
- testInputOutputFn(Avros.collections(Avros.strings()), j, w);
- }
-
- @Test
- public void testNestedTables() throws Exception {
- PTableType<Long, Long> pll = Avros.tableOf(Avros.longs(), Avros.longs());
- String schema = Avros.tableOf(pll, Avros.strings()).getSchema().toString();
- assertNotNull(schema);
- }
-
- @Test
- public void testPairs() throws Exception {
- AvroType<Pair<String, String>> at = Avros.pairs(Avros.strings(), Avros.strings());
- Pair<String, String> j = Pair.of("a", "b");
- GenericData.Record w = new GenericData.Record(at.getSchema());
- w.put(0, new Utf8("a"));
- w.put(1, new Utf8("b"));
- testInputOutputFn(at, j, w);
- }
-
- @Test
- public void testPairEquals() throws Exception {
- AvroType<Pair<Long, ByteBuffer>> at1 = Avros.pairs(Avros.longs(), Avros.bytes());
- AvroType<Pair<Long, ByteBuffer>> at2 = Avros.pairs(Avros.longs(), Avros.bytes());
- assertEquals(at1, at2);
- assertEquals(at1.hashCode(), at2.hashCode());
- }
-
- @Test
- @SuppressWarnings("rawtypes")
- public void testTriples() throws Exception {
- AvroType at = Avros.triples(Avros.strings(), Avros.strings(), Avros.strings());
- Tuple3 j = Tuple3.of("a", "b", "c");
- GenericData.Record w = new GenericData.Record(at.getSchema());
- w.put(0, new Utf8("a"));
- w.put(1, new Utf8("b"));
- w.put(2, new Utf8("c"));
- testInputOutputFn(at, j, w);
- }
-
- @Test
- @SuppressWarnings("rawtypes")
- public void testQuads() throws Exception {
- AvroType at = Avros.quads(Avros.strings(), Avros.strings(), Avros.strings(), Avros.strings());
- Tuple4 j = Tuple4.of("a", "b", "c", "d");
- GenericData.Record w = new GenericData.Record(at.getSchema());
- w.put(0, new Utf8("a"));
- w.put(1, new Utf8("b"));
- w.put(2, new Utf8("c"));
- w.put(3, new Utf8("d"));
- testInputOutputFn(at, j, w);
- }
-
- @Test
- @SuppressWarnings("rawtypes")
- public void testTupleN() throws Exception {
- AvroType at = Avros.tuples(Avros.strings(), Avros.strings(), Avros.strings(), Avros.strings(), Avros.strings());
- TupleN j = new TupleN("a", "b", "c", "d", "e");
- GenericData.Record w = new GenericData.Record(at.getSchema());
- w.put(0, new Utf8("a"));
- w.put(1, new Utf8("b"));
- w.put(2, new Utf8("c"));
- w.put(3, new Utf8("d"));
- w.put(4, new Utf8("e"));
- testInputOutputFn(at, j, w);
-
- }
-
- @Test
- @SuppressWarnings("rawtypes")
- public void testWritables() throws Exception {
- AvroType at = Avros.writables(LongWritable.class);
-
- TaskInputOutputContext<?, ?, ?, ?> testContext = CrunchTestSupport.getTestContext(new Configuration());
- at.getInputMapFn().setContext(testContext);
- at.getInputMapFn().initialize();
- at.getOutputMapFn().setContext(testContext);
- at.getOutputMapFn().initialize();
-
- LongWritable lw = new LongWritable(1729L);
- assertEquals(lw, at.getInputMapFn().map(at.getOutputMapFn().map(lw)));
- }
-
- @Test
- @SuppressWarnings("rawtypes")
- public void testTableOf() throws Exception {
- AvroType at = Avros.tableOf(Avros.strings(), Avros.strings());
- Pair<String, String> j = Pair.of("a", "b");
- org.apache.avro.mapred.Pair w = new org.apache.avro.mapred.Pair(at.getSchema());
- w.put(0, new Utf8("a"));
- w.put(1, new Utf8("b"));
- // TODO update this after resolving the o.a.a.m.Pair.equals issue
- initialize(at);
- assertEquals(j, at.getInputMapFn().map(w));
- org.apache.avro.mapred.Pair converted = (org.apache.avro.mapred.Pair) at.getOutputMapFn().map(j);
- assertEquals(w.key(), converted.key());
- assertEquals(w.value(), converted.value());
- }
-
- private static void initialize(PType ptype) {
- ptype.getInputMapFn().initialize();
- ptype.getOutputMapFn().initialize();
- }
-
- @SuppressWarnings({ "unchecked", "rawtypes" })
- protected static void testInputOutputFn(PType ptype, Object java, Object avro) {
- initialize(ptype);
- assertEquals(java, ptype.getInputMapFn().map(avro));
- assertEquals(avro, ptype.getOutputMapFn().map(java));
- }
-
- @Test
- public void testIsPrimitive_PrimitiveMappedType() {
- assertTrue(Avros.isPrimitive(Avros.ints()));
- }
-
- @Test
- public void testIsPrimitive_TruePrimitiveValue() {
- AvroType truePrimitiveAvroType = new AvroType(int.class, Schema.create(Type.INT), new DeepCopier.NoOpDeepCopier());
- assertTrue(Avros.isPrimitive(truePrimitiveAvroType));
- }
-
- @Test
- public void testIsPrimitive_False() {
- assertFalse(Avros.isPrimitive(Avros.reflects(Person.class)));
- }
-
- @Test
- public void testPairs_Generic() {
- Schema schema = ReflectData.get().getSchema(IntWritable.class);
-
- GenericData.Record recordA = new GenericData.Record(schema);
- GenericData.Record recordB = new GenericData.Record(schema);
-
- AvroType<Pair<Record, Record>> pairType = Avros.pairs(Avros.generics(schema), Avros.generics(schema));
- Pair<Record, Record> pair = Pair.of(recordA, recordB);
- pairType.getOutputMapFn().initialize();
- pairType.getInputMapFn().initialize();
- Object mapped = pairType.getOutputMapFn().map(pair);
- Pair<Record, Record> doubleMappedPair = pairType.getInputMapFn().map(mapped);
-
- assertEquals(pair, doubleMappedPair);
- mapped.hashCode();
- }
-
- @Test
- public void testPairs_Reflect() {
- IntWritable intWritableA = new IntWritable(1);
- IntWritable intWritableB = new IntWritable(2);
-
- AvroType<Pair<IntWritable, IntWritable>> pairType = Avros.pairs(Avros.reflects(IntWritable.class),
- Avros.reflects(IntWritable.class));
- Pair<IntWritable, IntWritable> pair = Pair.of(intWritableA, intWritableB);
- pairType.getOutputMapFn().initialize();
- pairType.getInputMapFn().initialize();
- Object mapped = pairType.getOutputMapFn().map(pair);
-
- Pair<IntWritable, IntWritable> doubleMappedPair = pairType.getInputMapFn().map(mapped);
-
- assertEquals(pair, doubleMappedPair);
- }
-
- @Test
- public void testPairs_Specific() {
- Person personA = new Person();
- Person personB = new Person();
-
- personA.age = 1;
- personA.name = "A";
- personA.siblingnames = Collections.<CharSequence> emptyList();
-
- personB.age = 2;
- personB.name = "B";
- personB.siblingnames = Collections.<CharSequence> emptyList();
-
- AvroType<Pair<Person, Person>> pairType = Avros.pairs(Avros.records(Person.class), Avros.records(Person.class));
-
- Pair<Person, Person> pair = Pair.of(personA, personB);
- pairType.getOutputMapFn().initialize();
- pairType.getInputMapFn().initialize();
-
- Object mapped = pairType.getOutputMapFn().map(pair);
- Pair<Person, Person> doubleMappedPair = pairType.getInputMapFn().map(mapped);
-
- assertEquals(pair, doubleMappedPair);
-
- }
-
- @Test
- public void testPairOutputMapFn_VerifyNoObjectReuse() {
- StringWrapper stringWrapper = new StringWrapper("Test");
-
- Pair<Integer, StringWrapper> pair = Pair.of(1, stringWrapper);
-
- AvroType<Pair<Integer, StringWrapper>> pairType = Avros.pairs(Avros.ints(), Avros.reflects(StringWrapper.class));
-
- pairType.getOutputMapFn().initialize();
-
- Object outputMappedValueA = pairType.getOutputMapFn().map(pair);
- Object outputMappedValueB = pairType.getOutputMapFn().map(pair);
-
- assertEquals(outputMappedValueA, outputMappedValueB);
- assertNotSame(outputMappedValueA, outputMappedValueB);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/types/writable/GenericArrayWritableTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/types/writable/GenericArrayWritableTest.java b/crunch/src/test/java/org/apache/crunch/types/writable/GenericArrayWritableTest.java
deleted file mode 100644
index c807a90..0000000
--- a/crunch/src/test/java/org/apache/crunch/types/writable/GenericArrayWritableTest.java
+++ /dev/null
@@ -1,70 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.writable;
-
-import static org.hamcrest.Matchers.hasItems;
-import static org.hamcrest.Matchers.is;
-import static org.hamcrest.Matchers.not;
-import static org.hamcrest.Matchers.sameInstance;
-import static org.junit.Assert.assertThat;
-
-import java.util.Arrays;
-
-import org.apache.crunch.test.Tests;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.junit.Test;
-
-
-public class GenericArrayWritableTest {
-
- @Test
- public void testEmpty() {
- GenericArrayWritable<Text> src = new GenericArrayWritable<Text>(Text.class);
- src.set(new Text[0]);
-
- GenericArrayWritable<Text> dest = Tests.roundtrip(src, new GenericArrayWritable<Text>());
-
- assertThat(dest.get().length, is(0));
- }
-
- @Test
- public void testNonEmpty() {
- GenericArrayWritable<Text> src = new GenericArrayWritable<Text>(Text.class);
- src.set(new Text[] { new Text("foo"), new Text("bar") });
-
- GenericArrayWritable<Text> dest = Tests.roundtrip(src, new GenericArrayWritable<Text>());
-
- assertThat(src.get(), not(sameInstance(dest.get())));
- assertThat(dest.get().length, is(2));
- assertThat(Arrays.asList(dest.get()), hasItems((Writable) new Text("foo"), new Text("bar")));
- }
-
- @Test
- public void testNulls() {
- GenericArrayWritable<Text> src = new GenericArrayWritable<Text>(Text.class);
- src.set(new Text[] { new Text("a"), null, new Text("b") });
-
- GenericArrayWritable<Text> dest = Tests.roundtrip(src, new GenericArrayWritable<Text>());
-
- assertThat(src.get(), not(sameInstance(dest.get())));
- assertThat(dest.get().length, is(3));
- assertThat(Arrays.asList(dest.get()), hasItems((Writable) new Text("a"), new Text("b"), null));
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/types/writable/WritableDeepCopierTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/types/writable/WritableDeepCopierTest.java b/crunch/src/test/java/org/apache/crunch/types/writable/WritableDeepCopierTest.java
deleted file mode 100644
index c49491b..0000000
--- a/crunch/src/test/java/org/apache/crunch/types/writable/WritableDeepCopierTest.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.writable;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotSame;
-import static org.junit.Assert.assertNull;
-
-import org.apache.hadoop.io.Text;
-import org.junit.Before;
-import org.junit.Test;
-
-public class WritableDeepCopierTest {
-
- private WritableDeepCopier<Text> deepCopier;
-
- @Before
- public void setUp() {
- deepCopier = new WritableDeepCopier<Text>(Text.class);
- }
-
- @Test
- public void testDeepCopy() {
- Text text = new Text("value");
- Text deepCopy = deepCopier.deepCopy(text);
-
- assertEquals(text, deepCopy);
- assertNotSame(text, deepCopy);
- }
-
- @Test
- public void testDeepCopy_Null() {
- Text text = null;
- Text deepCopy = deepCopier.deepCopy(text);
-
- assertNull(deepCopy);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/types/writable/WritableGroupedTableTypeTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/types/writable/WritableGroupedTableTypeTest.java b/crunch/src/test/java/org/apache/crunch/types/writable/WritableGroupedTableTypeTest.java
deleted file mode 100644
index f6c201b..0000000
--- a/crunch/src/test/java/org/apache/crunch/types/writable/WritableGroupedTableTypeTest.java
+++ /dev/null
@@ -1,56 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.writable;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotSame;
-import static org.junit.Assert.assertSame;
-
-import java.util.List;
-
-import org.apache.crunch.Pair;
-import org.apache.crunch.types.PGroupedTableType;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-
-public class WritableGroupedTableTypeTest {
-
- @Test
- public void testGetDetachedValue() {
- Integer integerValue = 42;
- Text textValue = new Text("forty-two");
- Iterable<Text> inputTextIterable = Lists.newArrayList(textValue);
- Pair<Integer, Iterable<Text>> pair = Pair.of(integerValue, inputTextIterable);
-
- PGroupedTableType<Integer, Text> groupedTableType = Writables.tableOf(Writables.ints(),
- Writables.writables(Text.class)).getGroupedTableType();
- groupedTableType.initialize(new Configuration());
-
- Pair<Integer, Iterable<Text>> detachedPair = groupedTableType.getDetachedValue(pair);
-
- assertSame(integerValue, detachedPair.first());
- List<Text> textList = Lists.newArrayList(detachedPair.second());
- assertEquals(inputTextIterable, textList);
- assertNotSame(textValue, textList.get(0));
-
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/types/writable/WritableTableTypeTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/types/writable/WritableTableTypeTest.java b/crunch/src/test/java/org/apache/crunch/types/writable/WritableTableTypeTest.java
deleted file mode 100644
index 697a28c..0000000
--- a/crunch/src/test/java/org/apache/crunch/types/writable/WritableTableTypeTest.java
+++ /dev/null
@@ -1,47 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.writable;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotSame;
-import static org.junit.Assert.assertSame;
-
-import org.apache.crunch.Pair;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.junit.Test;
-
-public class WritableTableTypeTest {
-
- @Test
- public void testGetDetachedValue() {
- Integer integerValue = 42;
- Text textValue = new Text("forty-two");
- Pair<Integer, Text> pair = Pair.of(integerValue, textValue);
-
- WritableTableType<Integer, Text> tableType = Writables.tableOf(Writables.ints(),
- Writables.writables(Text.class));
- tableType.initialize(new Configuration());
- Pair<Integer, Text> detachedPair = tableType.getDetachedValue(pair);
-
- assertSame(integerValue, detachedPair.first());
- assertEquals(textValue, detachedPair.second());
- assertNotSame(textValue, detachedPair.second());
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/test/java/org/apache/crunch/types/writable/WritableTypeTest.java
----------------------------------------------------------------------
diff --git a/crunch/src/test/java/org/apache/crunch/types/writable/WritableTypeTest.java b/crunch/src/test/java/org/apache/crunch/types/writable/WritableTypeTest.java
deleted file mode 100644
index 65e946b..0000000
--- a/crunch/src/test/java/org/apache/crunch/types/writable/WritableTypeTest.java
+++ /dev/null
@@ -1,97 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.writable;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotSame;
-
-import java.util.Collection;
-import java.util.Map;
-
-import org.apache.crunch.Pair;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.MapWritable;
-import org.apache.hadoop.io.Text;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-
-public class WritableTypeTest {
-
- @Test(expected = IllegalStateException.class)
- public void testGetDetachedValue_NotInitialized() {
- WritableType<Text, Text> textWritableType = Writables.writables(Text.class);
- Text value = new Text("test");
-
- // Calling getDetachedValue without first calling initialize should throw an
- // exception
- textWritableType.getDetachedValue(value);
- }
-
- @Test
- public void testGetDetachedValue_CustomWritable() {
- WritableType<Text, Text> textWritableType = Writables.writables(Text.class);
- textWritableType.initialize(new Configuration());
- Text value = new Text("test");
-
- Text detachedValue = textWritableType.getDetachedValue(value);
- assertEquals(value, detachedValue);
- assertNotSame(value, detachedValue);
- }
-
- @Test
- public void testGetDetachedValue_Collection() {
- Collection<Text> textCollection = Lists.newArrayList(new Text("value"));
- WritableType<Collection<Text>, GenericArrayWritable<Text>> ptype = Writables
- .collections(Writables.writables(Text.class));
- ptype.initialize(new Configuration());
-
- Collection<Text> detachedCollection = ptype.getDetachedValue(textCollection);
- assertEquals(textCollection, detachedCollection);
- assertNotSame(textCollection.iterator().next(), detachedCollection.iterator().next());
- }
-
- @Test
- public void testGetDetachedValue_Tuple() {
- Pair<Text, Text> textPair = Pair.of(new Text("one"), new Text("two"));
- WritableType<Pair<Text, Text>, TupleWritable> ptype = Writables.pairs(
- Writables.writables(Text.class), Writables.writables(Text.class));
- ptype.initialize(new Configuration());
-
- Pair<Text, Text> detachedPair = ptype.getDetachedValue(textPair);
- assertEquals(textPair, detachedPair);
- assertNotSame(textPair.first(), detachedPair.first());
- assertNotSame(textPair.second(), detachedPair.second());
- }
-
- @Test
- public void testGetDetachedValue_Map() {
- Map<String, Text> stringTextMap = Maps.newHashMap();
- stringTextMap.put("key", new Text("value"));
-
- WritableType<Map<String, Text>, MapWritable> ptype = Writables.maps(Writables
- .writables(Text.class));
- ptype.initialize(new Configuration());
- Map<String, Text> detachedMap = ptype.getDetachedValue(stringTextMap);
-
- assertEquals(stringTextMap, detachedMap);
- assertNotSame(stringTextMap.get("key"), detachedMap.get("key"));
- }
-
-}
[43/43] git commit: CRUNCH-196: crunch -> crunch-core rename to fix
build issues
Posted by jw...@apache.org.
CRUNCH-196: crunch -> crunch-core rename to fix build issues
Project: http://git-wip-us.apache.org/repos/asf/crunch/repo
Commit: http://git-wip-us.apache.org/repos/asf/crunch/commit/890e0086
Tree: http://git-wip-us.apache.org/repos/asf/crunch/tree/890e0086
Diff: http://git-wip-us.apache.org/repos/asf/crunch/diff/890e0086
Branch: refs/heads/master
Commit: 890e0086a12df5006a23cfdd86f3703f929cb147
Parents: cbc7c2f
Author: Josh Wills <jw...@cloudera.com>
Authored: Tue Apr 23 13:37:16 2013 -0700
Committer: Josh Wills <jw...@cloudera.com>
Committed: Tue Apr 23 13:39:00 2013 -0700
----------------------------------------------------------------------
crunch-contrib/pom.xml | 2 +-
crunch-core/pom.xml | 182 +
.../it/java/org/apache/crunch/CancelJobsIT.java | 84 +
.../src/it/java/org/apache/crunch/CleanTextIT.java | 82 +
.../org/apache/crunch/CollectionPObjectIT.java | 98 +
.../it/java/org/apache/crunch/CollectionsIT.java | 117 +
.../org/apache/crunch/CollectionsLengthIT.java | 70 +
.../org/apache/crunch/DeepCopyCustomTuplesIT.java | 79 +
.../src/it/java/org/apache/crunch/EnumPairIT.java | 59 +
.../org/apache/crunch/FirstElementPObjectIT.java | 61 +
.../apache/crunch/IterableReuseProtectionIT.java | 89 +
.../it/java/org/apache/crunch/MRPipelineIT.java | 78 +
.../it/java/org/apache/crunch/MapPObjectIT.java | 101 +
.../src/it/java/org/apache/crunch/MapsIT.java | 101 +
.../it/java/org/apache/crunch/MaterializeIT.java | 139 +
.../java/org/apache/crunch/MaterializeToMapIT.java | 81 +
.../java/org/apache/crunch/MultipleOutputIT.java | 175 +
.../org/apache/crunch/PCollectionGetSizeIT.java | 151 +
.../src/it/java/org/apache/crunch/PObjectsIT.java | 99 +
.../java/org/apache/crunch/PTableKeyValueIT.java | 103 +
.../src/it/java/org/apache/crunch/PageRankIT.java | 168 +
.../org/apache/crunch/StageResultsCountersIT.java | 135 +
.../it/java/org/apache/crunch/TermFrequencyIT.java | 135 +
.../src/it/java/org/apache/crunch/TextPairIT.java | 72 +
.../src/it/java/org/apache/crunch/TfIdfIT.java | 224 +
.../org/apache/crunch/TupleNClassCastBugIT.java | 95 +
.../org/apache/crunch/UnionFromSameSourceIT.java | 132 +
.../src/it/java/org/apache/crunch/UnionIT.java | 136 +
.../it/java/org/apache/crunch/UnionResultsIT.java | 80 +
.../src/it/java/org/apache/crunch/WordCountIT.java | 171 +
.../java/org/apache/crunch/fn/AggregatorsIT.java | 83 +
.../crunch/impl/mem/MemPipelineFileWritingIT.java | 58 +
.../crunch/impl/mr/collect/UnionCollectionIT.java | 154 +
.../apache/crunch/io/CompositePathIterableIT.java | 84 +
.../it/java/org/apache/crunch/io/NLineInputIT.java | 72 +
.../java/org/apache/crunch/io/TextFileTableIT.java | 56 +
.../crunch/io/avro/AvroFileSourceTargetIT.java | 140 +
.../org/apache/crunch/io/avro/AvroPipelineIT.java | 95 +
.../org/apache/crunch/io/avro/AvroReflectIT.java | 109 +
.../org/apache/crunch/io/avro/AvroWritableIT.java | 89 +
.../it/java/org/apache/crunch/lib/AggregateIT.java | 231 +
.../java/org/apache/crunch/lib/AvroTypeSortIT.java | 145 +
.../it/java/org/apache/crunch/lib/CogroupIT.java | 112 +
.../org/apache/crunch/lib/SecondarySortIT.java | 65 +
.../src/it/java/org/apache/crunch/lib/SetIT.java | 114 +
.../java/org/apache/crunch/lib/SortByValueIT.java | 84 +
.../src/it/java/org/apache/crunch/lib/SortIT.java | 327 +
.../apache/crunch/lib/SpecificAvroGroupByIT.java | 119 +
.../apache/crunch/lib/join/FullOuterJoinIT.java | 51 +
.../org/apache/crunch/lib/join/InnerJoinIT.java | 51 +
.../org/apache/crunch/lib/join/JoinTester.java | 108 +
.../apache/crunch/lib/join/LeftOuterJoinIT.java | 51 +
.../org/apache/crunch/lib/join/MapsideJoinIT.java | 158 +
.../crunch/lib/join/MultiAvroSchemaJoinIT.java | 121 +
.../apache/crunch/lib/join/RightOuterJoinIT.java | 51 +
.../org/apache/crunch/test/TemporaryPaths.java | 40 +
.../src/it/java/org/apache/crunch/test/Tests.java | 124 +
crunch-core/src/it/resources/customers.txt | 4 +
crunch-core/src/it/resources/docs.txt | 6 +
crunch-core/src/it/resources/letters.txt | 2 +
crunch-core/src/it/resources/log4j.properties | 29 +
crunch-core/src/it/resources/maugham.txt |29112 +++++++++++++++
crunch-core/src/it/resources/orders.txt | 4 +
.../org/apache/crunch/UnionITData/src1.txt | 5 +
.../org/apache/crunch/UnionITData/src2.txt | 3 +
.../apache/crunch/fn/AggregatorsITData/ints.txt | 5 +
.../org/apache/crunch/lib/CogroupITData/src1.txt | 4 +
.../org/apache/crunch/lib/CogroupITData/src2.txt | 4 +
.../src/it/resources/secondary_sort_input.txt | 7 +
crunch-core/src/it/resources/set1.txt | 4 +
crunch-core/src/it/resources/set2.txt | 3 +
crunch-core/src/it/resources/shakes.txt | 3667 ++
crunch-core/src/it/resources/sort_by_value.txt | 5 +
crunch-core/src/it/resources/urls.txt | 11 +
.../main/java/org/apache/crunch/Aggregator.java | 86 +
.../src/main/java/org/apache/crunch/CombineFn.java | 1211 +
.../org/apache/crunch/CrunchRuntimeException.java | 54 +
.../src/main/java/org/apache/crunch/DoFn.java | 162 +
.../src/main/java/org/apache/crunch/Emitter.java | 37 +
.../src/main/java/org/apache/crunch/FilterFn.java | 244 +
.../java/org/apache/crunch/GroupingOptions.java | 167 +
.../src/main/java/org/apache/crunch/MapFn.java | 41 +
.../main/java/org/apache/crunch/PCollection.java | 245 +
.../main/java/org/apache/crunch/PGroupedTable.java | 53 +
.../src/main/java/org/apache/crunch/PObject.java | 36 +
.../src/main/java/org/apache/crunch/PTable.java | 181 +
.../src/main/java/org/apache/crunch/Pair.java | 105 +
.../java/org/apache/crunch/ParallelDoOptions.java | 62 +
.../src/main/java/org/apache/crunch/Pipeline.java | 138 +
.../java/org/apache/crunch/PipelineExecution.java | 54 +
.../java/org/apache/crunch/PipelineResult.java | 76 +
.../src/main/java/org/apache/crunch/Source.java | 52 +
.../main/java/org/apache/crunch/SourceTarget.java | 26 +
.../main/java/org/apache/crunch/TableSource.java | 28 +
.../java/org/apache/crunch/TableSourceTarget.java | 25 +
.../src/main/java/org/apache/crunch/Target.java | 83 +
.../src/main/java/org/apache/crunch/Tuple.java | 36 +
.../src/main/java/org/apache/crunch/Tuple3.java | 96 +
.../src/main/java/org/apache/crunch/Tuple4.java | 105 +
.../src/main/java/org/apache/crunch/TupleN.java | 73 +
.../java/org/apache/crunch/fn/Aggregators.java | 1111 +
.../java/org/apache/crunch/fn/CompositeMapFn.java | 71 +
.../java/org/apache/crunch/fn/ExtractKeyFn.java | 50 +
.../main/java/org/apache/crunch/fn/FilterFns.java | 112 +
.../main/java/org/apache/crunch/fn/IdentityFn.java | 39 +
.../main/java/org/apache/crunch/fn/MapKeysFn.java | 32 +
.../java/org/apache/crunch/fn/MapValuesFn.java | 32 +
.../main/java/org/apache/crunch/fn/PairMapFn.java | 65 +
.../java/org/apache/crunch/fn/package-info.java | 22 +
.../mapreduce/TaskAttemptContextFactory.java | 70 +
.../lib/jobcontrol/CrunchControlledJob.java | 325 +
.../mapreduce/lib/jobcontrol/CrunchJobControl.java | 211 +
.../org/apache/crunch/impl/SingleUseIterable.java | 49 +
.../org/apache/crunch/impl/mem/MemPipeline.java | 275 +
.../crunch/impl/mem/collect/MemCollection.java | 295 +
.../crunch/impl/mem/collect/MemGroupedTable.java | 113 +
.../apache/crunch/impl/mem/collect/MemTable.java | 177 +
.../apache/crunch/impl/mem/collect/Shuffler.java | 149 +
.../crunch/impl/mem/emit/InMemoryEmitter.java | 57 +
.../org/apache/crunch/impl/mem/package-info.java | 22 +
.../java/org/apache/crunch/impl/mr/MRPipeline.java | 396 +
.../crunch/impl/mr/collect/DoCollectionImpl.java | 74 +
.../apache/crunch/impl/mr/collect/DoTableImpl.java | 84 +
.../crunch/impl/mr/collect/InputCollection.java | 85 +
.../apache/crunch/impl/mr/collect/InputTable.java | 86 +
.../crunch/impl/mr/collect/PCollectionImpl.java | 295 +
.../crunch/impl/mr/collect/PGroupedTableImpl.java | 144 +
.../apache/crunch/impl/mr/collect/PTableBase.java | 169 +
.../crunch/impl/mr/collect/UnionCollection.java | 80 +
.../apache/crunch/impl/mr/collect/UnionTable.java | 92 +
.../crunch/impl/mr/emit/IntermediateEmitter.java | 64 +
.../crunch/impl/mr/emit/MultipleOutputEmitter.java | 56 +
.../apache/crunch/impl/mr/emit/OutputEmitter.java | 52 +
.../impl/mr/exec/CappedExponentialCounter.java | 40 +
.../apache/crunch/impl/mr/exec/CrunchJobHooks.java | 153 +
.../org/apache/crunch/impl/mr/exec/MRExecutor.java | 198 +
.../org/apache/crunch/impl/mr/package-info.java | 22 +
.../org/apache/crunch/impl/mr/plan/DoNode.java | 163 +
.../apache/crunch/impl/mr/plan/DotfileWriter.java | 238 +
.../java/org/apache/crunch/impl/mr/plan/Edge.java | 125 +
.../java/org/apache/crunch/impl/mr/plan/Graph.java | 133 +
.../apache/crunch/impl/mr/plan/GraphBuilder.java | 92 +
.../apache/crunch/impl/mr/plan/JobNameBuilder.java | 79 +
.../apache/crunch/impl/mr/plan/JobPrototype.java | 245 +
.../crunch/impl/mr/plan/MSCROutputHandler.java | 77 +
.../apache/crunch/impl/mr/plan/MSCRPlanner.java | 378 +
.../org/apache/crunch/impl/mr/plan/NodePath.java | 124 +
.../crunch/impl/mr/plan/PlanningParameters.java | 38 +
.../org/apache/crunch/impl/mr/plan/Vertex.java | 126 +
.../apache/crunch/impl/mr/run/CrunchCombiner.java | 27 +
.../crunch/impl/mr/run/CrunchInputFormat.java | 78 +
.../crunch/impl/mr/run/CrunchInputSplit.java | 116 +
.../apache/crunch/impl/mr/run/CrunchMapper.java | 73 +
.../crunch/impl/mr/run/CrunchRecordReader.java | 75 +
.../apache/crunch/impl/mr/run/CrunchReducer.java | 73 +
.../crunch/impl/mr/run/CrunchTaskContext.java | 86 +
.../org/apache/crunch/impl/mr/run/NodeContext.java | 35 +
.../java/org/apache/crunch/impl/mr/run/RTNode.java | 124 +
.../crunch/impl/mr/run/RuntimeParameters.java | 38 +
.../src/main/java/org/apache/crunch/io/At.java | 257 +
.../apache/crunch/io/CompositePathIterable.java | 102 +
.../java/org/apache/crunch/io/CrunchInputs.java | 71 +
.../java/org/apache/crunch/io/CrunchOutputs.java | 184 +
.../org/apache/crunch/io/FileNamingScheme.java | 58 +
.../org/apache/crunch/io/FileReaderFactory.java | 27 +
.../java/org/apache/crunch/io/FormatBundle.java | 121 +
.../src/main/java/org/apache/crunch/io/From.java | 324 +
.../java/org/apache/crunch/io/MapReduceTarget.java | 27 +
.../java/org/apache/crunch/io/OutputHandler.java | 25 +
.../main/java/org/apache/crunch/io/PathTarget.java | 36 +
.../java/org/apache/crunch/io/PathTargetImpl.java | 64 +
.../java/org/apache/crunch/io/ReadableSource.java | 41 +
.../org/apache/crunch/io/ReadableSourceTarget.java | 30 +
.../crunch/io/SequentialFileNamingScheme.java | 51 +
.../org/apache/crunch/io/SourceTargetHelper.java | 48 +
.../src/main/java/org/apache/crunch/io/To.java | 153 +
.../crunch/io/avro/AvroFileReaderFactory.java | 96 +
.../org/apache/crunch/io/avro/AvroFileSource.java | 58 +
.../crunch/io/avro/AvroFileSourceTarget.java | 39 +
.../org/apache/crunch/io/avro/AvroFileTarget.java | 91 +
.../apache/crunch/io/impl/AutoClosingIterator.java | 62 +
.../org/apache/crunch/io/impl/FileSourceImpl.java | 104 +
.../apache/crunch/io/impl/FileTableSourceImpl.java | 41 +
.../org/apache/crunch/io/impl/FileTargetImpl.java | 162 +
.../io/impl/ReadableSourcePathTargetImpl.java | 39 +
.../crunch/io/impl/ReadableSourceTargetImpl.java | 37 +
.../crunch/io/impl/SourcePathTargetImpl.java | 50 +
.../apache/crunch/io/impl/SourceTargetImpl.java | 89 +
.../crunch/io/impl/TableSourcePathTargetImpl.java | 41 +
.../crunch/io/impl/TableSourceTargetImpl.java | 35 +
.../java/org/apache/crunch/io/package-info.java | 22 +
.../org/apache/crunch/io/seq/SeqFileHelper.java | 35 +
.../apache/crunch/io/seq/SeqFileReaderFactory.java | 112 +
.../org/apache/crunch/io/seq/SeqFileSource.java | 47 +
.../apache/crunch/io/seq/SeqFileSourceTarget.java | 44 +
.../apache/crunch/io/seq/SeqFileTableSource.java | 57 +
.../crunch/io/seq/SeqFileTableSourceTarget.java | 54 +
.../org/apache/crunch/io/seq/SeqFileTarget.java | 55 +
.../crunch/io/text/BZip2TextInputFormat.java | 235 +
.../apache/crunch/io/text/CBZip2InputStream.java | 980 +
.../java/org/apache/crunch/io/text/LineParser.java | 125 +
.../org/apache/crunch/io/text/NLineFileSource.java | 77 +
.../crunch/io/text/TextFileReaderFactory.java | 83 +
.../org/apache/crunch/io/text/TextFileSource.java | 73 +
.../crunch/io/text/TextFileSourceTarget.java | 44 +
.../apache/crunch/io/text/TextFileTableSource.java | 81 +
.../crunch/io/text/TextFileTableSourceTarget.java | 63 +
.../org/apache/crunch/io/text/TextFileTarget.java | 109 +
.../main/java/org/apache/crunch/lib/Aggregate.java | 272 +
.../main/java/org/apache/crunch/lib/Cartesian.java | 216 +
.../main/java/org/apache/crunch/lib/Cogroup.java | 106 +
.../main/java/org/apache/crunch/lib/Distinct.java | 126 +
.../src/main/java/org/apache/crunch/lib/Join.java | 181 +
.../main/java/org/apache/crunch/lib/PTables.java | 117 +
.../main/java/org/apache/crunch/lib/Sample.java | 217 +
.../java/org/apache/crunch/lib/SampleUtils.java | 168 +
.../java/org/apache/crunch/lib/SecondarySort.java | 118 +
.../src/main/java/org/apache/crunch/lib/Set.java | 118 +
.../src/main/java/org/apache/crunch/lib/Sort.java | 294 +
.../apache/crunch/lib/join/FullOuterJoinFn.java | 102 +
.../org/apache/crunch/lib/join/InnerJoinFn.java | 78 +
.../java/org/apache/crunch/lib/join/JoinFn.java | 81 +
.../java/org/apache/crunch/lib/join/JoinUtils.java | 126 +
.../apache/crunch/lib/join/LeftOuterJoinFn.java | 98 +
.../org/apache/crunch/lib/join/MapsideJoin.java | 164 +
.../apache/crunch/lib/join/RightOuterJoinFn.java | 83 +
.../org/apache/crunch/lib/join/package-info.java | 22 +
.../java/org/apache/crunch/lib/package-info.java | 22 +
.../org/apache/crunch/lib/sort/Comparators.java | 187 +
.../java/org/apache/crunch/lib/sort/SortFns.java | 210 +
.../crunch/lib/sort/TotalOrderPartitioner.java | 145 +
.../crunch/materialize/MaterializableIterable.java | 81 +
.../crunch/materialize/MaterializableMap.java | 50 +
.../materialize/pobject/CollectionPObject.java | 55 +
.../materialize/pobject/FirstElementPObject.java | 50 +
.../crunch/materialize/pobject/MapPObject.java | 62 +
.../crunch/materialize/pobject/PObjectImpl.java | 85 +
.../main/java/org/apache/crunch/package-info.java | 25 +
.../apache/crunch/types/CollectionDeepCopier.java | 57 +
.../java/org/apache/crunch/types/Converter.java | 41 +
.../java/org/apache/crunch/types/DeepCopier.java | 60 +
.../org/apache/crunch/types/MapDeepCopier.java | 54 +
.../org/apache/crunch/types/PGroupedTableType.java | 141 +
.../java/org/apache/crunch/types/PTableType.java | 44 +
.../main/java/org/apache/crunch/types/PType.java | 86 +
.../java/org/apache/crunch/types/PTypeFamily.java | 77 +
.../java/org/apache/crunch/types/PTypeUtils.java | 66 +
.../main/java/org/apache/crunch/types/PTypes.java | 252 +
.../main/java/org/apache/crunch/types/Protos.java | 173 +
.../org/apache/crunch/types/TupleDeepCopier.java | 65 +
.../java/org/apache/crunch/types/TupleFactory.java | 134 +
.../apache/crunch/types/avro/AvroCapabilities.java | 106 +
.../apache/crunch/types/avro/AvroDeepCopier.java | 209 +
.../crunch/types/avro/AvroGroupedTableType.java | 114 +
.../apache/crunch/types/avro/AvroInputFormat.java | 41 +
.../apache/crunch/types/avro/AvroKeyConverter.java | 65 +
.../apache/crunch/types/avro/AvroOutputFormat.java | 87 +
.../crunch/types/avro/AvroPairConverter.java | 108 +
.../apache/crunch/types/avro/AvroRecordReader.java | 114 +
.../apache/crunch/types/avro/AvroTableType.java | 151 +
.../crunch/types/avro/AvroTextOutputFormat.java | 60 +
.../org/apache/crunch/types/avro/AvroType.java | 199 +
.../apache/crunch/types/avro/AvroTypeFamily.java | 164 +
.../crunch/types/avro/AvroUtf8InputFormat.java | 98 +
.../java/org/apache/crunch/types/avro/Avros.java | 709 +
.../crunch/types/avro/ReflectDataFactory.java | 41 +
.../crunch/types/avro/SafeAvroSerialization.java | 145 +
.../org/apache/crunch/types/avro/package-info.java | 22 +
.../java/org/apache/crunch/types/package-info.java | 22 +
.../types/writable/GenericArrayWritable.java | 135 +
.../crunch/types/writable/TextMapWritable.java | 88 +
.../crunch/types/writable/TupleWritable.java | 224 +
.../crunch/types/writable/WritableDeepCopier.java | 70 +
.../types/writable/WritableGroupedTableType.java | 85 +
.../types/writable/WritablePairConverter.java | 62 +
.../crunch/types/writable/WritableTableType.java | 130 +
.../apache/crunch/types/writable/WritableType.java | 133 +
.../crunch/types/writable/WritableTypeFamily.java | 147 +
.../types/writable/WritableValueConverter.java | 60 +
.../apache/crunch/types/writable/Writables.java | 588 +
.../apache/crunch/types/writable/package-info.java | 22 +
.../java/org/apache/crunch/util/CrunchTool.java | 118 +
.../java/org/apache/crunch/util/DistCache.java | 231 +
.../org/apache/crunch/util/PartitionUtils.java | 34 +
.../main/java/org/apache/crunch/util/Tuples.java | 150 +
.../java/org/apache/crunch/util/package-info.java | 22 +
crunch-core/src/main/resources/log4j.properties | 24 +
crunch-core/src/site/site.xml | 34 +
crunch-core/src/test/avro/employee.avsc | 26 +
crunch-core/src/test/avro/person.avsc | 26 +
.../src/test/java/org/apache/crunch/AndFnTest.java | 77 +
.../test/java/org/apache/crunch/CombineFnTest.java | 222 +
.../src/test/java/org/apache/crunch/NotFnTest.java | 72 +
.../src/test/java/org/apache/crunch/OrFnTest.java | 78 +
.../src/test/java/org/apache/crunch/PairTest.java | 66 +
.../src/test/java/org/apache/crunch/TupleTest.java | 139 +
.../test/java/org/apache/crunch/WriteModeTest.java | 103 +
.../java/org/apache/crunch/fn/AggregatorsTest.java | 239 +
.../org/apache/crunch/fn/ExtractKeyFnTest.java | 44 +
.../java/org/apache/crunch/fn/FilterFnTest.java | 85 +
.../java/org/apache/crunch/fn/MapKeysTest.java | 51 +
.../java/org/apache/crunch/fn/MapValuesTest.java | 50 +
.../java/org/apache/crunch/fn/PairMapTest.java | 52 +
.../org/apache/crunch/fn/StoreLastEmitter.java | 41 +
.../apache/crunch/impl/SingleUseIterableTest.java | 54 +
.../org/apache/crunch/impl/mr/MRPipelineTest.java | 86 +
.../impl/mr/collect/DoCollectionImplTest.java | 112 +
.../crunch/impl/mr/collect/DoTableImplTest.java | 86 +
.../impl/mr/emit/IntermediateEmitterTest.java | 83 +
.../impl/mr/exec/CappedExponentialCounterTest.java | 42 +
.../crunch/impl/mr/exec/CrunchJobHooksTest.java | 42 +
.../crunch/impl/mr/plan/DotfileWriterTest.java | 132 +
.../crunch/impl/mr/plan/JobNameBuilderTest.java | 41 +
.../crunch/io/SequentialFileNamingSchemeTest.java | 84 +
.../apache/crunch/io/SourceTargetHelperTest.java | 59 +
.../crunch/io/avro/AvroFileReaderFactoryTest.java | 184 +
.../apache/crunch/io/avro/AvroFileSourceTest.java | 91 +
.../lib/AvroIndexedRecordPartitionerTest.java | 98 +
.../java/org/apache/crunch/lib/CartesianTest.java | 77 +
.../java/org/apache/crunch/lib/DistinctTest.java | 52 +
.../java/org/apache/crunch/lib/SampleTest.java | 71 +
.../org/apache/crunch/lib/SecondarySortTest.java | 53 +
.../crunch/lib/TupleWritablePartitionerTest.java | 68 +
.../lib/join/BrokenLeftAndOuterJoinTest.java | 90 +
.../crunch/lib/join/FullOuterJoinFnTest.java | 48 +
.../apache/crunch/lib/join/InnerJoinFnTest.java | 42 +
.../org/apache/crunch/lib/join/JoinFnTestBase.java | 82 +
.../apache/crunch/lib/join/LeftOuterJoinTest.java | 46 +
.../crunch/lib/join/RightOuterJoinFnTest.java | 46 +
.../java/org/apache/crunch/test/CountersTest.java | 70 +
.../java/org/apache/crunch/test/StringWrapper.java | 102 +
.../crunch/types/CollectionDeepCopierTest.java | 61 +
.../org/apache/crunch/types/MapDeepCopierTest.java | 63 +
.../org/apache/crunch/types/PTypeUtilsTest.java | 89 +
.../java/org/apache/crunch/types/PTypesTest.java | 34 +
.../apache/crunch/types/TupleDeepCopierTest.java | 77 +
.../org/apache/crunch/types/TupleFactoryTest.java | 69 +
.../crunch/types/avro/AvroDeepCopierTest.java | 107 +
.../types/avro/AvroGroupedTableTypeTest.java | 60 +
.../crunch/types/avro/AvroTableTypeTest.java | 72 +
.../org/apache/crunch/types/avro/AvroTypeTest.java | 279 +
.../org/apache/crunch/types/avro/AvrosTest.java | 325 +
.../types/writable/GenericArrayWritableTest.java | 70 +
.../types/writable/WritableDeepCopierTest.java | 54 +
.../writable/WritableGroupedTableTypeTest.java | 56 +
.../types/writable/WritableTableTypeTest.java | 47 +
.../crunch/types/writable/WritableTypeTest.java | 97 +
.../crunch/types/writable/WritablesTest.java | 256 +
.../java/org/apache/crunch/util/DistCacheTest.java | 156 +
crunch-dist/pom.xml | 2 +-
crunch-examples/pom.xml | 2 +-
crunch-hbase/pom.xml | 2 +-
crunch-scrunch/pom.xml | 2 +-
crunch/pom.xml | 182 -
.../it/java/org/apache/crunch/CancelJobsIT.java | 84 -
.../src/it/java/org/apache/crunch/CleanTextIT.java | 82 -
.../org/apache/crunch/CollectionPObjectIT.java | 98 -
.../it/java/org/apache/crunch/CollectionsIT.java | 117 -
.../org/apache/crunch/CollectionsLengthIT.java | 70 -
.../org/apache/crunch/DeepCopyCustomTuplesIT.java | 79 -
.../src/it/java/org/apache/crunch/EnumPairIT.java | 59 -
.../org/apache/crunch/FirstElementPObjectIT.java | 61 -
.../apache/crunch/IterableReuseProtectionIT.java | 89 -
.../it/java/org/apache/crunch/MRPipelineIT.java | 78 -
.../it/java/org/apache/crunch/MapPObjectIT.java | 101 -
crunch/src/it/java/org/apache/crunch/MapsIT.java | 101 -
.../it/java/org/apache/crunch/MaterializeIT.java | 139 -
.../java/org/apache/crunch/MaterializeToMapIT.java | 81 -
.../java/org/apache/crunch/MultipleOutputIT.java | 175 -
.../org/apache/crunch/PCollectionGetSizeIT.java | 151 -
.../src/it/java/org/apache/crunch/PObjectsIT.java | 99 -
.../java/org/apache/crunch/PTableKeyValueIT.java | 103 -
.../src/it/java/org/apache/crunch/PageRankIT.java | 168 -
.../org/apache/crunch/StageResultsCountersIT.java | 135 -
.../it/java/org/apache/crunch/TermFrequencyIT.java | 135 -
.../src/it/java/org/apache/crunch/TextPairIT.java | 72 -
crunch/src/it/java/org/apache/crunch/TfIdfIT.java | 224 -
.../org/apache/crunch/TupleNClassCastBugIT.java | 95 -
.../org/apache/crunch/UnionFromSameSourceIT.java | 132 -
crunch/src/it/java/org/apache/crunch/UnionIT.java | 136 -
.../it/java/org/apache/crunch/UnionResultsIT.java | 80 -
.../src/it/java/org/apache/crunch/WordCountIT.java | 171 -
.../java/org/apache/crunch/fn/AggregatorsIT.java | 83 -
.../crunch/impl/mem/MemPipelineFileWritingIT.java | 58 -
.../crunch/impl/mr/collect/UnionCollectionIT.java | 154 -
.../apache/crunch/io/CompositePathIterableIT.java | 84 -
.../it/java/org/apache/crunch/io/NLineInputIT.java | 72 -
.../java/org/apache/crunch/io/TextFileTableIT.java | 56 -
.../crunch/io/avro/AvroFileSourceTargetIT.java | 140 -
.../org/apache/crunch/io/avro/AvroPipelineIT.java | 95 -
.../org/apache/crunch/io/avro/AvroReflectIT.java | 109 -
.../org/apache/crunch/io/avro/AvroWritableIT.java | 89 -
.../it/java/org/apache/crunch/lib/AggregateIT.java | 231 -
.../java/org/apache/crunch/lib/AvroTypeSortIT.java | 145 -
.../it/java/org/apache/crunch/lib/CogroupIT.java | 112 -
.../org/apache/crunch/lib/SecondarySortIT.java | 65 -
.../src/it/java/org/apache/crunch/lib/SetIT.java | 114 -
.../java/org/apache/crunch/lib/SortByValueIT.java | 84 -
.../src/it/java/org/apache/crunch/lib/SortIT.java | 327 -
.../apache/crunch/lib/SpecificAvroGroupByIT.java | 119 -
.../apache/crunch/lib/join/FullOuterJoinIT.java | 51 -
.../org/apache/crunch/lib/join/InnerJoinIT.java | 51 -
.../org/apache/crunch/lib/join/JoinTester.java | 108 -
.../apache/crunch/lib/join/LeftOuterJoinIT.java | 51 -
.../org/apache/crunch/lib/join/MapsideJoinIT.java | 158 -
.../crunch/lib/join/MultiAvroSchemaJoinIT.java | 121 -
.../apache/crunch/lib/join/RightOuterJoinIT.java | 51 -
.../org/apache/crunch/test/TemporaryPaths.java | 40 -
.../src/it/java/org/apache/crunch/test/Tests.java | 124 -
crunch/src/it/resources/customers.txt | 4 -
crunch/src/it/resources/docs.txt | 6 -
crunch/src/it/resources/letters.txt | 2 -
crunch/src/it/resources/log4j.properties | 29 -
crunch/src/it/resources/maugham.txt |29112 ---------------
crunch/src/it/resources/orders.txt | 4 -
.../org/apache/crunch/UnionITData/src1.txt | 5 -
.../org/apache/crunch/UnionITData/src2.txt | 3 -
.../apache/crunch/fn/AggregatorsITData/ints.txt | 5 -
.../org/apache/crunch/lib/CogroupITData/src1.txt | 4 -
.../org/apache/crunch/lib/CogroupITData/src2.txt | 4 -
crunch/src/it/resources/secondary_sort_input.txt | 7 -
crunch/src/it/resources/set1.txt | 4 -
crunch/src/it/resources/set2.txt | 3 -
crunch/src/it/resources/shakes.txt | 3667 --
crunch/src/it/resources/sort_by_value.txt | 5 -
crunch/src/it/resources/urls.txt | 11 -
.../main/java/org/apache/crunch/Aggregator.java | 86 -
.../src/main/java/org/apache/crunch/CombineFn.java | 1211 -
.../org/apache/crunch/CrunchRuntimeException.java | 54 -
crunch/src/main/java/org/apache/crunch/DoFn.java | 162 -
.../src/main/java/org/apache/crunch/Emitter.java | 37 -
.../src/main/java/org/apache/crunch/FilterFn.java | 244 -
.../java/org/apache/crunch/GroupingOptions.java | 167 -
crunch/src/main/java/org/apache/crunch/MapFn.java | 41 -
.../main/java/org/apache/crunch/PCollection.java | 245 -
.../main/java/org/apache/crunch/PGroupedTable.java | 53 -
.../src/main/java/org/apache/crunch/PObject.java | 36 -
crunch/src/main/java/org/apache/crunch/PTable.java | 181 -
crunch/src/main/java/org/apache/crunch/Pair.java | 105 -
.../java/org/apache/crunch/ParallelDoOptions.java | 62 -
.../src/main/java/org/apache/crunch/Pipeline.java | 138 -
.../java/org/apache/crunch/PipelineExecution.java | 54 -
.../java/org/apache/crunch/PipelineResult.java | 76 -
crunch/src/main/java/org/apache/crunch/Source.java | 52 -
.../main/java/org/apache/crunch/SourceTarget.java | 26 -
.../main/java/org/apache/crunch/TableSource.java | 28 -
.../java/org/apache/crunch/TableSourceTarget.java | 25 -
crunch/src/main/java/org/apache/crunch/Target.java | 83 -
crunch/src/main/java/org/apache/crunch/Tuple.java | 36 -
crunch/src/main/java/org/apache/crunch/Tuple3.java | 96 -
crunch/src/main/java/org/apache/crunch/Tuple4.java | 105 -
crunch/src/main/java/org/apache/crunch/TupleN.java | 73 -
.../java/org/apache/crunch/fn/Aggregators.java | 1111 -
.../java/org/apache/crunch/fn/CompositeMapFn.java | 71 -
.../java/org/apache/crunch/fn/ExtractKeyFn.java | 50 -
.../main/java/org/apache/crunch/fn/FilterFns.java | 112 -
.../main/java/org/apache/crunch/fn/IdentityFn.java | 39 -
.../main/java/org/apache/crunch/fn/MapKeysFn.java | 32 -
.../java/org/apache/crunch/fn/MapValuesFn.java | 32 -
.../main/java/org/apache/crunch/fn/PairMapFn.java | 65 -
.../java/org/apache/crunch/fn/package-info.java | 22 -
.../mapreduce/TaskAttemptContextFactory.java | 70 -
.../lib/jobcontrol/CrunchControlledJob.java | 325 -
.../mapreduce/lib/jobcontrol/CrunchJobControl.java | 211 -
.../org/apache/crunch/impl/SingleUseIterable.java | 49 -
.../org/apache/crunch/impl/mem/MemPipeline.java | 275 -
.../crunch/impl/mem/collect/MemCollection.java | 295 -
.../crunch/impl/mem/collect/MemGroupedTable.java | 113 -
.../apache/crunch/impl/mem/collect/MemTable.java | 177 -
.../apache/crunch/impl/mem/collect/Shuffler.java | 149 -
.../crunch/impl/mem/emit/InMemoryEmitter.java | 57 -
.../org/apache/crunch/impl/mem/package-info.java | 22 -
.../java/org/apache/crunch/impl/mr/MRPipeline.java | 396 -
.../crunch/impl/mr/collect/DoCollectionImpl.java | 74 -
.../apache/crunch/impl/mr/collect/DoTableImpl.java | 84 -
.../crunch/impl/mr/collect/InputCollection.java | 85 -
.../apache/crunch/impl/mr/collect/InputTable.java | 86 -
.../crunch/impl/mr/collect/PCollectionImpl.java | 295 -
.../crunch/impl/mr/collect/PGroupedTableImpl.java | 144 -
.../apache/crunch/impl/mr/collect/PTableBase.java | 169 -
.../crunch/impl/mr/collect/UnionCollection.java | 80 -
.../apache/crunch/impl/mr/collect/UnionTable.java | 92 -
.../crunch/impl/mr/emit/IntermediateEmitter.java | 64 -
.../crunch/impl/mr/emit/MultipleOutputEmitter.java | 56 -
.../apache/crunch/impl/mr/emit/OutputEmitter.java | 52 -
.../impl/mr/exec/CappedExponentialCounter.java | 40 -
.../apache/crunch/impl/mr/exec/CrunchJobHooks.java | 153 -
.../org/apache/crunch/impl/mr/exec/MRExecutor.java | 198 -
.../org/apache/crunch/impl/mr/package-info.java | 22 -
.../org/apache/crunch/impl/mr/plan/DoNode.java | 163 -
.../apache/crunch/impl/mr/plan/DotfileWriter.java | 238 -
.../java/org/apache/crunch/impl/mr/plan/Edge.java | 125 -
.../java/org/apache/crunch/impl/mr/plan/Graph.java | 133 -
.../apache/crunch/impl/mr/plan/GraphBuilder.java | 92 -
.../apache/crunch/impl/mr/plan/JobNameBuilder.java | 79 -
.../apache/crunch/impl/mr/plan/JobPrototype.java | 245 -
.../crunch/impl/mr/plan/MSCROutputHandler.java | 77 -
.../apache/crunch/impl/mr/plan/MSCRPlanner.java | 378 -
.../org/apache/crunch/impl/mr/plan/NodePath.java | 124 -
.../crunch/impl/mr/plan/PlanningParameters.java | 38 -
.../org/apache/crunch/impl/mr/plan/Vertex.java | 126 -
.../apache/crunch/impl/mr/run/CrunchCombiner.java | 27 -
.../crunch/impl/mr/run/CrunchInputFormat.java | 78 -
.../crunch/impl/mr/run/CrunchInputSplit.java | 116 -
.../apache/crunch/impl/mr/run/CrunchMapper.java | 73 -
.../crunch/impl/mr/run/CrunchRecordReader.java | 75 -
.../apache/crunch/impl/mr/run/CrunchReducer.java | 73 -
.../crunch/impl/mr/run/CrunchTaskContext.java | 86 -
.../org/apache/crunch/impl/mr/run/NodeContext.java | 35 -
.../java/org/apache/crunch/impl/mr/run/RTNode.java | 124 -
.../crunch/impl/mr/run/RuntimeParameters.java | 38 -
crunch/src/main/java/org/apache/crunch/io/At.java | 257 -
.../apache/crunch/io/CompositePathIterable.java | 102 -
.../java/org/apache/crunch/io/CrunchInputs.java | 71 -
.../java/org/apache/crunch/io/CrunchOutputs.java | 184 -
.../org/apache/crunch/io/FileNamingScheme.java | 58 -
.../org/apache/crunch/io/FileReaderFactory.java | 27 -
.../java/org/apache/crunch/io/FormatBundle.java | 121 -
.../src/main/java/org/apache/crunch/io/From.java | 324 -
.../java/org/apache/crunch/io/MapReduceTarget.java | 27 -
.../java/org/apache/crunch/io/OutputHandler.java | 25 -
.../main/java/org/apache/crunch/io/PathTarget.java | 36 -
.../java/org/apache/crunch/io/PathTargetImpl.java | 64 -
.../java/org/apache/crunch/io/ReadableSource.java | 41 -
.../org/apache/crunch/io/ReadableSourceTarget.java | 30 -
.../crunch/io/SequentialFileNamingScheme.java | 51 -
.../org/apache/crunch/io/SourceTargetHelper.java | 48 -
crunch/src/main/java/org/apache/crunch/io/To.java | 153 -
.../crunch/io/avro/AvroFileReaderFactory.java | 96 -
.../org/apache/crunch/io/avro/AvroFileSource.java | 58 -
.../crunch/io/avro/AvroFileSourceTarget.java | 39 -
.../org/apache/crunch/io/avro/AvroFileTarget.java | 91 -
.../apache/crunch/io/impl/AutoClosingIterator.java | 62 -
.../org/apache/crunch/io/impl/FileSourceImpl.java | 104 -
.../apache/crunch/io/impl/FileTableSourceImpl.java | 41 -
.../org/apache/crunch/io/impl/FileTargetImpl.java | 162 -
.../io/impl/ReadableSourcePathTargetImpl.java | 39 -
.../crunch/io/impl/ReadableSourceTargetImpl.java | 37 -
.../crunch/io/impl/SourcePathTargetImpl.java | 50 -
.../apache/crunch/io/impl/SourceTargetImpl.java | 89 -
.../crunch/io/impl/TableSourcePathTargetImpl.java | 41 -
.../crunch/io/impl/TableSourceTargetImpl.java | 35 -
.../java/org/apache/crunch/io/package-info.java | 22 -
.../org/apache/crunch/io/seq/SeqFileHelper.java | 35 -
.../apache/crunch/io/seq/SeqFileReaderFactory.java | 112 -
.../org/apache/crunch/io/seq/SeqFileSource.java | 47 -
.../apache/crunch/io/seq/SeqFileSourceTarget.java | 44 -
.../apache/crunch/io/seq/SeqFileTableSource.java | 57 -
.../crunch/io/seq/SeqFileTableSourceTarget.java | 54 -
.../org/apache/crunch/io/seq/SeqFileTarget.java | 55 -
.../crunch/io/text/BZip2TextInputFormat.java | 235 -
.../apache/crunch/io/text/CBZip2InputStream.java | 980 -
.../java/org/apache/crunch/io/text/LineParser.java | 125 -
.../org/apache/crunch/io/text/NLineFileSource.java | 77 -
.../crunch/io/text/TextFileReaderFactory.java | 83 -
.../org/apache/crunch/io/text/TextFileSource.java | 73 -
.../crunch/io/text/TextFileSourceTarget.java | 44 -
.../apache/crunch/io/text/TextFileTableSource.java | 81 -
.../crunch/io/text/TextFileTableSourceTarget.java | 63 -
.../org/apache/crunch/io/text/TextFileTarget.java | 109 -
.../main/java/org/apache/crunch/lib/Aggregate.java | 272 -
.../main/java/org/apache/crunch/lib/Cartesian.java | 216 -
.../main/java/org/apache/crunch/lib/Cogroup.java | 106 -
.../main/java/org/apache/crunch/lib/Distinct.java | 126 -
.../src/main/java/org/apache/crunch/lib/Join.java | 181 -
.../main/java/org/apache/crunch/lib/PTables.java | 117 -
.../main/java/org/apache/crunch/lib/Sample.java | 217 -
.../java/org/apache/crunch/lib/SampleUtils.java | 168 -
.../java/org/apache/crunch/lib/SecondarySort.java | 118 -
.../src/main/java/org/apache/crunch/lib/Set.java | 118 -
.../src/main/java/org/apache/crunch/lib/Sort.java | 294 -
.../apache/crunch/lib/join/FullOuterJoinFn.java | 102 -
.../org/apache/crunch/lib/join/InnerJoinFn.java | 78 -
.../java/org/apache/crunch/lib/join/JoinFn.java | 81 -
.../java/org/apache/crunch/lib/join/JoinUtils.java | 126 -
.../apache/crunch/lib/join/LeftOuterJoinFn.java | 98 -
.../org/apache/crunch/lib/join/MapsideJoin.java | 164 -
.../apache/crunch/lib/join/RightOuterJoinFn.java | 83 -
.../org/apache/crunch/lib/join/package-info.java | 22 -
.../java/org/apache/crunch/lib/package-info.java | 22 -
.../org/apache/crunch/lib/sort/Comparators.java | 187 -
.../java/org/apache/crunch/lib/sort/SortFns.java | 210 -
.../crunch/lib/sort/TotalOrderPartitioner.java | 145 -
.../crunch/materialize/MaterializableIterable.java | 81 -
.../crunch/materialize/MaterializableMap.java | 50 -
.../materialize/pobject/CollectionPObject.java | 55 -
.../materialize/pobject/FirstElementPObject.java | 50 -
.../crunch/materialize/pobject/MapPObject.java | 62 -
.../crunch/materialize/pobject/PObjectImpl.java | 85 -
.../main/java/org/apache/crunch/package-info.java | 25 -
.../apache/crunch/types/CollectionDeepCopier.java | 57 -
.../java/org/apache/crunch/types/Converter.java | 41 -
.../java/org/apache/crunch/types/DeepCopier.java | 60 -
.../org/apache/crunch/types/MapDeepCopier.java | 54 -
.../org/apache/crunch/types/PGroupedTableType.java | 141 -
.../java/org/apache/crunch/types/PTableType.java | 44 -
.../main/java/org/apache/crunch/types/PType.java | 86 -
.../java/org/apache/crunch/types/PTypeFamily.java | 77 -
.../java/org/apache/crunch/types/PTypeUtils.java | 66 -
.../main/java/org/apache/crunch/types/PTypes.java | 252 -
.../main/java/org/apache/crunch/types/Protos.java | 173 -
.../org/apache/crunch/types/TupleDeepCopier.java | 65 -
.../java/org/apache/crunch/types/TupleFactory.java | 134 -
.../apache/crunch/types/avro/AvroCapabilities.java | 106 -
.../apache/crunch/types/avro/AvroDeepCopier.java | 209 -
.../crunch/types/avro/AvroGroupedTableType.java | 114 -
.../apache/crunch/types/avro/AvroInputFormat.java | 41 -
.../apache/crunch/types/avro/AvroKeyConverter.java | 65 -
.../apache/crunch/types/avro/AvroOutputFormat.java | 87 -
.../crunch/types/avro/AvroPairConverter.java | 108 -
.../apache/crunch/types/avro/AvroRecordReader.java | 114 -
.../apache/crunch/types/avro/AvroTableType.java | 151 -
.../crunch/types/avro/AvroTextOutputFormat.java | 60 -
.../org/apache/crunch/types/avro/AvroType.java | 199 -
.../apache/crunch/types/avro/AvroTypeFamily.java | 164 -
.../crunch/types/avro/AvroUtf8InputFormat.java | 98 -
.../java/org/apache/crunch/types/avro/Avros.java | 709 -
.../crunch/types/avro/ReflectDataFactory.java | 41 -
.../crunch/types/avro/SafeAvroSerialization.java | 145 -
.../org/apache/crunch/types/avro/package-info.java | 22 -
.../java/org/apache/crunch/types/package-info.java | 22 -
.../types/writable/GenericArrayWritable.java | 135 -
.../crunch/types/writable/TextMapWritable.java | 88 -
.../crunch/types/writable/TupleWritable.java | 224 -
.../crunch/types/writable/WritableDeepCopier.java | 70 -
.../types/writable/WritableGroupedTableType.java | 85 -
.../types/writable/WritablePairConverter.java | 62 -
.../crunch/types/writable/WritableTableType.java | 130 -
.../apache/crunch/types/writable/WritableType.java | 133 -
.../crunch/types/writable/WritableTypeFamily.java | 147 -
.../types/writable/WritableValueConverter.java | 60 -
.../apache/crunch/types/writable/Writables.java | 588 -
.../apache/crunch/types/writable/package-info.java | 22 -
.../java/org/apache/crunch/util/CrunchTool.java | 118 -
.../java/org/apache/crunch/util/DistCache.java | 231 -
.../org/apache/crunch/util/PartitionUtils.java | 34 -
.../main/java/org/apache/crunch/util/Tuples.java | 150 -
.../java/org/apache/crunch/util/package-info.java | 22 -
crunch/src/main/resources/log4j.properties | 24 -
crunch/src/site/site.xml | 34 -
crunch/src/test/avro/employee.avsc | 26 -
crunch/src/test/avro/person.avsc | 26 -
.../src/test/java/org/apache/crunch/AndFnTest.java | 77 -
.../test/java/org/apache/crunch/CombineFnTest.java | 222 -
.../src/test/java/org/apache/crunch/NotFnTest.java | 72 -
.../src/test/java/org/apache/crunch/OrFnTest.java | 78 -
.../src/test/java/org/apache/crunch/PairTest.java | 66 -
.../src/test/java/org/apache/crunch/TupleTest.java | 139 -
.../test/java/org/apache/crunch/WriteModeTest.java | 103 -
.../java/org/apache/crunch/fn/AggregatorsTest.java | 239 -
.../org/apache/crunch/fn/ExtractKeyFnTest.java | 44 -
.../java/org/apache/crunch/fn/FilterFnTest.java | 85 -
.../java/org/apache/crunch/fn/MapKeysTest.java | 51 -
.../java/org/apache/crunch/fn/MapValuesTest.java | 50 -
.../java/org/apache/crunch/fn/PairMapTest.java | 52 -
.../org/apache/crunch/fn/StoreLastEmitter.java | 41 -
.../apache/crunch/impl/SingleUseIterableTest.java | 54 -
.../org/apache/crunch/impl/mr/MRPipelineTest.java | 86 -
.../impl/mr/collect/DoCollectionImplTest.java | 112 -
.../crunch/impl/mr/collect/DoTableImplTest.java | 86 -
.../impl/mr/emit/IntermediateEmitterTest.java | 83 -
.../impl/mr/exec/CappedExponentialCounterTest.java | 42 -
.../crunch/impl/mr/exec/CrunchJobHooksTest.java | 42 -
.../crunch/impl/mr/plan/DotfileWriterTest.java | 132 -
.../crunch/impl/mr/plan/JobNameBuilderTest.java | 41 -
.../crunch/io/SequentialFileNamingSchemeTest.java | 84 -
.../apache/crunch/io/SourceTargetHelperTest.java | 59 -
.../crunch/io/avro/AvroFileReaderFactoryTest.java | 184 -
.../apache/crunch/io/avro/AvroFileSourceTest.java | 91 -
.../lib/AvroIndexedRecordPartitionerTest.java | 98 -
.../java/org/apache/crunch/lib/CartesianTest.java | 77 -
.../java/org/apache/crunch/lib/DistinctTest.java | 52 -
.../java/org/apache/crunch/lib/SampleTest.java | 71 -
.../org/apache/crunch/lib/SecondarySortTest.java | 53 -
.../crunch/lib/TupleWritablePartitionerTest.java | 68 -
.../lib/join/BrokenLeftAndOuterJoinTest.java | 90 -
.../crunch/lib/join/FullOuterJoinFnTest.java | 48 -
.../apache/crunch/lib/join/InnerJoinFnTest.java | 42 -
.../org/apache/crunch/lib/join/JoinFnTestBase.java | 82 -
.../apache/crunch/lib/join/LeftOuterJoinTest.java | 46 -
.../crunch/lib/join/RightOuterJoinFnTest.java | 46 -
.../java/org/apache/crunch/test/CountersTest.java | 70 -
.../java/org/apache/crunch/test/StringWrapper.java | 102 -
.../crunch/types/CollectionDeepCopierTest.java | 61 -
.../org/apache/crunch/types/MapDeepCopierTest.java | 63 -
.../org/apache/crunch/types/PTypeUtilsTest.java | 89 -
.../java/org/apache/crunch/types/PTypesTest.java | 34 -
.../apache/crunch/types/TupleDeepCopierTest.java | 77 -
.../org/apache/crunch/types/TupleFactoryTest.java | 69 -
.../crunch/types/avro/AvroDeepCopierTest.java | 107 -
.../types/avro/AvroGroupedTableTypeTest.java | 60 -
.../crunch/types/avro/AvroTableTypeTest.java | 72 -
.../org/apache/crunch/types/avro/AvroTypeTest.java | 279 -
.../org/apache/crunch/types/avro/AvrosTest.java | 325 -
.../types/writable/GenericArrayWritableTest.java | 70 -
.../types/writable/WritableDeepCopierTest.java | 54 -
.../writable/WritableGroupedTableTypeTest.java | 56 -
.../types/writable/WritableTableTypeTest.java | 47 -
.../crunch/types/writable/WritableTypeTest.java | 97 -
.../crunch/types/writable/WritablesTest.java | 256 -
.../java/org/apache/crunch/util/DistCacheTest.java | 156 -
pom.xml | 4 +-
702 files changed, 70421 insertions(+), 70421 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-contrib/pom.xml
----------------------------------------------------------------------
diff --git a/crunch-contrib/pom.xml b/crunch-contrib/pom.xml
index 12f9a13..e5a35c5 100644
--- a/crunch-contrib/pom.xml
+++ b/crunch-contrib/pom.xml
@@ -32,7 +32,7 @@ under the License.
<dependency>
<groupId>org.apache.crunch</groupId>
- <artifactId>crunch</artifactId>
+ <artifactId>crunch-core</artifactId>
</dependency>
<dependency>
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/pom.xml
----------------------------------------------------------------------
diff --git a/crunch-core/pom.xml b/crunch-core/pom.xml
new file mode 100644
index 0000000..d365c3d
--- /dev/null
+++ b/crunch-core/pom.xml
@@ -0,0 +1,182 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>org.apache.crunch</groupId>
+ <artifactId>crunch-parent</artifactId>
+ <version>0.6.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>crunch-core</artifactId>
+ <name>Apache Crunch Core</name>
+
+ <dependencies>
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.avro</groupId>
+ <artifactId>avro</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.avro</groupId>
+ <artifactId>avro-mapred</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>org.javassist</groupId>
+ <artifactId>javassist</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-client</artifactId>
+ <scope>provided</scope>
+ </dependency>
+
+ <!-- Override the slf4j dependency from Avro, which is incompatible with
+ Hadoop's. -->
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ <scope>provided</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ <scope>provided</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.codehaus.jackson</groupId>
+ <artifactId>jackson-core-asl</artifactId>
+ <scope>provided</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.codehaus.jackson</groupId>
+ <artifactId>jackson-mapper-asl</artifactId>
+ <scope>provided</scope>
+ </dependency>
+
+ <!-- Both Protobufs and Thrift are supported as
+ derived serialization types, and you can use
+ (almost) any version of them you like; Crunch
+ only relies on the stable public APIs, not the
+ structure of the files themselves.
+
+ Both dependencies are scoped as provided, in
+ order to not expand the size of the assembly jars
+ unnecessarily.
+ -->
+
+ <dependency>
+ <groupId>com.google.protobuf</groupId>
+ <artifactId>protobuf-java</artifactId>
+ <scope>provided</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.thrift</groupId>
+ <artifactId>libthrift</artifactId>
+ <scope>provided</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>commons-logging</groupId>
+ <artifactId>commons-logging</artifactId>
+ <scope>provided</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ <scope>provided</scope>
+ </dependency>
+
+ <!-- Used by LocalJobRunner in integration tests -->
+ <dependency>
+ <groupId>commons-httpclient</groupId>
+ <artifactId>commons-httpclient</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.crunch</groupId>
+ <artifactId>crunch-test</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.mockito</groupId>
+ <artifactId>mockito-all</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.hamcrest</groupId>
+ <artifactId>hamcrest-all</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>build-helper-maven-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.avro</groupId>
+ <artifactId>avro-maven-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>schemas</id>
+ <phase>generate-sources</phase>
+ <goals>
+ <goal>schema</goal>
+ </goals>
+ <configuration>
+ <testSourceDirectory>${project.basedir}/src/test/avro/</testSourceDirectory>
+ <testOutputDirectory>target/generated-test-sources/</testOutputDirectory>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+
+</project>
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/CancelJobsIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/CancelJobsIT.java b/crunch-core/src/it/java/org/apache/crunch/CancelJobsIT.java
new file mode 100644
index 0000000..ff01a2f
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/CancelJobsIT.java
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.io.To;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.junit.Rule;
+import org.junit.Test;
+
+/**
+ * Integration tests that start a count/top-20 pipeline over shakes.txt with {@code runAsync()} and check the status reported after waiting and after {@code kill()}.
+ */
+public class CancelJobsIT {
+
+ @Rule
+ public TemporaryPath tmpDir = TemporaryPaths.create();
+
+ @Test
+ public void testRun() throws Exception {
+ PipelineExecution pe = run();
+ pe.waitUntilDone();
+ PipelineResult pr = pe.getResult();
+ assertEquals(PipelineExecution.Status.SUCCEEDED, pe.getStatus());
+ assertEquals(2, pr.getStageResults().size());
+ }
+
+ @Test
+ public void testKill() throws Exception {
+ PipelineExecution pe = run();
+ pe.kill(); // kill right after the asynchronous start, before waiting
+ pe.waitUntilDone();
+ assertEquals(PipelineExecution.Status.KILLED, pe.getStatus());
+ }
+
+ @Test
+ public void testKillMultipleTimes() throws Exception {
+ PipelineExecution pe = run();
+ for (int i = 0; i < 10; i++) { // repeated kill() calls; the status below is still expected to be KILLED
+ pe.kill();
+ }
+ pe.waitUntilDone();
+ assertEquals(PipelineExecution.Status.KILLED, pe.getStatus());
+ }
+
+ @Test
+ public void testKillAfterDone() throws Exception {
+ PipelineExecution pe = run();
+ pe.waitUntilDone();
+ assertEquals(PipelineExecution.Status.SUCCEEDED, pe.getStatus());
+ pe.kill(); // expect no-op: the status asserted below stays SUCCEEDED
+ assertEquals(PipelineExecution.Status.SUCCEEDED, pe.getStatus());
+ }
+
+ public PipelineExecution run() throws IOException {
+ String shakes = tmpDir.copyResourceFileName("shakes.txt");
+ String out = tmpDir.getFileName("cancel");
+ Pipeline p = new MRPipeline(CancelJobsIT.class, tmpDir.getDefaultConfiguration());
+ PCollection<String> words = p.readTextFile(shakes);
+ p.write(words.count().top(20), To.textFile(out));
+ return p.runAsync(); // NOTE: a hack to slow down job start-up may be needed if this test becomes flaky.
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/CleanTextIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/CleanTextIT.java b/crunch-core/src/it/java/org/apache/crunch/CleanTextIT.java
new file mode 100644
index 0000000..2f4004e
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/CleanTextIT.java
@@ -0,0 +1,82 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.nio.charset.Charset;
+import java.util.List;
+
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.io.To;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.avro.Avros;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.io.Files;
+
+/**
+ * Integration test that lower-cases shakes.txt, writes the cleaned lines and a per-token count as two text outputs, and checks the cleaned output's line count.
+ */
+public class CleanTextIT {
+
+ private static final int LINES_IN_SHAKES = 3667;
+
+ @Rule
+ public TemporaryPath tmpDir = TemporaryPaths.create();
+
+ static DoFn<String, String> CLEANER = new DoFn<String, String>() {
+ @Override
+ public void process(String input, Emitter<String> emitter) {
+ emitter.emit(input.toLowerCase());
+ }
+ };
+
+ static DoFn<String, String> SPLIT = new DoFn<String, String>() {
+ @Override
+ public void process(String input, Emitter<String> emitter) {
+ for (String word : input.split("\\s+")) { // split on whitespace runs so the tokens are the words ("\\S+" yielded the gaps)
+ if (!word.isEmpty()) {
+ emitter.emit(word);
+ }
+ }
+ }
+ };
+
+ @Test
+ public void testMapSideOutputs() throws Exception {
+ Pipeline pipeline = new MRPipeline(CleanTextIT.class, tmpDir.getDefaultConfiguration());
+ String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
+ PCollection<String> shakespeare = pipeline.readTextFile(shakesInputPath);
+
+ PCollection<String> cleanShakes = shakespeare.parallelDo(CLEANER, Avros.strings());
+ File cso = tmpDir.getFile("cleanShakes");
+ cleanShakes.write(To.textFile(cso.getAbsolutePath()));
+
+ File wc = tmpDir.getFile("wordCounts");
+ cleanShakes.parallelDo(SPLIT, Avros.strings()).count().write(To.textFile(wc.getAbsolutePath()));
+ pipeline.done();
+
+ File cleanFile = new File(cso, "part-m-00000");
+ List<String> lines = Files.readLines(cleanFile, Charset.defaultCharset());
+ assertEquals(LINES_IN_SHAKES, lines.size());
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/CollectionPObjectIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/CollectionPObjectIT.java b/crunch-core/src/it/java/org/apache/crunch/CollectionPObjectIT.java
new file mode 100644
index 0000000..7e0c75c
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/CollectionPObjectIT.java
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.lang.String;
+import java.util.Collection;
+
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PObject;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.impl.mem.MemPipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.materialize.pobject.CollectionPObject;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.junit.Rule;
+import org.junit.Test;
+
+/**
+ * Integration test that reads shakes.txt into a {@link PCollection} and checks the
+ * lines obtained through {@link CollectionPObject} and through
+ * {@code PCollection.asCollection()}, on both the MapReduce and in-memory pipelines.
+ */
+@SuppressWarnings("serial")
+public class CollectionPObjectIT {
+
+  /** Number of lines the test expects in shakes.txt. */
+  private static final int LINES_IN_SHAKES = 3667;
+
+  private static final String FIRST_SHAKESPEARE_LINE =
+      "***The Project Gutenberg's Etext of Shakespeare's First Folio***";
+
+  private static final String LAST_SHAKESPEARE_LINE =
+      "FINIS. THE TRAGEDIE OF MACBETH.";
+
+  @Rule
+  public TemporaryPath tmpDir = TemporaryPaths.create();
+
+  @Test
+  public void testPObjectMRPipeline() throws IOException {
+    runPObject(new MRPipeline(CollectionPObjectIT.class, tmpDir.getDefaultConfiguration()));
+  }
+
+  @Test
+  public void testAsCollectionMRPipeline() throws IOException {
+    runAsCollection(new MRPipeline(CollectionPObjectIT.class, tmpDir.getDefaultConfiguration()));
+  }
+
+  @Test
+  public void testPObjectMemPipeline() throws IOException {
+    runPObject(MemPipeline.getInstance());
+  }
+
+  @Test
+  public void testAsCollectionMemPipeline() throws IOException {
+    runAsCollection(MemPipeline.getInstance());
+  }
+
+  /** Copies shakes.txt into the temporary directory and reads it as a text file. */
+  private PCollection<String> getPCollection(Pipeline pipeline) throws IOException {
+    String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
+    return pipeline.readTextFile(shakesInputPath);
+  }
+
+  /** Asserts the number of lines and the content of the first and last line. */
+  private void verifyLines(String[] lines) {
+    assertEquals("Not enough lines in Shakespeare.", LINES_IN_SHAKES, lines.length);
+    assertEquals("First line in Shakespeare is wrong.", FIRST_SHAKESPEARE_LINE, lines[0]);
+    assertEquals("Last line in Shakespeare is wrong.", LAST_SHAKESPEARE_LINE,
+        lines[lines.length - 1]);
+  }
+
+  /** Materializes the lines through an explicitly constructed {@link CollectionPObject}. */
+  public void runPObject(Pipeline pipeline) throws IOException {
+    PCollection<String> shakespeare = getPCollection(pipeline);
+    PObject<Collection<String>> linesP = new CollectionPObject<String>(shakespeare);
+    // Let toArray size the result. A pre-sized String[LINES_IN_SHAKES] stays
+    // null-padded when the collection is shorter, so the length check in
+    // verifyLines could never report missing lines.
+    String[] lines = linesP.getValue().toArray(new String[0]);
+    verifyLines(lines);
+  }
+
+  /** Materializes the lines through {@code PCollection.asCollection()}. */
+  public void runAsCollection(Pipeline pipeline) throws IOException {
+    PCollection<String> shakespeare = getPCollection(pipeline);
+    // Zero-length seed array for the same reason as in runPObject: the returned
+    // array length must reflect the real number of materialized lines.
+    String[] lines = shakespeare.asCollection().getValue().toArray(new String[0]);
+    verifyLines(lines);
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/CollectionsIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/CollectionsIT.java b/crunch-core/src/it/java/org/apache/crunch/CollectionsIT.java
new file mode 100644
index 0000000..17d0cae
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/CollectionsIT.java
@@ -0,0 +1,117 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.util.Collection;
+
+import org.apache.crunch.fn.Aggregators.SimpleAggregator;
+import org.apache.crunch.impl.mem.MemPipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.crunch.types.avro.AvroTypeFamily;
+import org.apache.crunch.types.writable.WritableTypeFamily;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+
+/**
+ * Integration test for collection-valued tables: every word of shakes.txt is mapped
+ * to the list of its characters, the lists are combined per word, and the
+ * materialized result is searched for a key starting with "yellow".
+ */
+@SuppressWarnings("serial")
+public class CollectionsIT {
+
+  @Rule
+  public TemporaryPath tmpDir = TemporaryPaths.create();
+
+  /** Aggregator that appends every collection it receives to a single list. */
+  private static class AggregateStringListFn extends SimpleAggregator<Collection<String>> {
+    private final Collection<String> merged = Lists.newArrayList();
+
+    @Override
+    public void reset() {
+      merged.clear();
+    }
+
+    @Override
+    public void update(Collection<String> values) {
+      merged.addAll(values);
+    }
+
+    @Override
+    public Iterable<Collection<String>> results() {
+      return ImmutableList.of(merged);
+    }
+  }
+
+  /** Emits, for each whitespace-separated word of a line, the word and its characters. */
+  private static class WordToCharactersFn extends DoFn<String, Pair<String, Collection<String>>> {
+    @Override
+    public void process(String line, Emitter<Pair<String, Collection<String>>> emitter) {
+      for (String word : line.split("\\s+")) {
+        Collection<String> characters = Lists.newArrayList();
+        for (int i = 0; i < word.length(); i++) {
+          characters.add(String.valueOf(word.charAt(i)));
+        }
+        emitter.emit(Pair.of(word, characters));
+      }
+    }
+  }
+
+  /** Builds the word-to-characters table, grouped by word and combined per key. */
+  private static PTable<String, Collection<String>> listOfCharacters(PCollection<String> lines,
+      PTypeFamily typeFamily) {
+    return lines
+        .parallelDo(new WordToCharactersFn(),
+            typeFamily.tableOf(typeFamily.strings(), typeFamily.collections(typeFamily.strings())))
+        .groupByKey()
+        .combineValues(new AggregateStringListFn());
+  }
+
+  @Test
+  public void testWritables() throws IOException {
+    run(new MRPipeline(CollectionsIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance());
+  }
+
+  @Test
+  public void testAvro() throws IOException {
+    run(new MRPipeline(CollectionsIT.class, tmpDir.getDefaultConfiguration()), AvroTypeFamily.getInstance());
+  }
+
+  @Test
+  public void testInMemoryWritables() throws IOException {
+    run(MemPipeline.getInstance(), WritableTypeFamily.getInstance());
+  }
+
+  @Test
+  public void testInMemoryAvro() throws IOException {
+    run(MemPipeline.getInstance(), AvroTypeFamily.getInstance());
+  }
+
+  /** Runs the word-to-characters job and asserts that some key starts with "yellow". */
+  public void run(Pipeline pipeline, PTypeFamily typeFamily) throws IOException {
+    PCollection<String> shakespeare = pipeline.readTextFile(tmpDir.copyResourceFileName("shakes.txt"));
+    Iterable<Pair<String, Collection<String>>> wordsToCharacters =
+        listOfCharacters(shakespeare, typeFamily).materialize();
+
+    boolean foundYellow = false;
+    for (Pair<String, Collection<String>> entry : wordsToCharacters) {
+      foundYellow = entry.first().startsWith("yellow");
+      if (foundYellow) {
+        break;
+      }
+    }
+    pipeline.done();
+    assertTrue(foundYellow);
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/CollectionsLengthIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/CollectionsLengthIT.java b/crunch-core/src/it/java/org/apache/crunch/CollectionsLengthIT.java
new file mode 100644
index 0000000..3a38b92
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/CollectionsLengthIT.java
@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.lang.Long;
+
+import org.apache.crunch.impl.mem.MemPipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.crunch.types.avro.AvroTypeFamily;
+import org.apache.crunch.types.writable.WritableTypeFamily;
+import org.junit.Rule;
+import org.junit.Test;
+
+/**
+ * Integration test for {@code PCollection.length()}: reads shakes.txt and checks the
+ * reported element count on both the MapReduce and in-memory pipelines.
+ */
+@SuppressWarnings("serial")
+public class CollectionsLengthIT {
+
+  /** Number of lines the test expects in shakes.txt. */
+  public static final Long LINES_IN_SHAKESPEARE = 3667L;
+
+  @Rule
+  public TemporaryPath tmpDir = TemporaryPaths.create();
+
+  @Test
+  public void testWritables() throws IOException {
+    // Pass this test's own class to MRPipeline (previously a copy-pasted CollectionsIT.class).
+    run(new MRPipeline(CollectionsLengthIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance());
+  }
+
+  @Test
+  public void testAvro() throws IOException {
+    run(new MRPipeline(CollectionsLengthIT.class, tmpDir.getDefaultConfiguration()), AvroTypeFamily.getInstance());
+  }
+
+  @Test
+  public void testInMemoryWritables() throws IOException {
+    run(MemPipeline.getInstance(), WritableTypeFamily.getInstance());
+  }
+
+  @Test
+  public void testInMemoryAvro() throws IOException {
+    run(MemPipeline.getInstance(), AvroTypeFamily.getInstance());
+  }
+
+  /**
+   * Reads shakes.txt through the given pipeline and asserts the collection length.
+   *
+   * NOTE(review): typeFamily is accepted but never used here, so the Writable and
+   * Avro variants of each test exercise the same code path.
+   */
+  public void run(Pipeline pipeline, PTypeFamily typeFamily) throws IOException {
+    String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
+
+    PCollection<String> shakespeare = pipeline.readTextFile(shakesInputPath);
+    Long length = shakespeare.length().getValue();
+    assertEquals("Incorrect length for shakespeare PCollection.", LINES_IN_SHAKESPEARE, length);
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/DeepCopyCustomTuplesIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/DeepCopyCustomTuplesIT.java b/crunch-core/src/it/java/org/apache/crunch/DeepCopyCustomTuplesIT.java
new file mode 100644
index 0000000..f1323ca
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/DeepCopyCustomTuplesIT.java
@@ -0,0 +1,79 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.apache.crunch.types.avro.Avros.*;
+import static org.junit.Assert.assertEquals;
+
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.PType;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.Iterables;
+
+/**
+ *
+ */
+public class DeepCopyCustomTuplesIT {
+ @Rule
+ public TemporaryPath tmpDir = TemporaryPaths.create();
+
+ public static class PID extends Pair<Integer, String> {
+ public PID(Integer first, String second) {
+ super(first, second);
+ }
+ }
+
+ private static PType<PID> pids = tuples(PID.class, ints(), strings());
+
+ @Test
+ public void testDeepCopyCustomTuple() throws Exception {
+ Pipeline p = new MRPipeline(DeepCopyCustomTuplesIT.class, tmpDir.getDefaultConfiguration());
+ String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
+ PCollection<String> shakes = p.readTextFile(shakesInputPath);
+ Iterable<String> out = shakes
+ .parallelDo(new PreProcFn(), tableOf(ints(), pairs(ints(), pids)))
+ .groupByKey()
+ .parallelDo(new PostProcFn(), strings())
+ .materialize();
+ assertEquals(65, Iterables.size(out));
+ p.done();
+ }
+
+ private static class PreProcFn extends MapFn<String, Pair<Integer, Pair<Integer, PID>>> {
+ private int counter = 0;
+ @Override
+ public Pair<Integer, Pair<Integer, PID>> map(String input) {
+ return Pair.of(counter++, Pair.of(counter++, new PID(input.length(), input)));
+ }
+ };
+
+ private static class PostProcFn extends DoFn<Pair<Integer, Iterable<Pair<Integer, PID>>>, String> {
+ @Override
+ public void process(Pair<Integer, Iterable<Pair<Integer, PID>>> input, Emitter<String> emitter) {
+ for (Pair<Integer, PID> p : input.second()) {
+ if (p.second().first() > 0 && p.second().first() < 10) {
+ emitter.emit(p.second().second());
+ }
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/EnumPairIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/EnumPairIT.java b/crunch-core/src/it/java/org/apache/crunch/EnumPairIT.java
new file mode 100644
index 0000000..1d0974e
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/EnumPairIT.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.io.Serializable;
+
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.PTypes;
+import org.apache.crunch.types.writable.Writables;
+import org.junit.Rule;
+import org.junit.Test;
+
+/**
+ * Integration test that uses an enum as the value type of a {@link PTable} and
+ * checks the enum values read back from the materialized table.
+ */
+public class EnumPairIT implements Serializable {
+  // transient: the test instance is Serializable, the rule is not meant to travel with it.
+  @Rule
+  public transient TemporaryPath tmpDir = TemporaryPaths.create();
+
+  static enum etypes {
+    type1,
+  }
+
+  @Test
+  public void testEnumPTypes() throws IOException {
+    String inputFile1 = tmpDir.copyResourceFileName("set1.txt");
+    // Use the temporary-path configuration like the other MR integration tests,
+    // instead of the configuration-less MRPipeline constructor.
+    Pipeline pipeline = new MRPipeline(EnumPairIT.class, tmpDir.getDefaultConfiguration());
+    PCollection<String> set1 = pipeline.readTextFile(inputFile1);
+    // Pair every input line with the single enum constant.
+    PTable<String, etypes> data = set1.parallelDo(new DoFn<String, Pair<String, etypes>>() {
+      @Override
+      public void process(String input, Emitter<Pair<String, etypes>> emitter) {
+        emitter.emit(new Pair<String, etypes>(input, etypes.type1));
+      }
+    }, Writables.tableOf(Writables.strings(), PTypes.enums(etypes.class, set1.getTypeFamily())));
+
+    Iterable<Pair<String, etypes>> materialized = data.materialize();
+    pipeline.run();
+    for (Pair<String, etypes> pair : materialized) {
+      assertEquals(etypes.type1, pair.second());
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/FirstElementPObjectIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/FirstElementPObjectIT.java b/crunch-core/src/it/java/org/apache/crunch/FirstElementPObjectIT.java
new file mode 100644
index 0000000..d985e10
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/FirstElementPObjectIT.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.lang.String;
+
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PObject;
+import org.apache.crunch.impl.mem.MemPipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.materialize.pobject.FirstElementPObject;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.junit.Rule;
+import org.junit.Test;
+
+/**
+ * Integration test for {@link FirstElementPObject}: the first element of the
+ * shakes.txt collection must be the expected first line, on both pipeline types.
+ */
+@SuppressWarnings("serial")
+public class FirstElementPObjectIT {
+
+  private static final String FIRST_SHAKESPEARE_LINE =
+      "***The Project Gutenberg's Etext of Shakespeare's First Folio***";
+
+  @Rule
+  public TemporaryPath tmpDir = TemporaryPaths.create();
+
+  @Test
+  public void testMRPipeline() throws IOException {
+    Pipeline mrPipeline = new MRPipeline(FirstElementPObjectIT.class, tmpDir.getDefaultConfiguration());
+    run(mrPipeline);
+  }
+
+  @Test
+  public void testInMemoryPipeline() throws IOException {
+    Pipeline memPipeline = MemPipeline.getInstance();
+    run(memPipeline);
+  }
+
+  /** Reads shakes.txt through the pipeline and compares its first element. */
+  public void run(Pipeline pipeline) throws IOException {
+    PCollection<String> shakespeare = pipeline.readTextFile(tmpDir.copyResourceFileName("shakes.txt"));
+    String actualFirstLine = new FirstElementPObject<String>(shakespeare).getValue();
+    assertEquals("First line in Shakespeare is wrong.", FIRST_SHAKESPEARE_LINE, actualFirstLine);
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/IterableReuseProtectionIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/IterableReuseProtectionIT.java b/crunch-core/src/it/java/org/apache/crunch/IterableReuseProtectionIT.java
new file mode 100644
index 0000000..da487eb
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/IterableReuseProtectionIT.java
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.crunch.fn.IdentityFn;
+import org.apache.crunch.impl.mem.MemPipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.writable.Writables;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Verify that calling the iterator method on a Reducer-based Iterable
+ * is forcefully disallowed.
+ */
+public class IterableReuseProtectionIT {
+
+ @Rule
+ public TemporaryPath tmpDir = TemporaryPaths.create();
+
+
+ public void checkIteratorReuse(Pipeline pipeline) throws IOException {
+ Iterable<String> values = pipeline.readTextFile(tmpDir.copyResourceFileName("set1.txt"))
+ .by(IdentityFn.<String>getInstance(), Writables.strings())
+ .groupByKey()
+ .combineValues(new TestIterableReuseFn())
+ .values().materialize();
+
+ List<String> valueList = Lists.newArrayList(values);
+ Collections.sort(valueList);
+ assertEquals(Lists.newArrayList("a", "b", "c", "e"), valueList);
+ }
+
+ @Test
+ public void testIteratorReuse_MRPipeline() throws IOException {
+ checkIteratorReuse(new MRPipeline(IterableReuseProtectionIT.class, tmpDir.getDefaultConfiguration()));
+ }
+
+ @Test
+ public void testIteratorReuse_InMemoryPipeline() throws IOException {
+ checkIteratorReuse(MemPipeline.getInstance());
+ }
+
+ static class TestIterableReuseFn extends CombineFn<String, String> {
+
+ @Override
+ public void process(Pair<String, Iterable<String>> input, Emitter<Pair<String, String>> emitter) {
+ StringBuilder combinedBuilder = new StringBuilder();
+ for (String v : input.second()) {
+ combinedBuilder.append(v);
+ }
+
+ try {
+ input.second().iterator();
+ throw new RuntimeException("Second call to iterator should throw an exception");
+ } catch (IllegalStateException e) {
+ // Expected situation
+ }
+ emitter.emit(Pair.of(input.first(), combinedBuilder.toString()));
+ }
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/MRPipelineIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/MRPipelineIT.java b/crunch-core/src/it/java/org/apache/crunch/MRPipelineIT.java
new file mode 100644
index 0000000..7670e88
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/MRPipelineIT.java
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.Serializable;
+
+import org.apache.crunch.fn.FilterFns;
+import org.apache.crunch.fn.IdentityFn;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.io.To;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.writable.Writables;
+import org.junit.Rule;
+import org.junit.Test;
+
+/**
+ * Integration tests for MRPipeline: repeated runs around materialize/write calls,
+ * and writing two outputs derived from a single grouped table.
+ */
+public class MRPipelineIT implements Serializable {
+  @Rule
+  public transient TemporaryPath tmpDir = TemporaryPaths.create();
+
+  /**
+   * Runs the pipeline three times: after reading, after materializing a filtered
+   * collection, and after writing and materializing it again. The test carries
+   * no assertions; it passes when none of the steps throws.
+   */
+  @Test
+  public void materializedColShouldBeWritten() throws Exception {
+    File shakes = tmpDir.copyResourceFile("shakes.txt");
+    Pipeline pipeline = new MRPipeline(MRPipelineIT.class, tmpDir.getDefaultConfiguration());
+    PCollection<String> lines = pipeline.readTextFile(shakes.getAbsolutePath());
+    pipeline.run();
+
+    PCollection<String> filtered = lines.filter("Filtering data", FilterFns.<String>ACCEPT_ALL());
+    filtered.materialize();
+    pipeline.run();
+
+    Target textTarget = To.textFile(tmpDir.getFile("output.txt").getAbsolutePath());
+    PCollection<String> written = filtered.write(textTarget);
+    written.materialize();
+    pipeline.run();
+  }
+
+  /** Ungroups one grouped table twice and writes each result to its own directory. */
+  @Test
+  public void testPGroupedTableToMultipleOutputs() throws IOException {
+    Pipeline pipeline = new MRPipeline(MRPipelineIT.class, tmpDir.getDefaultConfiguration());
+    PCollection<String> lines = pipeline.readTextFile(tmpDir.copyResourceFileName("set1.txt"));
+    PGroupedTable<String, String> grouped =
+        lines.by(IdentityFn.<String>getInstance(), Writables.strings()).groupByKey();
+
+    PTable<String, String> firstUngrouped = grouped.ungroup();
+    PTable<String, String> secondUngrouped = grouped.ungroup();
+
+    File firstOutputDir = tmpDir.getFile("output_a");
+    File secondOutputDir = tmpDir.getFile("output_b");
+
+    pipeline.writeTextFile(firstUngrouped, firstOutputDir.getAbsolutePath());
+    pipeline.writeTextFile(secondUngrouped, secondOutputDir.getAbsolutePath());
+    pipeline.done();
+
+    // Verify that output from a single PGroupedTable can be sent to multiple collections
+    File firstPart = new File(firstOutputDir, "part-r-00000");
+    File secondPart = new File(secondOutputDir, "part-r-00000");
+    assertTrue(firstPart.exists());
+    assertTrue(secondPart.exists());
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/MapPObjectIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/MapPObjectIT.java b/crunch-core/src/it/java/org/apache/crunch/MapPObjectIT.java
new file mode 100644
index 0000000..c48284f
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/MapPObjectIT.java
@@ -0,0 +1,101 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static junit.framework.Assert.assertEquals;
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.crunch.impl.mem.MemPipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.materialize.pobject.MapPObject;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.PTypeFamily;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.ImmutableList;
+
+/**
+ * Integration test for {@link MapPObject} and {@code PTable.asMap()}: a table of
+ * (index, letter) pairs must materialize to a map equal to {@link #kvPairs}.
+ */
+public class MapPObjectIT {
+
+  /** Expected entries; each pair's key equals its position in this list. */
+  static final ImmutableList<Pair<Integer, String>> kvPairs = ImmutableList.of(Pair.of(0, "a"), Pair.of(1, "b"),
+      Pair.of(2, "c"), Pair.of(3, "e"));
+
+  /** Asserts that the map holds exactly the entries of {@link #kvPairs}. */
+  public void assertMatches(Map<Integer, String> m) {
+    // Without the size check an empty map would pass the loop below vacuously.
+    assertEquals(kvPairs.size(), m.size());
+    for (Integer k : m.keySet()) {
+      assertEquals(kvPairs.get(k).second(), m.get(k));
+    }
+  }
+
+  /** Maps the letters "a", "b", "c", "e" to keys 0..3; any other input gets key -1. */
+  private static class Set1Mapper extends MapFn<String, Pair<Integer, String>> {
+    @Override
+    public Pair<Integer, String> map(String input) {
+      int k = -1;
+      if (input.equals("a")) {
+        k = 0;
+      } else if (input.equals("b")) {
+        k = 1;
+      } else if (input.equals("c")) {
+        k = 2;
+      } else if (input.equals("e")) {
+        k = 3;
+      }
+      return Pair.of(k, input);
+    }
+  }
+
+  @Rule
+  public TemporaryPath tmpDir = TemporaryPaths.create();
+
+  @Test
+  public void testMemMapPObject() {
+    PTable<Integer, String> table = MemPipeline.tableOf(kvPairs);
+    PObject<Map<Integer, String>> map = new MapPObject<Integer, String>(table);
+    assertMatches(map.getValue());
+  }
+
+  @Test
+  public void testMemAsMap() {
+    PTable<Integer, String> table = MemPipeline.tableOf(kvPairs);
+    assertMatches(table.asMap().getValue());
+  }
+
+  /** Builds the (index, letter) table from set1.txt on an MRPipeline. */
+  private PTable<Integer, String> getMRPTable() throws IOException {
+    // Pass this test's own class to MRPipeline (previously a copy-pasted MaterializeToMapIT.class).
+    Pipeline p = new MRPipeline(MapPObjectIT.class, tmpDir.getDefaultConfiguration());
+    String inputFile = tmpDir.copyResourceFileName("set1.txt");
+    PCollection<String> c = p.readTextFile(inputFile);
+    PTypeFamily tf = c.getTypeFamily();
+    return c.parallelDo(new Set1Mapper(), tf.tableOf(tf.ints(), tf.strings()));
+  }
+
+  @Test
+  public void testMRMapPObject() throws IOException {
+    PTable<Integer, String> table = getMRPTable();
+    PObject<Map<Integer, String>> map = new MapPObject<Integer, String>(table);
+    assertMatches(map.getValue());
+  }
+
+  @Test
+  public void testMRAsMap() throws IOException {
+    PTable<Integer, String> table = getMRPTable();
+    assertMatches(table.asMap().getValue());
+  }
+}
[28/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/lib/join/InnerJoinFn.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/lib/join/InnerJoinFn.java b/crunch-core/src/main/java/org/apache/crunch/lib/join/InnerJoinFn.java
new file mode 100644
index 0000000..a3d30d2
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/lib/join/InnerJoinFn.java
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib.join;
+
+import java.util.List;
+
+import org.apache.crunch.Emitter;
+import org.apache.crunch.Pair;
+import org.apache.crunch.types.PType;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Final step of an inner join: buffers the left-side values of a key and emits one output pair
+ * for every combination of a buffered left value and a right-side value of the same key.
+ *
+ * @param <K> Type of the keys.
+ * @param <U> Type of the first {@link org.apache.crunch.PTable}'s values
+ * @param <V> Type of the second {@link org.apache.crunch.PTable}'s values
+ */
+public class InnerJoinFn<K, U, V> extends JoinFn<K, U, V> {
+
+ // Detached copy of the key whose left-side values are currently buffered.
+ private transient K lastKey;
+ // Detached copies of the non-null left-side values seen for lastKey.
+ private transient List<U> leftValues;
+
+ public InnerJoinFn(PType<K> keyType, PType<U> leftValueType) {
+ super(keyType, leftValueType);
+ }
+
+ /** {@inheritDoc} */
+ @Override
+ public void initialize() {
+ super.initialize();
+ this.lastKey = null;
+ this.leftValues = Lists.newArrayList();
+ }
+
+ /** {@inheritDoc} */
+ @Override
+ public void join(K key, int id, Iterable<Pair<U, V>> pairs, Emitter<Pair<K, Pair<U, V>>> emitter) {
+ // A new key invalidates everything buffered for the previous one.
+ if (!key.equals(lastKey)) {
+ lastKey = keyType.getDetachedValue(key);
+ leftValues.clear();
+ }
+
+ if (id != 0) {
+ // Right side: emit the cross product with the buffered left values.
+ for (Pair<U, V> rightPair : pairs) {
+ for (U leftValue : leftValues) {
+ emitter.emit(Pair.of(lastKey, Pair.of(leftValue, rightPair.second())));
+ }
+ }
+ return;
+ }
+
+ // Left side: keep a detached copy of every non-null value for later pairing.
+ for (Pair<U, V> leftPair : pairs) {
+ if (leftPair.first() != null) {
+ leftValues.add(leftValueType.getDetachedValue(leftPair.first()));
+ }
+ }
+ }
+
+ /** {@inheritDoc} */
+ @Override
+ public String getJoinType() {
+ return "innerJoin";
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/lib/join/JoinFn.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/lib/join/JoinFn.java b/crunch-core/src/main/java/org/apache/crunch/lib/join/JoinFn.java
new file mode 100644
index 0000000..99aea5a
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/lib/join/JoinFn.java
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib.join;
+
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.Pair;
+import org.apache.crunch.types.PType;
+
+/**
+ * Base {@link org.apache.crunch.DoFn} for join implementations. Each input record carries a
+ * (key, side id) pair together with the values grouped under it; subclasses decide what to emit
+ * in {@link #join}.
+ *
+ * @param <K> Type of the keys.
+ * @param <U> Type of the first {@link org.apache.crunch.PTable}'s values
+ * @param <V> Type of the second {@link org.apache.crunch.PTable}'s values
+ */
+public abstract class JoinFn<K, U, V> extends
+ DoFn<Pair<Pair<K, Integer>, Iterable<Pair<U, V>>>, Pair<K, Pair<U, V>>> {
+
+ /** PType of the join key; subclasses use it to obtain detached copies of keys. */
+ protected PType<K> keyType;
+ /** PType of the left-side values; subclasses use it to obtain detached copies of values. */
+ protected PType<U> leftValueType;
+
+ /**
+ * Creates a join function from the PTypes needed to make deep copies of keys and left-side
+ * values.
+ *
+ * @param keyType The PType of the value used as the key of the join
+ * @param leftValueType The PType of the value type of the left side of the join
+ */
+ public JoinFn(PType<K> keyType, PType<U> leftValueType) {
+ this.keyType = keyType;
+ this.leftValueType = leftValueType;
+ }
+
+ /** Initializes both PTypes with this function's configuration. */
+ @Override
+ public void initialize() {
+ this.keyType.initialize(getConfiguration());
+ this.leftValueType.initialize(getConfiguration());
+ }
+
+ /** @return The name of this join type (e.g. innerJoin, leftOuterJoin). */
+ public abstract String getJoinType();
+
+ /**
+ * Performs the actual joining.
+ *
+ * @param key The key for this grouping of values.
+ * @param id The side that this group of values is from (0 -> left, 1 -> right).
+ * @param pairs The group of values associated with this key and id pair.
+ * @param emitter The emitter to send the output to.
+ */
+ public abstract void join(K key, int id, Iterable<Pair<U, V>> pairs,
+ Emitter<Pair<K, Pair<U, V>>> emitter);
+
+ /**
+ * Unpacks the (key, side id) pair and the grouped values from the input record and delegates to
+ * {@link #join}.
+ *
+ * @param input The input record.
+ * @param emitter The emitter to send the output to.
+ */
+ @Override
+ public void process(Pair<Pair<K, Integer>, Iterable<Pair<U, V>>> input,
+ Emitter<Pair<K, Pair<U, V>>> emitter) {
+ Pair<K, Integer> taggedKey = input.first();
+ join(taggedKey.first(), taggedKey.second(), input.second(), emitter);
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/lib/join/JoinUtils.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/lib/join/JoinUtils.java b/crunch-core/src/main/java/org/apache/crunch/lib/join/JoinUtils.java
new file mode 100644
index 0000000..6efeccb
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/lib/join/JoinUtils.java
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib.join;
+
+import org.apache.avro.Schema;
+import org.apache.avro.generic.IndexedRecord;
+import org.apache.avro.io.BinaryData;
+import org.apache.avro.mapred.AvroJob;
+import org.apache.avro.mapred.AvroKey;
+import org.apache.avro.mapred.AvroValue;
+import org.apache.avro.mapred.AvroWrapper;
+import org.apache.avro.reflect.ReflectData;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.crunch.types.writable.TupleWritable;
+import org.apache.crunch.types.writable.WritableTypeFamily;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.DataInputBuffer;
+import org.apache.hadoop.io.RawComparator;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapreduce.Partitioner;
+
+/**
+ * Utilities that are useful in joining multiple data sets via a MapReduce.
+ *
+ */
+public class JoinUtils {
+
+ /**
+ * Selects the join partitioner for a type family: the Writable family gets
+ * {@link TupleWritablePartitioner}, every other family gets {@link AvroIndexedRecordPartitioner}.
+ */
+ public static Class<? extends Partitioner> getPartitionerClass(PTypeFamily typeFamily) {
+ if (typeFamily != WritableTypeFamily.getInstance()) {
+ return AvroIndexedRecordPartitioner.class;
+ }
+ return TupleWritablePartitioner.class;
+ }
+
+ /**
+ * Selects the join grouping comparator for a type family: the Writable family gets
+ * {@link TupleWritableComparator}, every other family gets {@link AvroPairGroupingComparator}.
+ */
+ public static Class<? extends RawComparator> getGroupingComparator(PTypeFamily typeFamily) {
+ if (typeFamily != WritableTypeFamily.getInstance()) {
+ return AvroPairGroupingComparator.class;
+ }
+ return TupleWritableComparator.class;
+ }
+
+ public static class TupleWritablePartitioner extends Partitioner<TupleWritable, Writable> {
+ @Override
+ public int getPartition(TupleWritable key, Writable value, int numPartitions) {
+ return (Math.abs(key.get(0).hashCode()) & Integer.MAX_VALUE) % numPartitions;
+ }
+ }
+
+ public static class TupleWritableComparator implements RawComparator<TupleWritable> {
+
+ private DataInputBuffer buffer = new DataInputBuffer();
+ private TupleWritable key1 = new TupleWritable();
+ private TupleWritable key2 = new TupleWritable();
+
+ @Override
+ public int compare(TupleWritable o1, TupleWritable o2) {
+ return ((WritableComparable) o1.get(0)).compareTo((WritableComparable) o2.get(0));
+ }
+
+ @Override
+ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+ try {
+ buffer.reset(b1, s1, l1);
+ key1.readFields(buffer);
+
+ buffer.reset(b2, s2, l2);
+ key2.readFields(buffer);
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+
+ return compare(key1, key2);
+ }
+ }
+
+ public static class AvroIndexedRecordPartitioner<K, V> extends Partitioner<AvroKey<K>, AvroValue<V>> {
+ @Override
+ public int getPartition(AvroKey<K> key, AvroValue<V> value, int numPartitions) {
+ IndexedRecord record = (IndexedRecord) key.datum();
+ return (Math.abs(record.get(0).hashCode()) & Integer.MAX_VALUE) % numPartitions;
+ }
+ }
+
+ /**
+ * Grouping comparator for Avro keys. It compares with the schema of the first field of the map
+ * output key schema, which is looked up from the configuration in {@link #setConf}.
+ */
+ public static class AvroPairGroupingComparator<T> extends Configured implements RawComparator<AvroWrapper<T>> {
+ private Schema schema;
+
+ @Override
+ public void setConf(Configuration conf) {
+ super.setConf(conf);
+ if (conf == null) {
+ return;
+ }
+ Schema keySchema = org.apache.avro.mapred.Pair.getKeySchema(AvroJob.getMapOutputSchema(conf));
+ schema = keySchema.getFields().get(0).schema();
+ }
+
+ @Override
+ public int compare(AvroWrapper<T> x, AvroWrapper<T> y) {
+ Object left = x.datum();
+ Object right = y.datum();
+ return ReflectData.get().compare(left, right, schema);
+ }
+
+ @Override
+ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+ return BinaryData.compare(b1, s1, l1, b2, s2, l2, schema);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/lib/join/LeftOuterJoinFn.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/lib/join/LeftOuterJoinFn.java b/crunch-core/src/main/java/org/apache/crunch/lib/join/LeftOuterJoinFn.java
new file mode 100644
index 0000000..731c496
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/lib/join/LeftOuterJoinFn.java
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib.join;
+
+import java.util.List;
+
+import org.apache.crunch.Emitter;
+import org.apache.crunch.Pair;
+import org.apache.crunch.types.PType;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Used to perform the last step of a left outer join. Left-side values of a key that are never
+ * followed by right-side values are emitted paired with {@code null}.
+ *
+ * @param <K> Type of the keys.
+ * @param <U> Type of the first {@link org.apache.crunch.PTable}'s values
+ * @param <V> Type of the second {@link org.apache.crunch.PTable}'s values
+ */
+public class LeftOuterJoinFn<K, U, V> extends JoinFn<K, U, V> {
+
+ // Side id of the most recently processed group (0 = left, anything else = right).
+ private transient int lastId;
+ // Detached copy of the key whose left-side values are currently buffered.
+ private transient K lastKey;
+ // Detached copies of the non-null left-side values seen for lastKey.
+ private transient List<U> leftValues;
+
+ public LeftOuterJoinFn(PType<K> keyType, PType<U> leftValueType) {
+ super(keyType, leftValueType);
+ }
+
+ /** {@inheritDoc} */
+ @Override
+ public void initialize() {
+ super.initialize();
+ // Starting at 1 means nothing is flushed when the very first key arrives.
+ this.lastId = 1;
+ this.lastKey = null;
+ this.leftValues = Lists.newArrayList();
+ }
+
+ /** {@inheritDoc} */
+ @Override
+ public void join(K key, int id, Iterable<Pair<U, V>> pairs, Emitter<Pair<K, Pair<U, V>>> emitter) {
+ if (!key.equals(lastKey)) {
+ // Make sure that left side always gets emitted.
+ emitLeftValuesIfUnmatched(emitter);
+ lastKey = keyType.getDetachedValue(key);
+ leftValues.clear();
+ }
+
+ if (id != 0) {
+ // Right side: emit the cross product with the buffered left values.
+ for (Pair<U, V> rightPair : pairs) {
+ for (U leftValue : leftValues) {
+ emitter.emit(Pair.of(lastKey, Pair.of(leftValue, rightPair.second())));
+ }
+ }
+ } else {
+ // Left side: keep a detached copy of every non-null value for later pairing.
+ for (Pair<U, V> leftPair : pairs) {
+ if (leftPair.first() != null) {
+ leftValues.add(leftValueType.getDetachedValue(leftPair.first()));
+ }
+ }
+ }
+
+ lastId = id;
+ }
+
+ /** {@inheritDoc} */
+ @Override
+ public void cleanup(Emitter<Pair<K, Pair<U, V>>> emitter) {
+ // The final key has no successor to trigger the flush in join(), so do it here.
+ emitLeftValuesIfUnmatched(emitter);
+ }
+
+ // Emits the buffered left values paired with null when the last processed group was a left one.
+ private void emitLeftValuesIfUnmatched(Emitter<Pair<K, Pair<U, V>>> emitter) {
+ if (0 != lastId) {
+ return;
+ }
+ for (U leftValue : leftValues) {
+ emitter.emit(Pair.of(lastKey, Pair.of(leftValue, (V) null)));
+ }
+ }
+
+ /** {@inheritDoc} */
+ @Override
+ public String getJoinType() {
+ return "leftOuterJoin";
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/lib/join/MapsideJoin.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/lib/join/MapsideJoin.java b/crunch-core/src/main/java/org/apache/crunch/lib/join/MapsideJoin.java
new file mode 100644
index 0000000..56476c1
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/lib/join/MapsideJoin.java
@@ -0,0 +1,164 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib.join;
+
+import java.io.IOException;
+
+import org.apache.crunch.CrunchRuntimeException;
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.ParallelDoOptions;
+import org.apache.crunch.SourceTarget;
+import org.apache.crunch.io.ReadableSourceTarget;
+import org.apache.crunch.materialize.MaterializableIterable;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.crunch.util.DistCache;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.Multimap;
+
+/**
+ * Utility for doing map side joins on a common key between two {@link PTable}s.
+ * <p>
+ * A map side join is an optimized join which doesn't use a reducer; instead,
+ * the right side of the join is loaded into memory and the join is performed in
+ * a mapper. This style of join has the important implication that the output of
+ * the join is not sorted, unlike the output of a conventional (reducer-based)
+ * join.
+ * <p>
+ * <b>Note:</b> This utility is only supported when running with a
+ * {@link org.apache.crunch.impl.mr.MRPipeline} as the pipeline.
+ */
+public class MapsideJoin {
+
+ /**
+ * Performs an inner join of two tables on the map side. The entire right-side table is held in
+ * memory, so only use this method when that table fits in the memory allocated to mappers.
+ *
+ * @param left
+ * The left-side table of the join
+ * @param right
+ * The right-side table of the join, whose contents will be fully
+ * read into memory
+ * @return A table keyed on the join key, containing pairs of joined values
+ */
+ public static <K, U, V> PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right) {
+ PTypeFamily tf = left.getTypeFamily();
+ Iterable<Pair<K, V>> rightContents = right.materialize();
+
+ if (!(rightContents instanceof MaterializableIterable)) {
+ // in-memory pipeline
+ return left.parallelDo(new InMemoryJoinFn<K, U, V>(rightContents),
+ tf.tableOf(left.getKeyType(), tf.pairs(left.getValueType(), right.getValueType())));
+ }
+
+ MaterializableIterable<Pair<K, V>> materialized = (MaterializableIterable<Pair<K, V>>) rightContents;
+ MapsideJoinDoFn<K, U, V> mapJoinDoFn = new MapsideJoinDoFn<K, U, V>(
+ materialized.getPath().toString(), right.getPType());
+ ParallelDoOptions.Builder optionsBuilder = ParallelDoOptions.builder();
+ if (materialized.isSourceTarget()) {
+ optionsBuilder.sourceTargets((SourceTarget) materialized.getSource());
+ }
+ return left.parallelDo("mapjoin", mapJoinDoFn,
+ tf.tableOf(left.getKeyType(), tf.pairs(left.getValueType(), right.getValueType())),
+ optionsBuilder.build());
+ }
+
+ /**
+ * Join function that receives the right-side records as an {@code Iterable} at construction
+ * time and indexes them by key in memory.
+ */
+ static class InMemoryJoinFn<K, U, V> extends DoFn<Pair<K, U>, Pair<K, Pair<U, V>>> {
+
+ private Multimap<K, V> joinMap;
+
+ /**
+ * @param iterable The right-side records; all of them are copied into an in-memory multimap.
+ */
+ public InMemoryJoinFn(Iterable<Pair<K, V>> iterable) {
+ // A list-backed multimap keeps repeated (key, value) pairs. A set-backed one would collapse
+ // them and make this join emit fewer rows than MapsideJoinDoFn for the same input.
+ joinMap = ArrayListMultimap.create();
+ for (Pair<K, V> joinPair : iterable) {
+ joinMap.put(joinPair.first(), joinPair.second());
+ }
+ }
+
+ /** Emits one (key, (left value, right value)) pair per right-side value stored under the key. */
+ @Override
+ public void process(Pair<K, U> input, Emitter<Pair<K, Pair<U, V>>> emitter) {
+ K key = input.first();
+ U value = input.second();
+ for (V joinValue : joinMap.get(key)) {
+ Pair<U, V> valuePair = Pair.of(value, joinValue);
+ emitter.emit(Pair.of(key, valuePair));
+ }
+ }
+ }
+
+ /**
+ * Join function that registers the right-side file as a cache file in {@link #configure} and
+ * loads it into an in-memory multimap in {@link #initialize}.
+ */
+ static class MapsideJoinDoFn<K, U, V> extends DoFn<Pair<K, U>, Pair<K, Pair<U, V>>> {
+
+ private String inputPath;
+ private PType<Pair<K, V>> ptype;
+ private Multimap<K, V> joinMap;
+
+ public MapsideJoinDoFn(String inputPath, PType<Pair<K, V>> ptype) {
+ this.inputPath = inputPath;
+ this.ptype = ptype;
+ }
+
+ // Resolves the cached copy of the right-side file, failing if none is found.
+ private Path getCacheFilePath() {
+ Path cached = DistCache.getPathToCacheFile(new Path(inputPath), getConfiguration());
+ if (cached != null) {
+ return cached;
+ }
+ throw new CrunchRuntimeException("Can't find local cache file for '" + inputPath + "'");
+ }
+
+ @Override
+ public void configure(Configuration conf) {
+ DistCache.addCacheFile(new Path(inputPath), conf);
+ }
+
+ @Override
+ public void initialize() {
+ super.initialize();
+
+ ReadableSourceTarget<Pair<K, V>> rightSide = ptype.getDefaultFileSource(getCacheFilePath());
+ Iterable<Pair<K, V>> rightRecords;
+ try {
+ rightRecords = rightSide.read(getConfiguration());
+ } catch (IOException e) {
+ throw new CrunchRuntimeException("Error reading right-side of map side join: ", e);
+ }
+
+ // Index every right-side record by key; repeated pairs are kept.
+ joinMap = ArrayListMultimap.create();
+ for (Pair<K, V> record : rightRecords) {
+ joinMap.put(record.first(), record.second());
+ }
+ }
+
+ /** Emits one (key, (left value, right value)) pair per right-side value stored under the key. */
+ @Override
+ public void process(Pair<K, U> input, Emitter<Pair<K, Pair<U, V>>> emitter) {
+ K joinKey = input.first();
+ U leftValue = input.second();
+ for (V rightValue : joinMap.get(joinKey)) {
+ emitter.emit(Pair.of(joinKey, Pair.of(leftValue, rightValue)));
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/lib/join/RightOuterJoinFn.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/lib/join/RightOuterJoinFn.java b/crunch-core/src/main/java/org/apache/crunch/lib/join/RightOuterJoinFn.java
new file mode 100644
index 0000000..2789d40
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/lib/join/RightOuterJoinFn.java
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib.join;
+
+import java.util.List;
+
+import org.apache.crunch.Emitter;
+import org.apache.crunch.Pair;
+import org.apache.crunch.types.PType;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Used to perform the last step of a right outer join. Right-side values of a key with no
+ * buffered left-side values are emitted paired with {@code null}.
+ *
+ * @param <K> Type of the keys.
+ * @param <U> Type of the first {@link org.apache.crunch.PTable}'s values
+ * @param <V> Type of the second {@link org.apache.crunch.PTable}'s values
+ */
+public class RightOuterJoinFn<K, U, V> extends JoinFn<K, U, V> {
+
+ // Detached copy of the key whose left-side values are currently buffered.
+ private transient K lastKey;
+ // Detached copies of the non-null left-side values seen for lastKey.
+ private transient List<U> leftValues;
+
+ public RightOuterJoinFn(PType<K> keyType, PType<U> leftValueType) {
+ super(keyType, leftValueType);
+ }
+
+ /** {@inheritDoc} */
+ @Override
+ public void initialize() {
+ super.initialize();
+ this.lastKey = null;
+ this.leftValues = Lists.newArrayList();
+ }
+
+ /** {@inheritDoc} */
+ @Override
+ public void join(K key, int id, Iterable<Pair<U, V>> pairs, Emitter<Pair<K, Pair<U, V>>> emitter) {
+ // A new key invalidates everything buffered for the previous one.
+ if (!key.equals(lastKey)) {
+ lastKey = keyType.getDetachedValue(key);
+ leftValues.clear();
+ }
+
+ if (id != 0) {
+ for (Pair<U, V> rightPair : pairs) {
+ // Make sure that right side gets emitted.
+ if (leftValues.isEmpty()) {
+ leftValues.add(null);
+ }
+
+ for (U leftValue : leftValues) {
+ emitter.emit(Pair.of(lastKey, Pair.of(leftValue, rightPair.second())));
+ }
+ }
+ return;
+ }
+
+ // Left side: keep a detached copy of every non-null value for later pairing.
+ for (Pair<U, V> leftPair : pairs) {
+ if (leftPair.first() != null) {
+ leftValues.add(leftValueType.getDetachedValue(leftPair.first()));
+ }
+ }
+ }
+
+ /** {@inheritDoc} */
+ @Override
+ public String getJoinType() {
+ return "rightOuterJoin";
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/lib/join/package-info.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/lib/join/package-info.java b/crunch-core/src/main/java/org/apache/crunch/lib/join/package-info.java
new file mode 100644
index 0000000..f1ad9f1
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/lib/join/package-info.java
@@ -0,0 +1,22 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Inner and outer joins on collections.
+ */
+package org.apache.crunch.lib.join;
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/lib/package-info.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/lib/package-info.java b/crunch-core/src/main/java/org/apache/crunch/lib/package-info.java
new file mode 100644
index 0000000..2695787
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/lib/package-info.java
@@ -0,0 +1,22 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Joining, sorting, aggregating, and other commonly used functionality.
+ */
+package org.apache.crunch.lib;
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/lib/sort/Comparators.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/lib/sort/Comparators.java b/crunch-core/src/main/java/org/apache/crunch/lib/sort/Comparators.java
new file mode 100644
index 0000000..ae7f49a
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/lib/sort/Comparators.java
@@ -0,0 +1,187 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib.sort;
+
+import java.util.Arrays;
+
+import org.apache.avro.Schema;
+import org.apache.avro.io.BinaryData;
+import org.apache.avro.mapred.AvroKey;
+import org.apache.avro.reflect.ReflectData;
+import org.apache.crunch.lib.Sort.ColumnOrder;
+import org.apache.crunch.lib.Sort.Order;
+import org.apache.crunch.types.writable.TupleWritable;
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.RawComparator;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.io.WritableComparator;
+import org.apache.hadoop.mapred.JobConf;
+
+import com.google.common.base.Function;
+import com.google.common.base.Joiner;
+import com.google.common.collect.Iterables;
+
+/**
+ * A collection of {@code RawComparator<T>} implementations that are used by Crunch's {@code Sort} library.
+ */
+public class Comparators {
+
+ /**
+ * Reverses the ordering of the {@code WritableComparator} obtained for the job's map output key
+ * class. The delegate is looked up in {@link #setConf}.
+ */
+ public static class ReverseWritableComparator<T> extends Configured implements RawComparator<T> {
+
+ private RawComparator<T> comparator;
+
+ @SuppressWarnings("unchecked")
+ @Override
+ public void setConf(Configuration conf) {
+ super.setConf(conf);
+ if (conf != null) {
+ JobConf jobConf = new JobConf(conf);
+ comparator = WritableComparator.get(jobConf.getMapOutputKeyClass().asSubclass(WritableComparable.class));
+ }
+ }
+
+ @Override
+ public int compare(byte[] arg0, int arg1, int arg2, byte[] arg3, int arg4, int arg5) {
+ // Swap the operands rather than negate the result: -Integer.MIN_VALUE is still negative,
+ // so negation would fail to reverse a delegate that returns Integer.MIN_VALUE.
+ return comparator.compare(arg3, arg4, arg5, arg0, arg1, arg2);
+ }
+
+ @Override
+ public int compare(T o1, T o2) {
+ // Operands swapped for the same reason as in the raw comparison.
+ return comparator.compare(o2, o1);
+ }
+ }
+
+ /**
+ * Reverses the Avro ordering of keys, comparing with the schema parsed from the
+ * {@code crunch.schema} configuration property.
+ */
+ public static class ReverseAvroComparator<T> extends Configured implements RawComparator<AvroKey<T>> {
+
+ private Schema schema;
+
+ @Override
+ public void setConf(Configuration conf) {
+ super.setConf(conf);
+ if (conf != null) {
+ schema = (new Schema.Parser()).parse(conf.get("crunch.schema"));
+ }
+ }
+
+ @Override
+ public int compare(AvroKey<T> o1, AvroKey<T> o2) {
+ // Swap the operands rather than negate the result: -Integer.MIN_VALUE is still negative,
+ // so negation is not a safe way to reverse an arbitrary compare result.
+ return ReflectData.get().compare(o2.datum(), o1.datum(), schema);
+ }
+
+ @Override
+ public int compare(byte[] arg0, int arg1, int arg2, byte[] arg3, int arg4, int arg5) {
+ // Operands swapped for the same reason as in the object comparison.
+ return BinaryData.compare(arg3, arg4, arg5, arg0, arg1, arg2, schema);
+ }
+ }
+
+ /**
+ * Compares {@code TupleWritable} instances element by element, using the per-position orders
+ * stored in the {@code crunch.ordering} configuration property as a comma-separated list of
+ * {@code <column>;<order>} entries.
+ */
+ public static class TupleWritableComparator extends WritableComparator implements Configurable {
+
+ private static final String CRUNCH_ORDERING_PROPERTY = "crunch.ordering";
+
+ private Configuration conf;
+ private ColumnOrder[] columnOrders;
+
+ public TupleWritableComparator() {
+ super(TupleWritable.class, true);
+ }
+
+ /**
+ * Stores one order per tuple position in the configuration.
+ *
+ * @param conf The configuration to write the ordering to.
+ * @param orders The order to apply to each tuple position, first position first.
+ */
+ public static void configureOrdering(Configuration conf, Order... orders) {
+ // setConf() parses every entry as "<column>;<order>", so a column has to be written here as
+ // well; bare order names would fail in Integer.parseInt. compare() only uses the position of
+ // an entry, so position + 1 is recorded.
+ // NOTE(review): assumes ColumnOrder columns are 1-based - confirm against Sort.ColumnOrder.
+ String[] entries = new String[orders.length];
+ for (int i = 0; i < orders.length; i++) {
+ entries[i] = (i + 1) + ";" + orders[i].name();
+ }
+ conf.set(CRUNCH_ORDERING_PROPERTY, Joiner.on(",").join(Arrays.asList(entries)));
+ }
+
+ /**
+ * Stores the given column orders in the configuration as {@code <column>;<order>} entries.
+ *
+ * @param conf The configuration to write the ordering to.
+ * @param columnOrders The column orders, in the sequence they should be applied.
+ */
+ public static void configureOrdering(Configuration conf, ColumnOrder... columnOrders) {
+ conf.set(CRUNCH_ORDERING_PROPERTY,
+ Joiner.on(",").join(Iterables.transform(Arrays.asList(columnOrders), new Function<ColumnOrder, String>() {
+ @Override
+ public String apply(ColumnOrder o) {
+ return o.column() + ";" + o.order().name();
+ }
+ })));
+ }
+
+ /**
+ * Compares two tuples position by position. Positions whose order is neither ascending nor
+ * descending are skipped, and a present element sorts after a missing one in ascending order.
+ */
+ @Override
+ public int compare(WritableComparable a, WritableComparable b) {
+ TupleWritable ta = (TupleWritable) a;
+ TupleWritable tb = (TupleWritable) b;
+ for (int index = 0; index < columnOrders.length; index++) {
+ int order;
+ if (columnOrders[index].order() == Order.ASCENDING) {
+ order = 1;
+ } else if (columnOrders[index].order() == Order.DESCENDING) {
+ order = -1;
+ } else { // ignore
+ continue;
+ }
+ boolean hasA = ta.has(index);
+ boolean hasB = tb.has(index);
+ if (!hasA && !hasB) {
+ continue;
+ }
+ if (hasA != hasB) {
+ return hasA ? order : -order;
+ }
+ Writable v1 = ta.get(index);
+ Writable v2 = tb.get(index);
+ if (v1 == v2 || v1 == null || v1.equals(v2)) {
+ continue;
+ }
+ int cmp;
+ if (v1 instanceof WritableComparable && v2 instanceof WritableComparable) {
+ cmp = ((WritableComparable) v1).compareTo((WritableComparable) v2);
+ } else {
+ // Compare the hash codes directly; subtracting them can overflow and flip the sign.
+ int h1 = v1.hashCode();
+ int h2 = v2.hashCode();
+ cmp = (h1 < h2) ? -1 : ((h1 == h2) ? 0 : 1);
+ }
+ if (cmp != 0) {
+ // signum keeps order * cmp from overflowing when compareTo returns Integer.MIN_VALUE.
+ return order * Integer.signum(cmp);
+ }
+ }
+ return 0; // ordering using specified cols found no differences
+ }
+
+ @Override
+ public Configuration getConf() {
+ return conf;
+ }
+
+ /**
+ * Stores the configuration and, when it is non-null, parses the ordering property.
+ *
+ * @throws IllegalStateException if the ordering property is not set
+ * @throws IllegalArgumentException if an entry is not of the form {@code <column>;<order>}
+ */
+ @Override
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ if (conf == null) {
+ return;
+ }
+ String ordering = conf.get(CRUNCH_ORDERING_PROPERTY);
+ if (ordering == null) {
+ throw new IllegalStateException("Configuration property '" + CRUNCH_ORDERING_PROPERTY
+ + "' is not set; call configureOrdering first");
+ }
+ if (ordering.isEmpty()) {
+ // An empty ordering was configured: no positions take part in the comparison.
+ columnOrders = new ColumnOrder[0];
+ return;
+ }
+ String[] columnOrderNames = ordering.split(",");
+ columnOrders = new ColumnOrder[columnOrderNames.length];
+ for (int i = 0; i < columnOrders.length; i++) {
+ String[] split = columnOrderNames[i].split(";");
+ if (split.length < 2) {
+ throw new IllegalArgumentException("Malformed entry '" + columnOrderNames[i] + "' in '"
+ + CRUNCH_ORDERING_PROPERTY + "'; expected <column>;<order>");
+ }
+ int column = Integer.parseInt(split[0]);
+ Order order = Order.valueOf(split[1]);
+ columnOrders[i] = ColumnOrder.by(column, order);
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/lib/sort/SortFns.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/lib/sort/SortFns.java b/crunch-core/src/main/java/org/apache/crunch/lib/sort/SortFns.java
new file mode 100644
index 0000000..be218f6
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/lib/sort/SortFns.java
@@ -0,0 +1,210 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib.sort;
+
+import java.util.List;
+import java.util.UUID;
+
+import org.apache.avro.Schema;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.Tuple;
+import org.apache.crunch.lib.Sort.ColumnOrder;
+import org.apache.crunch.lib.Sort.Order;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.crunch.types.TupleFactory;
+import org.apache.crunch.types.avro.AvroType;
+import org.apache.crunch.types.avro.AvroTypeFamily;
+import org.apache.crunch.types.avro.Avros;
+
+import com.google.common.collect.Lists;
+
+/**
+ * A set of {@code DoFn}s that are used by Crunch's {@code Sort} library.
+ */
+public class SortFns {
+
+ /**
+ * Extracts a single indexed key from a {@code Tuple} instance.
+ */
+ public static class SingleKeyFn<V extends Tuple, K> extends MapFn<V, K> {
+ private final int index;
+
+ public SingleKeyFn(int index) {
+ this.index = index;
+ }
+
+ @Override
+ public K map(V input) {
+ return (K) input.get(index);
+ }
+ }
+
+ /**
+  * A {@code MapFn} that gathers the elements at several positions of the input {@code Tuple}
+  * and hands them, in the order the positions were given, to a {@code TupleFactory}.
+  */
+ public static class TupleKeyFn<V extends Tuple, K extends Tuple> extends MapFn<V, K> {
+   private final int[] indices;
+   private final TupleFactory tupleFactory;
+
+   public TupleKeyFn(int[] indices, TupleFactory tupleFactory) {
+     this.indices = indices;
+     this.tupleFactory = tupleFactory;
+   }
+
+   @Override
+   public K map(V input) {
+     final Object[] picked = new Object[indices.length];
+     int slot = 0;
+     for (int sourceIndex : indices) {
+       picked[slot] = input.get(sourceIndex);
+       slot++;
+     }
+     return (K) tupleFactory.makeTuple(picked);
+   }
+ }
+
+ /**
+ * Pulls a composite set of keys from an Avro {@code GenericRecord} instance.
+ */
+ public static class AvroGenericFn<V extends Tuple> extends MapFn<V, GenericRecord> {
+
+ private final int[] indices;
+ private final String schemaJson;
+ private transient Schema schema;
+
+ public AvroGenericFn(int[] indices, Schema schema) {
+ this.indices = indices;
+ this.schemaJson = schema.toString();
+ }
+
+ @Override
+ public void initialize() {
+ this.schema = (new Schema.Parser()).parse(schemaJson);
+ }
+
+ @Override
+ public GenericRecord map(V input) {
+ GenericRecord rec = new GenericData.Record(schema);
+ for (int i = 0; i < indices.length; i++) {
+ rec.put(i, input.get(indices[i]));
+ }
+ return rec;
+ }
+ }
+
+ /**
+  * Builds an Avro record schema whose fields mirror the first {@code orders.length} fields of
+  * {@code ptype}'s schema, each carrying the sort order requested by the matching entry of
+  * {@code orders}.
+  *
+  * @param ptype the type being keyed on; it and its sub-types are cast to {@code AvroType}
+  * @param orders one entry per field, in field order
+  * @return a record schema in the {@code crunch} namespace with a randomly generated name
+  */
+ public static <S> Schema createOrderedTupleSchema(PType<S> ptype, ColumnOrder[] orders) {
+   // A random suffix gives every generated tuple schema its own name.
+   String uniqueSuffix = UUID.randomUUID().toString().replace('-', 'x');
+   Schema recordSchema = Schema.createRecord("tuple" + uniqueSuffix, "", "crunch", false);
+   List<Schema.Field> orderedFields = Lists.newArrayList();
+   Schema parentSchema = ((AvroType<S>) ptype).getSchema();
+
+   for (int i = 0; i < orders.length; i++) {
+     ColumnOrder requested = orders[i];
+     Schema fieldSchema = ((AvroType<?>) ptype.getSubTypes().get(i)).getSchema();
+     String fieldName = parentSchema.getFields().get(i).name();
+     // Only DESCENDING is carried over; any other requested order becomes ASCENDING.
+     // NOTE(review): the original author remarked that Avro string ordering differs from
+     // WritableComparable Text ordering and that this mapping keeps them consistent - confirm.
+     Schema.Field.Order avroOrder;
+     if (requested.order() == Order.DESCENDING) {
+       avroOrder = Schema.Field.Order.DESCENDING;
+     } else {
+       avroOrder = Schema.Field.Order.ASCENDING;
+     }
+     orderedFields.add(new Schema.Field(fieldName, fieldSchema, "", null, avroOrder));
+   }
+   recordSchema.setFields(orderedFields);
+   return recordSchema;
+ }
+
+ /**
+  * Works out, for a tuple {@code PType} and a list of column orderings, both the function that
+  * extracts the sort key from a tuple and the {@code PType} describing that key.
+  */
+ public static class KeyExtraction<V extends Tuple> {
+
+   private PType<V> ptype;
+   private final ColumnOrder[] columnOrder;
+   // Zero-based positions derived from the column numbers in columnOrder.
+   private final int[] cols;
+
+   private MapFn<V, Object> byFn;
+   private PType<Object> keyPType;
+
+   public KeyExtraction(PType<V> ptype, ColumnOrder[] columnOrder) {
+     this.ptype = ptype;
+     this.columnOrder = columnOrder;
+     this.cols = new int[columnOrder.length];
+     for (int i = 0; i < columnOrder.length; i++) {
+       // column() is shifted down by one to index into the tuple.
+       cols[i] = columnOrder[i].column() - 1;
+     }
+     init();
+   }
+
+   /**
+    * Selects the key function and key type according to how many columns were requested.
+    */
+   private void init() {
+     List<PType> subTypes = ptype.getSubTypes();
+     PTypeFamily family = ptype.getFamily();
+     int keyWidth = cols.length;
+
+     if (keyWidth == 1) {
+       // A single column is used directly as the key.
+       byFn = new SingleKeyFn(cols[0]);
+       keyPType = subTypes.get(cols[0]);
+       return;
+     }
+
+     TupleFactory factory = null;
+     if (keyWidth == 2) {
+       factory = TupleFactory.PAIR;
+       keyPType = family.pairs(subTypes.get(cols[0]), subTypes.get(cols[1]));
+     } else if (keyWidth == 3) {
+       factory = TupleFactory.TUPLE3;
+       keyPType = family.triples(subTypes.get(cols[0]), subTypes.get(cols[1]), subTypes.get(cols[2]));
+     } else if (keyWidth == 4) {
+       factory = TupleFactory.TUPLE4;
+       keyPType = family.quads(subTypes.get(cols[0]), subTypes.get(cols[1]), subTypes.get(cols[2]),
+           subTypes.get(cols[3]));
+     } else {
+       PType[] selected = new PType[keyWidth];
+       for (int i = 0; i < selected.length; i++) {
+         selected[i] = subTypes.get(cols[i]);
+       }
+       factory = TupleFactory.TUPLEN;
+       keyPType = (PType<Object>) (PType<?>) family.tuples(selected);
+     }
+
+     if (family == AvroTypeFamily.getInstance()) {
+       // Avro keys are emitted as generic records built against an order-aware schema.
+       Schema orderedSchema = createOrderedTupleSchema(keyPType, columnOrder);
+       keyPType = (PType<Object>) (PType<?>) Avros.generics(orderedSchema);
+       byFn = new AvroGenericFn(cols, orderedSchema);
+     } else {
+       byFn = new TupleKeyFn(cols, factory);
+     }
+   }
+
+   /** Returns the function that extracts the key from a tuple. */
+   public MapFn<V, Object> getByFn() {
+     return byFn;
+   }
+
+   /** Returns the {@code PType} of the extracted key. */
+   public PType<Object> getKeyType() {
+     return keyPType;
+   }
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/lib/sort/TotalOrderPartitioner.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/lib/sort/TotalOrderPartitioner.java b/crunch-core/src/main/java/org/apache/crunch/lib/sort/TotalOrderPartitioner.java
new file mode 100644
index 0000000..94fbdbe
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/lib/sort/TotalOrderPartitioner.java
@@ -0,0 +1,145 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib.sort;
+
+import java.io.IOException;
+import java.lang.reflect.Array;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+
+import org.apache.avro.Schema;
+import org.apache.avro.mapred.AvroKey;
+import org.apache.crunch.io.CompositePathIterable;
+import org.apache.crunch.io.avro.AvroFileReaderFactory;
+import org.apache.crunch.io.seq.SeqFileReaderFactory;
+import org.apache.crunch.types.writable.WritableDeepCopier;
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.RawComparator;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Partitioner;
+
+/**
+ * A partition-aware {@code Partitioner} instance that can work with either Avro or Writable-formatted
+ * keys.
+ */
+public class TotalOrderPartitioner<K, V> extends Partitioner<K, V> implements Configurable {
+
+ public static final String DEFAULT_PATH = "_partition.lst";
+ public static final String PARTITIONER_PATH =
+ "crunch.totalorderpartitioner.path";
+
+ private Configuration conf;
+ private Node<K> partitions;
+
+ /**
+ * Returns the configuration most recently stored by {@code setConf}.
+ */
+ @Override
+ public Configuration getConf() {
+ return conf;
+ }
+
+ /**
+ * Stores the configuration and loads the split points that define the partitions.
+ *
+ * <p>The partition file location comes from {@code getPartitionFile(conf)}. A {@code Job} built
+ * from the configuration supplies the map output key class, the sort comparator and the number
+ * of reduce tasks. The split points are read and sorted by {@code readPartitions} and then
+ * wrapped in a {@code BinarySearchNode}.
+ *
+ * @param conf the job configuration
+ * @throws IllegalArgumentException if an {@code IOException} occurs while reading, or if the
+ * number of split points is not exactly the number of reduce tasks minus one
+ */
+ @Override
+ public void setConf(Configuration conf) {
+ try {
+ this.conf = conf;
+ String parts = getPartitionFile(conf);
+ final Path partFile = new Path(parts);
+ // The default path is looked up on the local file system; any other path uses its own.
+ final FileSystem fs = (DEFAULT_PATH.equals(parts))
+ ? FileSystem.getLocal(conf) // assume in DistributedCache
+ : partFile.getFileSystem(conf);
+
+ Job job = new Job(conf);
+ Class<K> keyClass = (Class<K>)job.getMapOutputKeyClass();
+ RawComparator<K> comparator =
+ (RawComparator<K>) job.getSortComparator();
+ K[] splitPoints = readPartitions(fs, partFile, keyClass, conf, comparator);
+ int numReduceTasks = job.getNumReduceTasks();
+ // n partitions need exactly n - 1 split points.
+ if (splitPoints.length != numReduceTasks - 1) {
+ throw new IOException("Wrong number of partitions in keyset");
+ }
+ partitions = new BinarySearchNode(splitPoints, comparator);
+ } catch (IOException e) {
+ throw new IllegalArgumentException("Can't read partitions file", e);
+ }
+ }
+
+ /**
+ * Returns the partition for {@code key} by delegating to the node built in {@code setConf}.
+ * The {@code value} and {@code modulo} arguments are not used.
+ */
+ @Override
+ public int getPartition(K key, V value, int modulo) {
+ return partitions.findPartition(key);
+ }
+
+ /**
+ * Records the partition file location in {@code conf} under {@link #PARTITIONER_PATH}.
+ *
+ * @param conf the configuration to update
+ * @param p the path of the partition file; stored as {@code p.toString()}
+ */
+ public static void setPartitionFile(Configuration conf, Path p) {
+ conf.set(PARTITIONER_PATH, p.toString());
+ }
+
+ /**
+ * Returns the partition file location stored under {@link #PARTITIONER_PATH}, or
+ * {@link #DEFAULT_PATH} when that property is not set.
+ *
+ * @param conf the configuration to read
+ */
+ public static String getPartitionFile(Configuration conf) {
+ return conf.get(PARTITIONER_PATH, DEFAULT_PATH);
+ }
+
+ /**
+  * Reads every split key found at {@code p} and returns them sorted with {@code comparator}.
+  *
+  * <p>When the {@code crunch.schema} property is set, the file is read as Avro using that schema
+  * and each datum is wrapped in an {@code AvroKey}. Otherwise it is read as a sequence file of
+  * {@code keyClass} and each key is deep-copied before being kept.
+  *
+  * @return an array whose component type is {@code keyClass}
+  * @throws IOException if the partition file cannot be read
+  */
+ @SuppressWarnings("unchecked") // map output key class
+ private K[] readPartitions(FileSystem fs, Path p, Class<K> keyClass,
+     Configuration conf, final RawComparator<K> comparator) throws IOException {
+   ArrayList<K> splitKeys = new ArrayList<K>();
+   String schemaJson = conf.get("crunch.schema");
+   if (schemaJson == null) {
+     WritableDeepCopier copier = new WritableDeepCopier(keyClass);
+     SeqFileReaderFactory<K> readerFactory = new SeqFileReaderFactory<K>(keyClass);
+     for (Iterator<K> keys = CompositePathIterable.create(fs, p, readerFactory).iterator();
+         keys.hasNext();) {
+       splitKeys.add((K) copier.deepCopy((Writable) keys.next()));
+     }
+   } else {
+     Schema avroSchema = (new Schema.Parser()).parse(schemaJson);
+     AvroFileReaderFactory<K> readerFactory = new AvroFileReaderFactory<K>(avroSchema);
+     for (Iterator<K> data = CompositePathIterable.create(fs, p, readerFactory).iterator();
+         data.hasNext();) {
+       splitKeys.add((K) new AvroKey<K>(data.next()));
+     }
+   }
+   Collections.sort(splitKeys, comparator);
+   K[] holder = (K[]) Array.newInstance(keyClass, splitKeys.size());
+   return splitKeys.toArray(holder);
+ }
+
+ /**
+ * Interface to the partitioner to locate a key in the partition keyset.
+ *
+ * @param <T> the key type
+ */
+ interface Node<T> {
+ /**
+ * Locate the partition for {@code key} in keyset K, such that [Ki..Ki+1) defines a partition,
+ * with implicit K0 = -inf, Kn = +inf, and |K| = #partitions - 1.
+ *
+ * @param key the key to place
+ * @return the index of the partition the key falls in
+ */
+ int findPartition(T key);
+ }
+
+ class BinarySearchNode implements Node<K> {
+ private final K[] splitPoints;
+ private final RawComparator<K> comparator;
+ BinarySearchNode(K[] splitPoints, RawComparator<K> comparator) {
+ this.splitPoints = splitPoints;
+ this.comparator = comparator;
+ }
+ public int findPartition(K key) {
+ final int pos = Arrays.binarySearch(splitPoints, key, comparator) + 1;
+ return (pos < 0) ? -pos : pos;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/materialize/MaterializableIterable.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/materialize/MaterializableIterable.java b/crunch-core/src/main/java/org/apache/crunch/materialize/MaterializableIterable.java
new file mode 100644
index 0000000..2dcc64f
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/materialize/MaterializableIterable.java
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.materialize;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.crunch.CrunchRuntimeException;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.SourceTarget;
+import org.apache.crunch.io.PathTarget;
+import org.apache.crunch.io.ReadableSource;
+import org.apache.crunch.io.impl.FileSourceImpl;
+import org.apache.hadoop.fs.Path;
+
+ /**
+  * An {@code Iterable} over a {@code ReadableSource} that runs the pipeline and reads the
+  * source the first time an iterator is requested.
+  */
+ public class MaterializableIterable<E> implements Iterable<E> {
+
+   private static final Log LOG = LogFactory.getLog(MaterializableIterable.class);
+
+   private final Pipeline pipeline;
+   private final ReadableSource<E> source;
+   // Null until materialize() has succeeded.
+   private Iterable<E> materialized;
+
+   public MaterializableIterable(Pipeline pipeline, ReadableSource<E> source) {
+     this.pipeline = pipeline;
+     this.source = source;
+     this.materialized = null;
+   }
+
+   /** Returns the source this iterable reads from. */
+   public ReadableSource<E> getSource() {
+     return source;
+   }
+
+   /** Reports whether the source is also a {@code SourceTarget}. */
+   public boolean isSourceTarget() {
+     return (source instanceof SourceTarget);
+   }
+
+   /**
+    * Returns the path of the source when it is a {@code FileSourceImpl} or a
+    * {@code PathTarget}, and null otherwise.
+    */
+   public Path getPath() {
+     if (source instanceof FileSourceImpl) {
+       return ((FileSourceImpl) source).getPath();
+     }
+     if (source instanceof PathTarget) {
+       return ((PathTarget) source).getPath();
+     }
+     return null;
+   }
+
+   @Override
+   public Iterator<E> iterator() {
+     if (materialized != null) {
+       return materialized.iterator();
+     }
+     // First use: run the pipeline, then read what the source holds.
+     pipeline.run();
+     materialize();
+     return materialized.iterator();
+   }
+
+   /**
+    * Reads the source using the pipeline's configuration and keeps the result.
+    *
+    * @throws CrunchRuntimeException wrapping any {@code IOException} raised by the read
+    */
+   public void materialize() {
+     try {
+       materialized = source.read(pipeline.getConfiguration());
+     } catch (IOException e) {
+       LOG.error("Could not materialize: " + source, e);
+       throw new CrunchRuntimeException(e);
+     }
+   }
+ }
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/materialize/MaterializableMap.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/materialize/MaterializableMap.java b/crunch-core/src/main/java/org/apache/crunch/materialize/MaterializableMap.java
new file mode 100644
index 0000000..69082e2
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/materialize/MaterializableMap.java
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.materialize;
+
+import java.util.AbstractMap;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.crunch.Pair;
+
+ /**
+  * A {@code Map} view over an iterable of pairs. The pairs are copied into a {@code HashMap}
+  * the first time {@link #entrySet()} is called, and that entry set is reused afterwards.
+  */
+ public class MaterializableMap<K, V> extends AbstractMap<K, V> {
+
+   private Iterable<Pair<K, V>> iterable;
+   private Set<Map.Entry<K, V>> entrySet;
+
+   public MaterializableMap(Iterable<Pair<K, V>> iterable) {
+     this.iterable = iterable;
+   }
+
+   /** Copies every pair into a fresh {@code HashMap}; a later pair overwrites an equal key. */
+   private Set<Map.Entry<K, V>> toMapEntries(Iterable<Pair<K, V>> xs) {
+     HashMap<K, V> collected = new HashMap<K, V>();
+     for (Pair<K, V> pair : xs) {
+       collected.put(pair.first(), pair.second());
+     }
+     return collected.entrySet();
+   }
+
+   @Override
+   public Set<Map.Entry<K, V>> entrySet() {
+     if (entrySet != null) {
+       return entrySet;
+     }
+     entrySet = toMapEntries(iterable);
+     return entrySet;
+   }
+
+ }
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/materialize/pobject/CollectionPObject.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/materialize/pobject/CollectionPObject.java b/crunch-core/src/main/java/org/apache/crunch/materialize/pobject/CollectionPObject.java
new file mode 100644
index 0000000..60e64b1
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/materialize/pobject/CollectionPObject.java
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.materialize.pobject;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Iterator;
+
+import org.apache.crunch.PCollection;
+
+/**
+ * A concrete implementation of {@link org.apache.crunch.materialize.pobject.PObjectImpl} whose
+ * value is a Java {@link java.util.Collection} containing the elements of the underlying {@link
+ * PCollection} for this {@link org.apache.crunch.PObject}.
+ *
+ * @param <S> The value type for elements contained in the {@code Collection} value encapsulated
+ * by this {@code PObject}.
+ */
+ public class CollectionPObject<S> extends PObjectImpl<S, Collection<S>> {
+
+   /**
+    * Creates a {@code PObject} whose value is gathered from the given {@code PCollection}.
+    *
+    * @param collect The backing {@code PCollection} for this {@code PObject}.
+    */
+   public CollectionPObject(PCollection<S> collect) {
+     super(collect);
+   }
+
+   /**
+    * Copies every element of {@code input}, in iteration order, into a new {@code ArrayList}.
+    */
+   @Override
+   public Collection<S> process(Iterable<S> input) {
+     Collection<S> gathered = new ArrayList<S>();
+     for (S element : input) {
+       gathered.add(element);
+     }
+     return gathered;
+   }
+ }
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/materialize/pobject/FirstElementPObject.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/materialize/pobject/FirstElementPObject.java b/crunch-core/src/main/java/org/apache/crunch/materialize/pobject/FirstElementPObject.java
new file mode 100644
index 0000000..aa5fd9e
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/materialize/pobject/FirstElementPObject.java
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.materialize.pobject;
+
+import java.util.Iterator;
+
+import org.apache.crunch.PCollection;
+
+/**
+ * A concrete implementation of {@link PObjectImpl} that uses the first element in the backing
+ * {@link PCollection} as the {@link org.apache.crunch.PObject} value.
+ *
+ * @param <T> The value type of this {@code PObject}.
+ */
+ public class FirstElementPObject<T> extends PObjectImpl<T, T> {
+
+   /**
+    * Creates a {@code PObject} whose value is taken from the given {@code PCollection}.
+    *
+    * @param collect The backing {@code PCollection} for this {@code PObject}.
+    */
+   public FirstElementPObject(PCollection<T> collect) {
+     super(collect);
+   }
+
+   /**
+    * Returns the first element produced by {@code input}, or null when it produces none.
+    */
+   @Override
+   public T process(Iterable<T> input) {
+     Iterator<T> elements = input.iterator();
+     return elements.hasNext() ? elements.next() : null;
+   }
+ }
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/materialize/pobject/MapPObject.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/materialize/pobject/MapPObject.java b/crunch-core/src/main/java/org/apache/crunch/materialize/pobject/MapPObject.java
new file mode 100644
index 0000000..243997f
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/materialize/pobject/MapPObject.java
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.materialize.pobject;
+
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
+import org.apache.crunch.PCollection;
+import org.apache.crunch.Pair;
+
+/**
+ * A concrete implementation of {@link PObjectImpl} whose
+ * value is a Java {@link Map}. The underlying {@link PCollection} for this
+ * {@link org.apache.crunch.PObject} must contain {@link Pair}s of values. The
+ * first element of the pair will be used as the map key, while the second element will be used
+ * as the map value. Note that the contents of the underlying {@code PCollection} may not be
+ * reflected in the returned {@code Map}, since a single key may be mapped to several values in
+ * the underlying {@code PCollection}, and only one of those values will appear in the {@code
+ * Map} encapsulated by this {@code PObject}.
+ *
+ * @param <K> The type of keys for the Map.
+ * @param <V> The type of values for the Map.
+ */
+ public class MapPObject<K, V> extends PObjectImpl<Pair<K, V>, Map<K, V>> {
+
+   /**
+    * Creates a {@code PObject} whose value is built from the given {@code PCollection} of pairs.
+    *
+    * @param collect The backing {@code PCollection} for this {@code PObject}.
+    */
+   public MapPObject(PCollection<Pair<K, V>> collect) {
+     super(collect);
+   }
+
+   /**
+    * Puts every pair of {@code input} into a new {@code HashMap}, first element as key and
+    * second as value; when a key repeats, the pair seen last is the one kept.
+    */
+   @Override
+   public Map<K, V> process(Iterable<Pair<K, V>> input) {
+     Map<K, V> byKey = new HashMap<K, V>();
+     for (Pair<K, V> entry : input) {
+       byKey.put(entry.first(), entry.second());
+     }
+     return byKey;
+   }
+ }
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/materialize/pobject/PObjectImpl.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/materialize/pobject/PObjectImpl.java b/crunch-core/src/main/java/org/apache/crunch/materialize/pobject/PObjectImpl.java
new file mode 100644
index 0000000..59c2ba2
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/materialize/pobject/PObjectImpl.java
@@ -0,0 +1,85 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.materialize.pobject;
+
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PObject;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.Target;
+
+/**
+ * An abstract implementation of {@link PObject} that is backed by a {@link PCollection}.
+ * Clients creating a concrete implementation should override the method
+ * {@link PObjectImpl#process(Iterable)}, which transforms the backing PCollection into the
+ * singleton value encapsulated by the PObject. Once this {code PObject}'s value has been
+ * calculated, the value is cached to prevent subsequent materializations of the backing
+ * {@code PCollection}.
+ *
+ * @param <S> The type contained in the underlying PCollection.
+ * @param <T> The type encapsulated by this PObject.
+ */
+public abstract class PObjectImpl<S, T> implements PObject<T> {
+
+ // The underlying PCollection whose contents will be used to generate the value for this
+ // PObject.
+ private PCollection<S> collection;
+
+ // A variable to hold a cached copy of the value of this {@code PObject},
+ // to prevent unnecessary materializations of the backing {@code PCollection}.
+ private T cachedValue;
+
+ // A flag indicating if a value for this {@code PObject} has been cached.
+ private boolean isCached;
+
+ /**
+ * Constructs a new instance of this {@code PObject} implementation.
+ *
+ * @param collect The backing {@code PCollection} for this {@code PObject}.
+ */
+ public PObjectImpl(PCollection<S> collect) {
+ this.collection = collect;
+ this.cachedValue = null;
+ this.isCached = false;
+ }
+
+ /** {@inheritDoc} */
+ @Override
+ public String toString() {
+ return collection.toString();
+ }
+
+ /** {@inheritDoc} */
+ @Override
+ public final T getValue() {
+ if (!isCached) {
+ cachedValue = process(collection.materialize());
+ isCached = true;
+ }
+ return cachedValue;
+ }
+
+ /**
+ * Transforms the provided Iterable, obtained from the backing {@link PCollection},
+ * into the value encapsulated by this {@code PObject}.
+ *
+ * @param input An Iterable whose elements correspond to those of the backing {@code
+ * PCollection}.
+ * @return The value of this {@code PObject}.
+ */
+ protected abstract T process(Iterable<S> input);
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/package-info.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/package-info.java b/crunch-core/src/main/java/org/apache/crunch/package-info.java
new file mode 100644
index 0000000..38f11bc
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/package-info.java
@@ -0,0 +1,25 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Client-facing API and core abstractions.
+ *
+ * @see <a href="http://crunch.apache.org/intro.html">Introduction to
+ * Apache Crunch</a>
+ */
+package org.apache.crunch;
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/CollectionDeepCopier.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/CollectionDeepCopier.java b/crunch-core/src/main/java/org/apache/crunch/types/CollectionDeepCopier.java
new file mode 100644
index 0000000..151ab82
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/CollectionDeepCopier.java
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types;
+
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Performs deep copies (based on underlying PType deep copying) of Collections.
+ *
+ * @param <T> The type of Tuple implementation being copied
+ */
+public class CollectionDeepCopier<T> implements DeepCopier<Collection<T>> {
+
+ private PType<T> elementType;
+
+ public CollectionDeepCopier(PType<T> elementType) {
+ this.elementType = elementType;
+ }
+
+ @Override
+ public void initialize(Configuration conf) {
+ this.elementType.initialize(conf);
+ }
+
+ @Override
+ public Collection<T> deepCopy(Collection<T> source) {
+ if (source == null) {
+ return null;
+ }
+ List<T> copiedCollection = Lists.newArrayListWithCapacity(source.size());
+ for (T value : source) {
+ copiedCollection.add(elementType.getDetachedValue(value));
+ }
+ return copiedCollection;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/Converter.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/Converter.java b/crunch-core/src/main/java/org/apache/crunch/types/Converter.java
new file mode 100644
index 0000000..a0dbb16
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/Converter.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types;
+
+import java.io.Serializable;
+
+import org.apache.crunch.DoFn;
+
+/**
+ * Converts the input key/value from a MapReduce task into the input to a
+ * {@link DoFn}, or takes the output of a {@code DoFn} and writes it to the
+ * output key/values.
+ */
+public interface Converter<K, V, S, T> extends Serializable {
+ S convertInput(K key, V value);
+
+ T convertIterableInput(K key, Iterable<V> value);
+
+ K outputKey(S value);
+
+ V outputValue(S value);
+
+ Class<K> getKeyClass();
+
+ Class<V> getValueClass();
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/DeepCopier.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/DeepCopier.java b/crunch-core/src/main/java/org/apache/crunch/types/DeepCopier.java
new file mode 100644
index 0000000..f146e86
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/DeepCopier.java
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types;
+
+import java.io.Serializable;
+
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Performs deep copies of values.
+ *
+ * @param <T> The type of value that will be copied
+ */
+public interface DeepCopier<T> extends Serializable {
+
+ /**
+ * Initialize the deep copier with a job-specific configuration
+ *
+ * @param conf Job-specific configuration
+ */
+ void initialize(Configuration conf);
+
+ /**
+ * Create a deep copy of a value.
+ *
+ * @param source The value to be copied
+ * @return The deep copy of the value
+ */
+ T deepCopy(T source);
+
+ static class NoOpDeepCopier<V> implements DeepCopier<V> {
+
+ @Override
+ public V deepCopy(V source) {
+ return source;
+ }
+
+ @Override
+ public void initialize(Configuration conf) {
+ // No initialization needed
+ }
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/MapDeepCopier.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/MapDeepCopier.java b/crunch-core/src/main/java/org/apache/crunch/types/MapDeepCopier.java
new file mode 100644
index 0000000..de8903b
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/MapDeepCopier.java
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types;
+
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+
+import com.google.common.collect.Maps;
+
+public class MapDeepCopier<T> implements DeepCopier<Map<String, T>> {
+
+ private final PType<T> ptype;
+
+ public MapDeepCopier(PType<T> ptype) {
+ this.ptype = ptype;
+ }
+
+ @Override
+ public void initialize(Configuration conf) {
+ this.ptype.initialize(conf);
+ }
+
+ @Override
+ public Map<String, T> deepCopy(Map<String, T> source) {
+ if (source == null) {
+ return null;
+ }
+
+ Map<String, T> deepCopyMap = Maps.newHashMap();
+ for (Entry<String, T> entry : source.entrySet()) {
+ deepCopyMap.put(entry.getKey(), ptype.getDetachedValue(entry.getValue()));
+ }
+ return deepCopyMap;
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/PGroupedTableType.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/PGroupedTableType.java b/crunch-core/src/main/java/org/apache/crunch/types/PGroupedTableType.java
new file mode 100644
index 0000000..d276cd6
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/PGroupedTableType.java
@@ -0,0 +1,141 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types;
+
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.crunch.GroupingOptions;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.PGroupedTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.io.ReadableSourceTarget;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.TaskInputOutputContext;
+
+import com.google.common.collect.Iterables;
+
+/**
+ * The {@code PType} instance for {@link PGroupedTable} instances. Its settings
+ * are derived from the {@code PTableType} that was grouped to create the
+ * {@code PGroupedTable} instance.
+ *
+ */
+public abstract class PGroupedTableType<K, V> implements PType<Pair<K, Iterable<V>>> {
+
+ protected static class PTypeIterable<V> implements Iterable<V> {
+ private final Iterable<Object> iterable;
+ private final MapFn<Object, V> mapFn;
+
+ public PTypeIterable(MapFn<Object, V> mapFn, Iterable<Object> iterable) {
+ this.mapFn = mapFn;
+ this.iterable = iterable;
+ }
+
+ public Iterator<V> iterator() {
+ return new Iterator<V>() {
+ Iterator<Object> iter = iterable.iterator();
+
+ public boolean hasNext() {
+ return iter.hasNext();
+ }
+
+ public V next() {
+ return mapFn.map(iter.next());
+ }
+
+ public void remove() {
+ iter.remove();
+ }
+ };
+ }
+
+ @Override
+ public String toString() {
+ return Iterables.toString(this);
+ }
+ }
+
+ public static class PairIterableMapFn<K, V> extends MapFn<Pair<Object, Iterable<Object>>, Pair<K, Iterable<V>>> {
+ private final MapFn<Object, K> keys; // converts the raw key of each grouped pair
+ private final MapFn<Object, V> values; // converts each raw value of a grouped pair
+
+ public PairIterableMapFn(MapFn<Object, K> keys, MapFn<Object, V> values) {
+ this.keys = keys;
+ this.values = values;
+ }
+
+ @Override
+ public void configure(Configuration conf) {
+ keys.configure(conf);
+ values.configure(conf);
+ }
+
+ public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
+ keys.setContext(context);
+ values.setContext(context);
+ }
+
+ @Override
+ public void initialize() {
+ keys.initialize();
+ values.initialize();
+ }
+
+ @Override
+ public Pair<K, Iterable<V>> map(Pair<Object, Iterable<Object>> input) { // values are wrapped, not copied
+ return Pair.<K, Iterable<V>> of(keys.map(input.first()), new PTypeIterable<V>(values, input.second()));
+ }
+ }
+
+ protected final PTableType<K, V> tableType;
+
+ public PGroupedTableType(PTableType<K, V> tableType) {
+ this.tableType = tableType;
+ }
+
+ public PTableType<K, V> getTableType() {
+ return tableType;
+ }
+
+ @Override
+ public PTypeFamily getFamily() {
+ return tableType.getFamily();
+ }
+
+ @Override
+ public List<PType> getSubTypes() {
+ return tableType.getSubTypes();
+ }
+
+ @Override
+ public Converter getConverter() {
+ return tableType.getConverter();
+ }
+
+ public abstract Converter getGroupingConverter();
+
+ public abstract void configureShuffle(Job job, GroupingOptions options);
+
+ @Override
+ public ReadableSourceTarget<Pair<K, Iterable<V>>> getDefaultFileSource(Path path) {
+ throw new UnsupportedOperationException("Grouped tables cannot be written out directly");
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/PTableType.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/PTableType.java b/crunch-core/src/main/java/org/apache/crunch/types/PTableType.java
new file mode 100644
index 0000000..3d06f8b
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/PTableType.java
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types;
+
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+
+/**
+ * An extension of {@code PType} specifically for {@link PTable} objects. It
+ * allows separate access to the {@code PType}s of the key and value for the
+ * {@code PTable}.
+ *
+ */
+public interface PTableType<K, V> extends PType<Pair<K, V>> {
+ /**
+ * Returns the key type for the table.
+ */
+ PType<K> getKeyType();
+
+ /**
+ * Returns the value type for the table.
+ */
+ PType<V> getValueType();
+
+ /**
+ * Returns the grouped table version of this type.
+ */
+ PGroupedTableType<K, V> getGroupedTableType();
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/PType.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/PType.java b/crunch-core/src/main/java/org/apache/crunch/types/PType.java
new file mode 100644
index 0000000..ebddf84
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/PType.java
@@ -0,0 +1,86 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types;
+
+import java.io.Serializable;
+import java.util.List;
+
+import org.apache.crunch.DoFn;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.io.ReadableSourceTarget;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * A {@code PType} defines a mapping between a data type that is used in a Crunch pipeline and a
+ * serialization and storage format that is used to read/write data from/to HDFS. Every
+ * {@link PCollection} has an associated {@code PType} that tells Crunch how to read/write data from
+ * that {@code PCollection}.
+ *
+ */
+public interface PType<T> extends Serializable {
+ /**
+ * Returns the Java type represented by this {@code PType}.
+ */
+ Class<T> getTypeClass();
+
+ /**
+ * Returns the {@code PTypeFamily} that this {@code PType} belongs to.
+ */
+ PTypeFamily getFamily();
+
+ MapFn<Object, T> getInputMapFn();
+
+ MapFn<T, Object> getOutputMapFn();
+
+ Converter getConverter();
+
+ /**
+ * Initialize this PType for use within a DoFn. This generally only needs to be called when using
+ * a PType for {@link #getDetachedValue(Object)}.
+ *
+ * @param conf Configuration object
+ * @see PType#getDetachedValue(Object)
+ */
+ void initialize(Configuration conf);
+
+ /**
+ * Returns a copy of a value (or the value itself) that can safely be retained.
+ * <p>
+ * This is useful when iterable values being processed in a DoFn (via a reducer) need to be held
+ * on to for more than the scope of a single iteration, as a reducer (and therefore also a DoFn
+ * that has an Iterable as input) re-uses deserialized values. More information on object reuse is
+ * available in the {@link DoFn} class documentation.
+ *
+ * @param value The value to be deep-copied
+ * @return A deep copy of the input value
+ */
+ T getDetachedValue(T value);
+
+ /**
+ * Returns a {@code SourceTarget} that is able to read/write data using the serialization format
+ * specified by this {@code PType}.
+ */
+ ReadableSourceTarget<T> getDefaultFileSource(Path path);
+
+ /**
+ * Returns the sub-types that make up this PType if it is a composite instance, such as a tuple.
+ */
+ List<PType> getSubTypes();
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/PTypeFamily.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/PTypeFamily.java b/crunch-core/src/main/java/org/apache/crunch/types/PTypeFamily.java
new file mode 100644
index 0000000..9458f14
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/PTypeFamily.java
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types;
+
+import java.nio.ByteBuffer;
+import java.util.Collection;
+import java.util.Map;
+
+import org.apache.crunch.MapFn;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Tuple;
+import org.apache.crunch.Tuple3;
+import org.apache.crunch.Tuple4;
+import org.apache.crunch.TupleN;
+
+/**
+ * An abstract factory for creating {@code PType} instances that have the same
+ * serialization/storage backing format.
+ *
+ */
+public interface PTypeFamily {
+ PType<Void> nulls();
+
+ PType<String> strings();
+
+ PType<Long> longs();
+
+ PType<Integer> ints();
+
+ PType<Float> floats();
+
+ PType<Double> doubles();
+
+ PType<Boolean> booleans();
+
+ PType<ByteBuffer> bytes();
+
+ <T> PType<T> records(Class<T> clazz);
+
+ <T> PType<Collection<T>> collections(PType<T> ptype);
+
+ <T> PType<Map<String, T>> maps(PType<T> ptype);
+
+ <V1, V2> PType<Pair<V1, V2>> pairs(PType<V1> p1, PType<V2> p2);
+
+ <V1, V2, V3> PType<Tuple3<V1, V2, V3>> triples(PType<V1> p1, PType<V2> p2, PType<V3> p3);
+
+ <V1, V2, V3, V4> PType<Tuple4<V1, V2, V3, V4>> quads(PType<V1> p1, PType<V2> p2, PType<V3> p3, PType<V4> p4);
+
+ PType<TupleN> tuples(PType<?>... ptypes);
+
+ <T extends Tuple> PType<T> tuples(Class<T> clazz, PType<?>... ptypes);
+
+ <S, T> PType<T> derived(Class<T> clazz, MapFn<S, T> inputFn, MapFn<T, S> outputFn, PType<S> base);
+
+ <K, V> PTableType<K, V> tableOf(PType<K> key, PType<V> value);
+
+ /**
+ * Returns the equivalent of the given ptype for this family, if it exists.
+ */
+ <T> PType<T> as(PType<T> ptype);
+}
[29/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileTarget.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileTarget.java b/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileTarget.java
new file mode 100644
index 0000000..0c3e6a4
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileTarget.java
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.text;
+
+import org.apache.avro.Schema;
+import org.apache.crunch.SourceTarget;
+import org.apache.crunch.io.FileNamingScheme;
+import org.apache.crunch.io.SequentialFileNamingScheme;
+import org.apache.crunch.io.impl.FileTargetImpl;
+import org.apache.crunch.types.Converter;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.avro.AvroTextOutputFormat;
+import org.apache.crunch.types.avro.AvroType;
+import org.apache.crunch.types.avro.AvroTypeFamily;
+import org.apache.crunch.types.writable.WritableType;
+import org.apache.crunch.types.writable.WritableTypeFamily;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+
+public class TextFileTarget extends FileTargetImpl {
+ private static Class<? extends FileOutputFormat> getOutputFormat(PType<?> ptype) {
+ if (ptype.getFamily().equals(AvroTypeFamily.getInstance())) {
+ return AvroTextOutputFormat.class;
+ } else {
+ return TextOutputFormat.class;
+ }
+ }
+
+ public <T> TextFileTarget(String path) {
+ this(new Path(path));
+ }
+
+ public <T> TextFileTarget(Path path) {
+ this(path, new SequentialFileNamingScheme());
+ }
+
+ public <T> TextFileTarget(Path path, FileNamingScheme fileNamingScheme) {
+ super(path, null, fileNamingScheme);
+ }
+
+ @Override
+ public Path getPath() {
+ return path;
+ }
+
+ @Override
+ public String toString() {
+ return "Text(" + path + ")";
+ }
+
+ @Override
+ public void configureForMapReduce(Job job, PType<?> ptype, Path outputPath, String name) {
+ Converter converter = ptype.getConverter();
+ Class keyClass = converter.getKeyClass();
+ Class valueClass = converter.getValueClass();
+ configureForMapReduce(job, keyClass, valueClass, getOutputFormat(ptype), outputPath, name);
+ }
+
+ @Override
+ public <T> SourceTarget<T> asSourceTarget(PType<T> ptype) {
+ if (!isTextCompatible(ptype)) {
+ return null;
+ }
+ if (ptype instanceof PTableType) {
+ return new TextFileTableSourceTarget(path, (PTableType) ptype);
+ }
+ return new TextFileSourceTarget<T>(path, ptype);
+ }
+
+ private <T> boolean isTextCompatible(PType<T> ptype) {
+ if (AvroTypeFamily.getInstance().equals(ptype.getFamily())) {
+ AvroType<T> at = (AvroType<T>) ptype;
+ if (at.getSchema().equals(Schema.create(Schema.Type.STRING))) {
+ return true;
+ }
+ } else if (WritableTypeFamily.getInstance().equals(ptype.getFamily())) {
+ if (ptype instanceof PTableType) {
+ PTableType ptt = (PTableType) ptype;
+ return isText(ptt.getKeyType()) && isText(ptt.getValueType());
+ } else {
+ return isText(ptype);
+ }
+ }
+ return false;
+ }
+
+ private <T> boolean isText(PType<T> wtype) {
+ return Text.class.equals(((WritableType) wtype).getSerializationClass());
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/lib/Aggregate.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/lib/Aggregate.java b/crunch-core/src/main/java/org/apache/crunch/lib/Aggregate.java
new file mode 100644
index 0000000..d4109cc
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/lib/Aggregate.java
@@ -0,0 +1,272 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.PriorityQueue;
+
+import org.apache.crunch.CombineFn;
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.GroupingOptions;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PObject;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.fn.Aggregators;
+import org.apache.crunch.fn.MapValuesFn;
+import org.apache.crunch.materialize.pobject.FirstElementPObject;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.PTypeFamily;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Methods for performing various types of aggregations over {@link PCollection} instances.
+ *
+ */
+public class Aggregate {
+
+ /**
+ * Returns a {@code PTable} that contains the unique elements of this collection mapped to a count
+ * of their occurrences.
+ */
+ public static <S> PTable<S, Long> count(PCollection<S> collect) {
+ PTypeFamily tf = collect.getTypeFamily();
+ return collect.parallelDo("Aggregate.count", new MapFn<S, Pair<S, Long>>() {
+ public Pair<S, Long> map(S input) {
+ return Pair.of(input, 1L);
+ }
+ }, tf.tableOf(collect.getPType(), tf.longs())).groupByKey()
+ .combineValues(Aggregators.SUM_LONGS());
+ }
+
+ /**
+ * Returns the number of elements in the provided PCollection.
+ *
+ * @param collect The PCollection whose elements should be counted.
+ * @param <S> The type of the PCollection.
+ * @return A {@code PObject} containing the number of elements in the {@code PCollection}.
+ */
+ public static <S> PObject<Long> length(PCollection<S> collect) {
+ PTypeFamily tf = collect.getTypeFamily();
+ PTable<Integer, Long> countTable = collect
+ .parallelDo("Aggregate.count", new MapFn<S, Pair<Integer, Long>>() {
+ public Pair<Integer, Long> map(S input) {
+ return Pair.of(1, 1L);
+ }
+ }, tf.tableOf(tf.ints(), tf.longs()))
+ .groupByKey(GroupingOptions.builder().numReducers(1).build())
+ .combineValues(Aggregators.SUM_LONGS());
+ PCollection<Long> count = countTable.values();
+ return new FirstElementPObject<Long>(count);
+ }
+
+ public static class PairValueComparator<K, V> implements Comparator<Pair<K, V>> {
+ private final boolean ascending;
+
+ public PairValueComparator(boolean ascending) {
+ this.ascending = ascending;
+ }
+
+ @Override
+ public int compare(Pair<K, V> left, Pair<K, V> right) {
+ int cmp = ((Comparable<V>) left.second()).compareTo(right.second());
+ return ascending ? cmp : -cmp;
+ }
+ }
+
+ public static class TopKFn<K, V> extends DoFn<Pair<K, V>, Pair<Integer, Pair<K, V>>> {
+
+ private final int limit;
+ private final boolean maximize;
+ private transient PriorityQueue<Pair<K, V>> values; // transient: created in initialize()
+
+ public TopKFn(int limit, boolean maximize) {
+ this.limit = limit;
+ this.maximize = maximize;
+ }
+
+ public void initialize() {
+ this.values = new PriorityQueue<Pair<K, V>>(limit, new PairValueComparator<K, V>(maximize));
+ }
+
+ public void process(Pair<K, V> input, Emitter<Pair<Integer, Pair<K, V>>> emitter) {
+ values.add(input);
+ if (values.size() > limit) { // hold at most 'limit' pairs: drop the head of the queue
+ values.poll();
+ }
+ }
+
+ public void cleanup(Emitter<Pair<Integer, Pair<K, V>>> emitter) {
+ for (Pair<K, V> p : values) {
+ emitter.emit(Pair.of(0, p)); // every retained pair is emitted under the same key, 0
+ }
+ }
+ }
+
+ public static class TopKCombineFn<K, V> extends CombineFn<Integer, Pair<K, V>> {
+
+ private final int limit;
+ private final boolean maximize;
+
+ public TopKCombineFn(int limit, boolean maximize) {
+ this.limit = limit;
+ this.maximize = maximize;
+ }
+
+ @Override
+ public void process(Pair<Integer, Iterable<Pair<K, V>>> input,
+ Emitter<Pair<Integer, Pair<K, V>>> emitter) {
+ Comparator<Pair<K, V>> cmp = new PairValueComparator<K, V>(maximize);
+ PriorityQueue<Pair<K, V>> queue = new PriorityQueue<Pair<K, V>>(limit, cmp);
+ for (Pair<K, V> pair : input.second()) {
+ queue.add(pair);
+ if (queue.size() > limit) {
+ queue.poll();
+ }
+ }
+
+ List<Pair<K, V>> values = Lists.newArrayList(queue);
+ Collections.sort(values, cmp);
+ for (int i = values.size() - 1; i >= 0; i--) {
+ emitter.emit(Pair.of(0, values.get(i)));
+ }
+ }
+ }
+
+ public static <K, V> PTable<K, V> top(PTable<K, V> ptable, int limit, boolean maximize) {
+ PTypeFamily ptf = ptable.getTypeFamily();
+ PTableType<K, V> base = ptable.getPTableType();
+ PType<Pair<K, V>> pairType = ptf.pairs(base.getKeyType(), base.getValueType());
+ PTableType<Integer, Pair<K, V>> inter = ptf.tableOf(ptf.ints(), pairType);
+ return ptable.parallelDo("top" + limit + "map", new TopKFn<K, V>(limit, maximize), inter)
+ .groupByKey(1).combineValues(new TopKCombineFn<K, V>(limit, maximize))
+ .parallelDo("top" + limit + "reduce", new DoFn<Pair<Integer, Pair<K, V>>, Pair<K, V>>() {
+ public void process(Pair<Integer, Pair<K, V>> input, Emitter<Pair<K, V>> emitter) {
+ emitter.emit(input.second());
+ }
+ }, base);
+ }
+
+ /**
+ * Returns the largest element of the input collection, as ordered by {@code Comparable#compareTo}.
+ */
+ public static <S> PObject<S> max(PCollection<S> collect) {
+ Class<S> clazz = collect.getPType().getTypeClass();
+ if (!clazz.isPrimitive() && !Comparable.class.isAssignableFrom(clazz)) {
+ throw new IllegalArgumentException("Can only get max for Comparable elements, not for: "
+ + collect.getPType().getTypeClass());
+ }
+ PTypeFamily tf = collect.getTypeFamily();
+ PCollection<S> maxCollect = PTables.values(collect
+ .parallelDo("max", new DoFn<S, Pair<Boolean, S>>() {
+ private transient S max = null;
+
+ public void process(S input, Emitter<Pair<Boolean, S>> emitter) {
+ if (max == null || ((Comparable<S>) max).compareTo(input) < 0) {
+ max = input;
+ }
+ }
+
+ public void cleanup(Emitter<Pair<Boolean, S>> emitter) {
+ if (max != null) {
+ emitter.emit(Pair.of(true, max));
+ }
+ }
+ }, tf.tableOf(tf.booleans(), collect.getPType())).groupByKey(1)
+ .combineValues(new CombineFn<Boolean, S>() {
+ public void process(Pair<Boolean, Iterable<S>> input, Emitter<Pair<Boolean, S>> emitter) {
+ S max = null;
+ for (S v : input.second()) {
+ if (max == null || ((Comparable<S>) max).compareTo(v) < 0) {
+ max = v;
+ }
+ }
+ emitter.emit(Pair.of(input.first(), max));
+ }
+ }));
+ return new FirstElementPObject<S>(maxCollect);
+ }
+
+ /**
+ * Returns the smallest element of the input collection, as ordered by {@code Comparable#compareTo}.
+ */
+ public static <S> PObject<S> min(PCollection<S> collect) {
+ Class<S> clazz = collect.getPType().getTypeClass();
+ if (!clazz.isPrimitive() && !Comparable.class.isAssignableFrom(clazz)) {
+ throw new IllegalArgumentException("Can only get min for Comparable elements, not for: "
+ + collect.getPType().getTypeClass());
+ }
+ PTypeFamily tf = collect.getTypeFamily();
+ PCollection<S> minCollect = PTables.values(collect
+ .parallelDo("min", new DoFn<S, Pair<Boolean, S>>() {
+ private transient S min = null;
+
+ public void process(S input, Emitter<Pair<Boolean, S>> emitter) {
+ if (min == null || ((Comparable<S>) min).compareTo(input) > 0) {
+ min = input;
+ }
+ }
+
+ public void cleanup(Emitter<Pair<Boolean, S>> emitter) {
+ if (min != null) {
+ emitter.emit(Pair.of(false, min));
+ }
+ }
+ }, tf.tableOf(tf.booleans(), collect.getPType())).groupByKey(1)
+ .combineValues(new CombineFn<Boolean, S>() {
+ public void process(Pair<Boolean, Iterable<S>> input, Emitter<Pair<Boolean, S>> emitter) {
+ S min = null;
+ for (S v : input.second()) {
+ if (min == null || ((Comparable<S>) min).compareTo(v) > 0) {
+ min = v;
+ }
+ }
+ emitter.emit(Pair.of(input.first(), min));
+ }
+ }));
+ return new FirstElementPObject<S>(minCollect);
+ }
+
+ public static <K, V> PTable<K, Collection<V>> collectValues(PTable<K, V> collect) {
+ PTypeFamily tf = collect.getTypeFamily();
+ final PType<V> valueType = collect.getValueType();
+ return collect.groupByKey().parallelDo("collect",
+ new MapValuesFn<K, Iterable<V>, Collection<V>>() {
+
+ @Override
+ public void initialize() {
+ valueType.initialize(getConfiguration());
+ }
+
+ public Collection<V> map(Iterable<V> values) {
+ List<V> collected = Lists.newArrayList();
+ for (V value : values) {
+ collected.add(valueType.getDetachedValue(value));
+ }
+ return collected;
+ }
+ }, tf.tableOf(collect.getKeyType(), tf.collections(collect.getValueType())));
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/lib/Cartesian.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/lib/Cartesian.java b/crunch-core/src/main/java/org/apache/crunch/lib/Cartesian.java
new file mode 100644
index 0000000..08327dd
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/lib/Cartesian.java
@@ -0,0 +1,216 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib;
+
+import java.util.Random;
+
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PTypeFamily;
+
+/**
+ * Utilities for Cartesian products of two {@code PTable} or {@code PCollection}
+ * instances.
+ */
+@SuppressWarnings("serial")
+public class Cartesian {
+
+ /**
+ * Helper for building the artificial cross keys. This technique was taken
+ * from Pig's CROSS.
+ */
+ private static class GFCross<V> extends DoFn<V, Pair<Pair<Integer, Integer>, V>> {
+
+ private final int constantField;
+ private final int parallelism;
+ private final Random r;
+
+ public GFCross(int constantField, int parallelism) {
+ this.constantField = constantField;
+ this.parallelism = parallelism;
+ this.r = new Random();
+ }
+
+ public void process(V input, Emitter<Pair<Pair<Integer, Integer>, V>> emitter) {
+ int c = r.nextInt(parallelism);
+ if (constantField == 0) {
+ for (int i = 0; i < parallelism; i++) {
+ emitter.emit(Pair.of(Pair.of(c, i), input));
+ }
+ } else {
+ for (int i = 0; i < parallelism; i++) {
+ emitter.emit(Pair.of(Pair.of(i, c), input));
+ }
+ }
+ }
+ }
+
+ static final int DEFAULT_PARALLELISM = 6;
+
+ /**
+ * Performs a full cross join on the specified {@link PTable}s (using the same
+ * strategy as Pig's CROSS operator).
+ *
+ * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Cross_join">Cross
+ * Join</a>
+ * @param left
+ * A PTable to perform a cross join on.
+ * @param right
+ * A PTable to perform a cross join on.
+ * @param <K1>
+ * Type of left PTable's keys.
+ * @param <K2>
+ * Type of right PTable's keys.
+ * @param <U>
+ * Type of the first {@link PTable}'s values
+ * @param <V>
+ * Type of the second {@link PTable}'s values
+ * @return The joined result as tuples of ((K1,K2), (U,V)).
+ */
+ public static <K1, K2, U, V> PTable<Pair<K1, K2>, Pair<U, V>> cross(PTable<K1, U> left, PTable<K2, V> right) {
+ return cross(left, right, DEFAULT_PARALLELISM);
+ }
+
+ /**
+ * Performs a full cross join on the specified {@link PTable}s (using the same
+ * strategy as Pig's CROSS operator).
+ *
+ * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Cross_join">Cross
+ * Join</a>
+ * @param left
+ * A PTable to perform a cross join on.
+ * @param right
+ * A PTable to perform a cross join on.
+ * @param parallelism
+ * The square root of the number of reducers to use. Increasing
+ * parallelism also increases copied data.
+ * @param <K1>
+ * Type of left PTable's keys.
+ * @param <K2>
+ * Type of right PTable's keys.
+ * @param <U>
+ * Type of the first {@link PTable}'s values
+ * @param <V>
+ * Type of the second {@link PTable}'s values
+ * @return The joined result as tuples of ((K1,K2), (U,V)).
+ */
+ public static <K1, K2, U, V> PTable<Pair<K1, K2>, Pair<U, V>> cross(PTable<K1, U> left, PTable<K2, V> right,
+ int parallelism) {
+
+ /*
+ * The strategy here is to simply emulate the following PigLatin: A =
+ * foreach table1 generate flatten(GFCross(0, 2)), flatten(*); B = foreach
+ * table2 generate flatten(GFCross(1, 2)), flatten(*); C = cogroup A by ($0,
+ * $1), B by ($0, $1); result = foreach C generate flatten(A), flatten(B);
+ */
+
+ PTypeFamily ltf = left.getTypeFamily();
+ PTypeFamily rtf = right.getTypeFamily();
+
+ PTable<Pair<Integer, Integer>, Pair<K1, U>> leftCross = left.parallelDo(new GFCross<Pair<K1, U>>(0, parallelism),
+ ltf.tableOf(ltf.pairs(ltf.ints(), ltf.ints()), ltf.pairs(left.getKeyType(), left.getValueType())));
+ PTable<Pair<Integer, Integer>, Pair<K2, V>> rightCross = right.parallelDo(new GFCross<Pair<K2, V>>(1, parallelism),
+ rtf.tableOf(rtf.pairs(rtf.ints(), rtf.ints()), rtf.pairs(right.getKeyType(), right.getValueType())));
+
+ PTable<Pair<Integer, Integer>, Pair<Pair<K1, U>, Pair<K2, V>>> cg = leftCross.join(rightCross);
+
+ PTypeFamily ctf = cg.getTypeFamily();
+
+ return cg.parallelDo(
+ new MapFn<Pair<Pair<Integer, Integer>, Pair<Pair<K1, U>, Pair<K2, V>>>, Pair<Pair<K1, K2>, Pair<U, V>>>() {
+
+ @Override
+ public Pair<Pair<K1, K2>, Pair<U, V>> map(Pair<Pair<Integer, Integer>, Pair<Pair<K1, U>, Pair<K2, V>>> input) {
+ Pair<Pair<K1, U>, Pair<K2, V>> valuePair = input.second();
+ return Pair.of(Pair.of(valuePair.first().first(), valuePair.second().first()),
+ Pair.of(valuePair.first().second(), valuePair.second().second()));
+ }
+ },
+ ctf.tableOf(ctf.pairs(left.getKeyType(), right.getKeyType()),
+ ctf.pairs(left.getValueType(), right.getValueType())));
+ }
+
+ /**
+ * Performs a full cross join on the specified {@link PCollection}s (using the
+ * same strategy as Pig's CROSS operator).
+ *
+ * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Cross_join">Cross
+ * Join</a>
+ * @param left
+ * A PCollection to perform a cross join on.
+ * @param right
+ * A PCollection to perform a cross join on.
+ * @param <U>
+ * Type of the first {@link PCollection}'s values
+ * @param <V>
+ * Type of the second {@link PCollection}'s values
+ * @return The joined result as tuples of (U,V).
+ */
+ public static <U, V> PCollection<Pair<U, V>> cross(PCollection<U> left, PCollection<V> right) {
+ return cross(left, right, DEFAULT_PARALLELISM);
+ }
+
+ /**
+ * Performs a full cross join on the specified {@link PCollection}s (using the
+ * same strategy as Pig's CROSS operator).
+ *
+ * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Cross_join">Cross
+ * Join</a>
+ * @param left
+ * A PCollection to perform a cross join on.
+ * @param right
+ * A PCollection to perform a cross join on.
+ * @param <U>
+ * Type of the first {@link PCollection}'s values
+ * @param <V>
+ * Type of the second {@link PCollection}'s values
+ * @return The joined result as tuples of (U,V).
+ */
+ public static <U, V> PCollection<Pair<U, V>> cross(PCollection<U> left, PCollection<V> right, int parallelism) {
+
+ PTypeFamily ltf = left.getTypeFamily();
+ PTypeFamily rtf = right.getTypeFamily();
+
+ PTableType<Pair<Integer, Integer>, U> ptt = ltf.tableOf(ltf.pairs(ltf.ints(), ltf.ints()), left.getPType());
+
+ if (ptt == null)
+ throw new Error();
+
+ PTable<Pair<Integer, Integer>, U> leftCross = left.parallelDo(new GFCross<U>(0, parallelism),
+ ltf.tableOf(ltf.pairs(ltf.ints(), ltf.ints()), left.getPType()));
+ PTable<Pair<Integer, Integer>, V> rightCross = right.parallelDo(new GFCross<V>(1, parallelism),
+ rtf.tableOf(rtf.pairs(rtf.ints(), rtf.ints()), right.getPType()));
+
+ PTable<Pair<Integer, Integer>, Pair<U, V>> cg = leftCross.join(rightCross);
+
+ PTypeFamily ctf = cg.getTypeFamily();
+
+ return cg.parallelDo(new MapFn<Pair<Pair<Integer, Integer>, Pair<U, V>>, Pair<U, V>>() {
+ @Override
+ public Pair<U, V> map(Pair<Pair<Integer, Integer>, Pair<U, V>> input) {
+ return input.second();
+ }
+ }, ctf.pairs(left.getPType(), right.getPType()));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/lib/Cogroup.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/lib/Cogroup.java b/crunch-core/src/main/java/org/apache/crunch/lib/Cogroup.java
new file mode 100644
index 0000000..07d873c
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/lib/Cogroup.java
@@ -0,0 +1,106 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib;
+
+import java.util.Collection;
+
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.fn.MapValuesFn;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.PTypeFamily;
+
+import com.google.common.collect.Lists;
+
+ /**
+  * Co-grouping of two {@link PTable} instances that share a key type.
+  */
+ public class Cogroup {
+
+   /**
+    * Co-groups the two {@link PTable} arguments.
+    * <p>
+    * Values of both tables are first wrapped into pairs that have only one side
+    * set, the two tagged tables are unioned and grouped by key, and every group
+    * is then split back into one collection per input table.
+    *
+    * @param left
+    *          The table whose values end up in the first collection
+    * @param right
+    *          The table whose values end up in the second collection
+    * @return a {@code PTable} representing the co-grouped tables.
+    */
+   public static <K, U, V> PTable<K, Pair<Collection<U>, Collection<V>>> cogroup(PTable<K, U> left, PTable<K, V> right) {
+     PTypeFamily family = left.getTypeFamily();
+     PType<K> keyType = left.getPTableType().getKeyType();
+     PType<U> leftValueType = left.getPTableType().getValueType();
+     PType<V> rightValueType = right.getPTableType().getValueType();
+     PType<Pair<U, V>> taggedType = family.pairs(leftValueType, rightValueType);
+
+     PTable<K, Pair<U, V>> taggedLeft = left.parallelDo("coGroupTag1", new CogroupFn1<K, U, V>(),
+         family.tableOf(keyType, taggedType));
+     PTable<K, Pair<U, V>> taggedRight = right.parallelDo("coGroupTag2", new CogroupFn2<K, U, V>(),
+         family.tableOf(keyType, taggedType));
+
+     PTable<K, Pair<U, V>> merged = taggedLeft.union(taggedRight);
+
+     PType<Pair<Collection<U>, Collection<V>>> groupedType = family.pairs(family.collections(leftValueType),
+         family.collections(rightValueType));
+     PostGroupFn<K, U, V> splitter = new PostGroupFn<K, U, V>(leftValueType, rightValueType);
+     return merged.groupByKey().parallelDo("cogroup", splitter, family.tableOf(keyType, groupedType));
+   }
+
+   /** Tags a value of the first table by placing it on the first side of a pair. */
+   private static class CogroupFn1<K, V, U> extends MapValuesFn<K, V, Pair<V, U>> {
+     @Override
+     public Pair<V, U> map(V v) {
+       return Pair.<V, U>of(v, null);
+     }
+   }
+
+   /** Tags a value of the second table by placing it on the second side of a pair. */
+   private static class CogroupFn2<K, V, U> extends MapValuesFn<K, U, Pair<V, U>> {
+     @Override
+     public Pair<V, U> map(U u) {
+       return Pair.<V, U>of(null, u);
+     }
+   }
+
+   /**
+    * Splits a group of tagged pairs into one collection per side, storing a
+    * detached copy of every non-null element.
+    */
+   private static class PostGroupFn<K, V, U> extends
+       DoFn<Pair<K, Iterable<Pair<V, U>>>, Pair<K, Pair<Collection<V>, Collection<U>>>> {
+
+     private PType<V> ptypeV;
+     private PType<U> ptypeU;
+
+     public PostGroupFn(PType<V> ptypeV, PType<U> ptypeU) {
+       this.ptypeV = ptypeV;
+       this.ptypeU = ptypeU;
+     }
+
+     // Both value types are initialized with the job configuration before
+     // getDetachedValue is called on them in process().
+     @Override
+     public void initialize() {
+       super.initialize();
+       ptypeV.initialize(getConfiguration());
+       ptypeU.initialize(getConfiguration());
+     }
+
+     @Override
+     public void process(Pair<K, Iterable<Pair<V, U>>> input,
+         Emitter<Pair<K, Pair<Collection<V>, Collection<U>>>> emitter) {
+       Collection<V> firsts = Lists.newArrayList();
+       Collection<U> seconds = Lists.newArrayList();
+       for (Pair<V, U> tagged : input.second()) {
+         V fromFirst = tagged.first();
+         if (fromFirst != null) {
+           // A set first side wins; the second side is not inspected.
+           firsts.add(ptypeV.getDetachedValue(fromFirst));
+           continue;
+         }
+         U fromSecond = tagged.second();
+         if (fromSecond != null) {
+           seconds.add(ptypeU.getDetachedValue(fromSecond));
+         }
+       }
+       emitter.emit(Pair.of(input.first(), Pair.of(firsts, seconds)));
+     }
+   }
+
+ }
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/lib/Distinct.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/lib/Distinct.java b/crunch-core/src/main/java/org/apache/crunch/lib/Distinct.java
new file mode 100644
index 0000000..994830d
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/lib/Distinct.java
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib;
+
+import java.util.Set;
+
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.PTypeFamily;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Sets;
+
+/**
+ * Functions for computing the distinct elements of a {@code PCollection}.
+ */
+public final class Distinct {
+
+ private static final int DEFAULT_FLUSH_EVERY = 50000;
+
+ /**
+ * Construct a new {@code PCollection} that contains the unique elements of a
+ * given input {@code PCollection}.
+ *
+ * @param input The input {@code PCollection}
+ * @return A new {@code PCollection} that contains the unique elements of the input
+ */
+ public static <S> PCollection<S> distinct(PCollection<S> input) {
+ return distinct(input, DEFAULT_FLUSH_EVERY);
+ }
+
+ /**
+ * A {@code PTable<K, V>} analogue of the {@code distinct} function.
+ */
+ public static <K, V> PTable<K, V> distinct(PTable<K, V> input) {
+ return PTables.asPTable(distinct((PCollection<Pair<K, V>>) input));
+ }
+
+ /**
+ * A {@code distinct} operation that gives the client more control over how frequently
+ * elements are flushed to disk in order to allow control over performance or
+ * memory consumption.
+ *
+ * @param input The input {@code PCollection}
+ * @param flushEvery Flush the elements to disk whenever we encounter this many unique values
+ * @return A new {@code PCollection} that contains the unique elements of the input
+ */
+ public static <S> PCollection<S> distinct(PCollection<S> input, int flushEvery) {
+ Preconditions.checkArgument(flushEvery > 0);
+ PType<S> pt = input.getPType();
+ PTypeFamily ptf = pt.getFamily();
+ return input
+ .parallelDo("pre-distinct", new PreDistinctFn<S>(flushEvery, pt), ptf.tableOf(pt, ptf.nulls()))
+ .groupByKey()
+ .parallelDo("post-distinct", new PostDistinctFn<S>(), pt);
+ }
+
+ /**
+ * A {@code PTable<K, V>} analogue of the {@code distinct} function.
+ */
+ public static <K, V> PTable<K, V> distinct(PTable<K, V> input, int flushEvery) {
+ return PTables.asPTable(distinct((PCollection<Pair<K, V>>) input, flushEvery));
+ }
+
+ private static class PreDistinctFn<S> extends DoFn<S, Pair<S, Void>> {
+ private final Set<S> values = Sets.newHashSet();
+ private final int flushEvery;
+ private final PType<S> ptype;
+
+ public PreDistinctFn(int flushEvery, PType<S> ptype) {
+ this.flushEvery = flushEvery;
+ this.ptype = ptype;
+ }
+
+ @Override
+ public void initialize() {
+ super.initialize();
+ ptype.initialize(getConfiguration());
+ }
+
+ @Override
+ public void process(S input, Emitter<Pair<S, Void>> emitter) {
+ values.add(ptype.getDetachedValue(input));
+ if (values.size() > flushEvery) {
+ cleanup(emitter);
+ }
+ }
+
+ @Override
+ public void cleanup(Emitter<Pair<S, Void>> emitter) {
+ for (S in : values) {
+ emitter.emit(Pair.<S, Void>of(in, null));
+ }
+ values.clear();
+ }
+ }
+
+ private static class PostDistinctFn<S> extends DoFn<Pair<S, Iterable<Void>>, S> {
+ @Override
+ public void process(Pair<S, Iterable<Void>> input, Emitter<S> emitter) {
+ emitter.emit(input.first());
+ }
+ }
+
+ // No instantiation
+ private Distinct() {}
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/lib/Join.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/lib/Join.java b/crunch-core/src/main/java/org/apache/crunch/lib/Join.java
new file mode 100644
index 0000000..c0c4a6b
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/lib/Join.java
@@ -0,0 +1,181 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib;
+
+import org.apache.crunch.GroupingOptions;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.PGroupedTable;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.lib.join.FullOuterJoinFn;
+import org.apache.crunch.lib.join.InnerJoinFn;
+import org.apache.crunch.lib.join.JoinFn;
+import org.apache.crunch.lib.join.JoinUtils;
+import org.apache.crunch.lib.join.LeftOuterJoinFn;
+import org.apache.crunch.lib.join.RightOuterJoinFn;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PTypeFamily;
+
+/**
+ * Utilities for joining multiple {@code PTable} instances based on a common
+ * lastKey.
+ */
+public class Join {
+ /**
+ * Performs an inner join on the specified {@link PTable}s.
+ *
+ * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Inner_join">Inner
+ * Join</a>
+ * @param left
+ * A PTable to perform an inner join on.
+ * @param right
+ * A PTable to perform an inner join on.
+ * @param <K>
+ * Type of the keys.
+ * @param <U>
+ * Type of the first {@link PTable}'s values
+ * @param <V>
+ * Type of the second {@link PTable}'s values
+ * @return The joined result.
+ */
+ public static <K, U, V> PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right) {
+ return innerJoin(left, right);
+ }
+
+ /**
+ * Performs an inner join on the specified {@link PTable}s.
+ *
+ * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Inner_join">Inner
+ * Join</a>
+ * @param left
+ * A PTable to perform an inner join on.
+ * @param right
+ * A PTable to perform an inner join on.
+ * @param <K>
+ * Type of the keys.
+ * @param <U>
+ * Type of the first {@link PTable}'s values
+ * @param <V>
+ * Type of the second {@link PTable}'s values
+ * @return The joined result.
+ */
+ public static <K, U, V> PTable<K, Pair<U, V>> innerJoin(PTable<K, U> left, PTable<K, V> right) {
+ return join(left, right, new InnerJoinFn<K, U, V>(left.getKeyType(), left.getValueType()));
+ }
+
+ /**
+ * Performs a left outer join on the specified {@link PTable}s.
+ *
+ * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Left_outer_join">Left
+ * Join</a>
+ * @param left
+ * A PTable to perform an left join on. All of this PTable's entries
+ * will appear in the resulting PTable.
+ * @param right
+ * A PTable to perform an left join on.
+ * @param <K>
+ * Type of the keys.
+ * @param <U>
+ * Type of the first {@link PTable}'s values
+ * @param <V>
+ * Type of the second {@link PTable}'s values
+ * @return The joined result.
+ */
+ public static <K, U, V> PTable<K, Pair<U, V>> leftJoin(PTable<K, U> left, PTable<K, V> right) {
+ return join(left, right, new LeftOuterJoinFn<K, U, V>(left.getKeyType(), left.getValueType()));
+ }
+
+ /**
+ * Performs a right outer join on the specified {@link PTable}s.
+ *
+ * @see <a
+ * href="http://en.wikipedia.org/wiki/Join_(SQL)#Right_outer_join">Right
+ * Join</a>
+ * @param left
+ * A PTable to perform an right join on.
+ * @param right
+ * A PTable to perform an right join on. All of this PTable's entries
+ * will appear in the resulting PTable.
+ * @param <K>
+ * Type of the keys.
+ * @param <U>
+ * Type of the first {@link PTable}'s values
+ * @param <V>
+ * Type of the second {@link PTable}'s values
+ * @return The joined result.
+ */
+ public static <K, U, V> PTable<K, Pair<U, V>> rightJoin(PTable<K, U> left, PTable<K, V> right) {
+ return join(left, right, new RightOuterJoinFn<K, U, V>(left.getKeyType(), left.getValueType()));
+ }
+
+ /**
+ * Performs a full outer join on the specified {@link PTable}s.
+ *
+ * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Full_outer_join">Full
+ * Join</a>
+ * @param left
+ * A PTable to perform an full join on.
+ * @param right
+ * A PTable to perform an full join on.
+ * @param <K>
+ * Type of the keys.
+ * @param <U>
+ * Type of the first {@link PTable}'s values
+ * @param <V>
+ * Type of the second {@link PTable}'s values
+ * @return The joined result.
+ */
+ public static <K, U, V> PTable<K, Pair<U, V>> fullJoin(PTable<K, U> left, PTable<K, V> right) {
+ return join(left, right, new FullOuterJoinFn<K, U, V>(left.getKeyType(), left.getValueType()));
+ }
+
+ public static <K, U, V> PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right, JoinFn<K, U, V> joinFn) {
+ PTypeFamily ptf = left.getTypeFamily();
+ PGroupedTable<Pair<K, Integer>, Pair<U, V>> grouped = preJoin(left, right);
+ PTableType<K, Pair<U, V>> ret = ptf
+ .tableOf(left.getKeyType(), ptf.pairs(left.getValueType(), right.getValueType()));
+
+ return grouped.parallelDo(joinFn.getJoinType() + grouped.getName(), joinFn, ret);
+ }
+
+ private static <K, U, V> PGroupedTable<Pair<K, Integer>, Pair<U, V>> preJoin(PTable<K, U> left, PTable<K, V> right) {
+ PTypeFamily ptf = left.getTypeFamily();
+ PTableType<Pair<K, Integer>, Pair<U, V>> ptt = ptf.tableOf(ptf.pairs(left.getKeyType(), ptf.ints()),
+ ptf.pairs(left.getValueType(), right.getValueType()));
+
+ PTable<Pair<K, Integer>, Pair<U, V>> tag1 = left.parallelDo("joinTagLeft",
+ new MapFn<Pair<K, U>, Pair<Pair<K, Integer>, Pair<U, V>>>() {
+ @Override
+ public Pair<Pair<K, Integer>, Pair<U, V>> map(Pair<K, U> input) {
+ return Pair.of(Pair.of(input.first(), 0), Pair.of(input.second(), (V) null));
+ }
+ }, ptt);
+ PTable<Pair<K, Integer>, Pair<U, V>> tag2 = right.parallelDo("joinTagRight",
+ new MapFn<Pair<K, V>, Pair<Pair<K, Integer>, Pair<U, V>>>() {
+ @Override
+ public Pair<Pair<K, Integer>, Pair<U, V>> map(Pair<K, V> input) {
+ return Pair.of(Pair.of(input.first(), 1), Pair.of((U) null, input.second()));
+ }
+ }, ptt);
+
+ GroupingOptions.Builder optionsBuilder = GroupingOptions.builder();
+ optionsBuilder.partitionerClass(JoinUtils.getPartitionerClass(ptf));
+
+ return (tag1.union(tag2)).groupByKey(optionsBuilder.build());
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/lib/PTables.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/lib/PTables.java b/crunch-core/src/main/java/org/apache/crunch/lib/PTables.java
new file mode 100644
index 0000000..e907680
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/lib/PTables.java
@@ -0,0 +1,117 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib;
+
+import java.util.List;
+
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PGroupedTable;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.fn.IdentityFn;
+import org.apache.crunch.types.PGroupedTableType;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.PTypeFamily;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Methods for performing common operations on PTables.
+ *
+ */
+public class PTables {
+
+ /**
+ * Convert the given {@code PCollection<Pair<K, V>>} to a {@code PTable<K, V>}.
+ * @param pcollect The {@code PCollection} to convert
+ * @return A {@code PTable} that contains the same data as the input {@code PCollection}
+ */
+ public static <K, V> PTable<K, V> asPTable(PCollection<Pair<K, V>> pcollect) {
+ PType<Pair<K, V>> pt = pcollect.getPType();
+ PTypeFamily ptf = pt.getFamily();
+ PTableType<K, V> ptt = ptf.tableOf(pt.getSubTypes().get(0), pt.getSubTypes().get(1));
+ DoFn<Pair<K, V>, Pair<K, V>> id = IdentityFn.getInstance();
+ return pcollect.parallelDo("asPTable", id, ptt);
+ }
+
+ /**
+ * Extract the keys from the given {@code PTable<K, V>} as a {@code PCollection<K>}.
+ * @param ptable The {@code PTable}
+ * @return A {@code PCollection<K>}
+ */
+ public static <K, V> PCollection<K> keys(PTable<K, V> ptable) {
+ return ptable.parallelDo("PTables.keys", new DoFn<Pair<K, V>, K>() {
+ @Override
+ public void process(Pair<K, V> input, Emitter<K> emitter) {
+ emitter.emit(input.first());
+ }
+ }, ptable.getKeyType());
+ }
+
+ /**
+ * Extract the values from the given {@code PTable<K, V>} as a {@code PCollection<V>}.
+ * @param ptable The {@code PTable}
+ * @return A {@code PCollection<V>}
+ */
+ public static <K, V> PCollection<V> values(PTable<K, V> ptable) {
+ return ptable.parallelDo("PTables.values", new DoFn<Pair<K, V>, V>() {
+ @Override
+ public void process(Pair<K, V> input, Emitter<V> emitter) {
+ emitter.emit(input.second());
+ }
+ }, ptable.getValueType());
+ }
+
+ /**
+ * Create a detached value for a table {@link Pair}.
+ *
+ * @param tableType The table type
+ * @param value The value from which a detached value is to be created
+ * @return The detached value
+ * @see PType#getDetachedValue(Object)
+ */
+ public static <K, V> Pair<K, V> getDetachedValue(PTableType<K, V> tableType, Pair<K, V> value) {
+ return Pair.of(tableType.getKeyType().getDetachedValue(value.first()), tableType.getValueType()
+ .getDetachedValue(value.second()));
+ }
+
+ /**
+ * Created a detached value for a {@link PGroupedTable} value.
+ *
+ *
+ * @param groupedTableType The grouped table type
+ * @param value The value from which a detached value is to be created
+ * @return The detached value
+ * @see PType#getDetachedValue(Object)
+ */
+ public static <K, V> Pair<K, Iterable<V>> getGroupedDetachedValue(
+ PGroupedTableType<K, V> groupedTableType, Pair<K, Iterable<V>> value) {
+
+ PTableType<K, V> tableType = groupedTableType.getTableType();
+ List<V> detachedIterable = Lists.newArrayList();
+ PType<V> valueType = tableType.getValueType();
+ for (V v : value.second()) {
+ detachedIterable.add(valueType.getDetachedValue(v));
+ }
+ return Pair.of(tableType.getKeyType().getDetachedValue(value.first()),
+ (Iterable<V>) detachedIterable);
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/lib/Sample.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/lib/Sample.java b/crunch-core/src/main/java/org/apache/crunch/lib/Sample.java
new file mode 100644
index 0000000..5a66101
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/lib/Sample.java
@@ -0,0 +1,217 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib;
+
+
+import org.apache.crunch.MapFn;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.lib.SampleUtils.ReservoirSampleFn;
+import org.apache.crunch.lib.SampleUtils.SampleFn;
+import org.apache.crunch.lib.SampleUtils.WRSCombineFn;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.PTypeFamily;
+
+/**
+ * Methods for performing random sampling in a distributed fashion, either by accepting each
+ * record in a {@code PCollection} with an independent probability in order to sample some
+ * fraction of the overall data set, or by using reservoir sampling in order to pull a uniform
+ * or weighted sample of fixed size from a {@code PCollection} of an unknown size. For more details
+ * on the reservoir sampling algorithms used by this library, see the A-ES algorithm described in
+ * <a href="http://arxiv.org/pdf/1012.0256.pdf">Efraimidis (2012)</a>.
+ */
+public class Sample {
+
+  /**
+   * Output records from the given {@code PCollection} with the given probability.
+   *
+   * @param input The {@code PCollection} to sample from
+   * @param probability The probability (0.0 &lt; p &lt; 1.0)
+   * @return The output {@code PCollection} created from sampling
+   */
+  public static <S> PCollection<S> sample(PCollection<S> input, double probability) {
+    return sample(input, null, probability);
+  }
+
+  /**
+   * Output records from the given {@code PCollection} using a given seed. Useful for unit
+   * testing.
+   *
+   * @param input The {@code PCollection} to sample from
+   * @param seed The seed for the random number generator; may be {@code null}
+   * @param probability The probability (0.0 &lt; p &lt; 1.0)
+   * @return The output {@code PCollection} created from sampling
+   */
+  public static <S> PCollection<S> sample(PCollection<S> input, Long seed, double probability) {
+    String stageName = String.format("sample(%.2f)", probability);
+    return input.parallelDo(stageName, new SampleFn<S>(probability, seed), input.getPType());
+  }
+
+  /**
+   * A {@code PTable<K, V>} analogue of the {@code sample} function.
+   *
+   * @param input The {@code PTable} to sample from
+   * @param probability The probability (0.0 &lt; p &lt; 1.0)
+   * @return The output {@code PTable} created from sampling
+   */
+  public static <K, V> PTable<K, V> sample(PTable<K, V> input, double probability) {
+    // The cast selects the PCollection overload; without it this call would recurse.
+    return PTables.asPTable(sample((PCollection<Pair<K, V>>) input, probability));
+  }
+
+  /**
+   * A {@code PTable<K, V>} analogue of the {@code sample} function, with the seed argument
+   * exposed for testing purposes.
+   *
+   * @param input The {@code PTable} to sample from
+   * @param seed The seed for the random number generator; may be {@code null}
+   * @param probability The probability (0.0 &lt; p &lt; 1.0)
+   * @return The output {@code PTable} created from sampling
+   */
+  public static <K, V> PTable<K, V> sample(PTable<K, V> input, Long seed, double probability) {
+    // The cast selects the PCollection overload; without it this call would recurse.
+    return PTables.asPTable(sample((PCollection<Pair<K, V>>) input, seed, probability));
+  }
+
+  /**
+   * Select a fixed number of elements from the given {@code PCollection} with each element
+   * equally likely to be included in the sample.
+   *
+   * @param input The input data
+   * @param sampleSize The number of elements to select
+   * @return A {@code PCollection} made up of the sampled elements
+   */
+  public static <T> PCollection<T> reservoirSample(
+      PCollection<T> input,
+      int sampleSize) {
+    return reservoirSample(input, sampleSize, null);
+  }
+
+  /**
+   * A version of the reservoir sampling algorithm that uses a given seed, primarily for
+   * testing purposes. Every element is given the same weight of 1 and the work is delegated
+   * to {@link #weightedReservoirSample(PCollection, int, Long)}.
+   *
+   * @param input The input data
+   * @param sampleSize The number of elements to select
+   * @param seed The test seed; may be {@code null}
+   * @return A {@code PCollection} made up of the sampled elements
+   */
+  public static <T> PCollection<T> reservoirSample(
+      PCollection<T> input,
+      int sampleSize,
+      Long seed) {
+    PTypeFamily ptf = input.getTypeFamily();
+    PType<Pair<T, Integer>> ptype = ptf.pairs(input.getPType(), ptf.ints());
+    return weightedReservoirSample(
+        input.parallelDo(new MapFn<T, Pair<T, Integer>>() {
+          @Override
+          public Pair<T, Integer> map(T t) {
+            return Pair.of(t, 1);
+          }
+        }, ptype),
+        sampleSize,
+        seed);
+  }
+
+  /**
+   * Misspelled alias of {@link #reservoirSample(PCollection, int, Long)}, kept so that
+   * existing callers continue to compile.
+   *
+   * @param input The input data
+   * @param sampleSize The number of elements to select
+   * @param seed The test seed; may be {@code null}
+   * @return A {@code PCollection} made up of the sampled elements
+   * @deprecated Use {@link #reservoirSample(PCollection, int, Long)} instead.
+   */
+  @Deprecated
+  public static <T> PCollection<T> reservorSample(
+      PCollection<T> input,
+      int sampleSize,
+      Long seed) {
+    return reservoirSample(input, sampleSize, seed);
+  }
+
+  /**
+   * Selects a weighted sample of the elements of the given {@code PCollection}, where the second term in
+   * the input {@code Pair} is a numerical weight.
+   *
+   * @param input the weighted observations
+   * @param sampleSize The number of elements to select
+   * @return A random sample of the given size that respects the weighting values
+   */
+  public static <T, N extends Number> PCollection<T> weightedReservoirSample(
+      PCollection<Pair<T, N>> input,
+      int sampleSize) {
+    return weightedReservoirSample(input, sampleSize, null);
+  }
+
+  /**
+   * The weighted reservoir sampling function with the seed term exposed for testing purposes.
+   * All observations are placed in a single group (ID 0) and sampled with
+   * {@link #groupedWeightedReservoirSample(PTable, int[], Long)}.
+   *
+   * @param input the weighted observations
+   * @param sampleSize The number of elements to select
+   * @param seed The test seed; may be {@code null}
+   * @return A random sample of the given size that respects the weighting values
+   */
+  public static <T, N extends Number> PCollection<T> weightedReservoirSample(
+      PCollection<Pair<T, N>> input,
+      int sampleSize,
+      Long seed) {
+    PTypeFamily ptf = input.getTypeFamily();
+    PTable<Integer, Pair<T, N>> groupedIn = input.parallelDo(
+        new MapFn<Pair<T, N>, Pair<Integer, Pair<T, N>>>() {
+          @Override
+          public Pair<Integer, Pair<T, N>> map(Pair<T, N> p) {
+            return Pair.of(0, p);
+          }
+        }, ptf.tableOf(ptf.ints(), input.getPType()));
+    int[] ss = new int[] { sampleSize };
+    return groupedWeightedReservoirSample(groupedIn, ss, seed)
+        .parallelDo(new MapFn<Pair<Integer, T>, T>() {
+          @Override
+          public T map(Pair<Integer, T> p) {
+            return p.second();
+          }
+        }, (PType<T>) input.getPType().getSubTypes().get(0));
+  }
+
+  /**
+   * The most general purpose of the weighted reservoir sampling patterns that allows us to choose
+   * a random sample of elements for each of N input groups.
+   *
+   * @param input A {@code PTable} with the key a group ID and the value a weighted observation in that group
+   * @param sampleSizes An array of length N, where each entry is the number of elements to include in that group
+   * @return A {@code PCollection} of the sampled elements for each of the groups
+   */
+  public static <T, N extends Number> PCollection<Pair<Integer, T>> groupedWeightedReservoirSample(
+      PTable<Integer, Pair<T, N>> input,
+      int[] sampleSizes) {
+    return groupedWeightedReservoirSample(input, sampleSizes, null);
+  }
+
+  /**
+   * Same as the other groupedWeightedReservoirSample method, but include a seed for testing
+   * purposes.
+   *
+   * @param input A {@code PTable} with the key a group ID and the value a weighted observation in that group
+   * @param sampleSizes An array of length N, where each entry is the number of elements to include in that group
+   * @param seed The test seed; may be {@code null}
+   * @return A {@code PCollection} of the sampled elements for each of the groups
+   */
+  public static <T, N extends Number> PCollection<Pair<Integer, T>> groupedWeightedReservoirSample(
+      PTable<Integer, Pair<T, N>> input,
+      int[] sampleSizes,
+      Long seed) {
+    PTypeFamily ptf = input.getTypeFamily();
+    // The first sub-type of the (observation, weight) pair type is the observation's type.
+    PType<T> ttype = (PType<T>) input.getPTableType().getValueType().getSubTypes().get(0);
+    PTableType<Integer, Pair<Double, T>> ptt = ptf.tableOf(ptf.ints(),
+        ptf.pairs(ptf.doubles(), ttype));
+
+    // Score and pre-sample each observation, merge the per-group candidates through a
+    // single-partition groupByKey, then strip the scores from the surviving elements.
+    return input.parallelDo(new ReservoirSampleFn<T, N>(sampleSizes, seed, ttype), ptt)
+        .groupByKey(1)
+        .combineValues(new WRSCombineFn<T>(sampleSizes, ttype))
+        .parallelDo(new MapFn<Pair<Integer, Pair<Double, T>>, Pair<Integer, T>>() {
+          @Override
+          public Pair<Integer, T> map(Pair<Integer, Pair<Double, T>> p) {
+            return Pair.of(p.first(), p.second().second());
+          }
+        }, ptf.pairs(ptf.ints(), ttype));
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/lib/SampleUtils.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/lib/SampleUtils.java b/crunch-core/src/main/java/org/apache/crunch/lib/SampleUtils.java
new file mode 100644
index 0000000..8769eed
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/lib/SampleUtils.java
@@ -0,0 +1,168 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib;
+
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.SortedMap;
+
+import org.apache.crunch.CombineFn;
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.FilterFn;
+import org.apache.crunch.Pair;
+import org.apache.crunch.types.PType;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
+/**
+ * Package-private function implementations backing the sampling methods in {@code Sample}.
+ */
+class SampleUtils {
+
+  /**
+   * Offers a scored candidate to a bounded reservoir that keeps the highest-scoring entries.
+   * While the reservoir holds fewer than {@code capacity} entries the candidate is always added;
+   * once full, the candidate replaces the lowest-scoring entry only if its score is strictly
+   * greater. The candidate is detached via {@code valueType} only when it is actually stored.
+   *
+   * @param reservoir The score-ordered reservoir to update
+   * @param capacity The maximum number of entries to retain; values &lt;= 0 retain nothing
+   * @param score The candidate's score
+   * @param candidate The candidate value
+   * @param valueType The type used to create a detached copy of a stored candidate
+   */
+  private static <T> void offer(SortedMap<Double, T> reservoir, int capacity, double score,
+      T candidate, PType<T> valueType) {
+    if (capacity <= 0) {
+      // Nothing can be retained. Without this guard the empty reservoir would reach
+      // firstKey() below, which throws NoSuchElementException on an empty map.
+      return;
+    }
+    if (reservoir.size() >= capacity) {
+      // Written as a negated '>' so that a NaN score is rejected, as a plain '>' test would.
+      if (!(score > reservoir.firstKey())) {
+        return;
+      }
+      reservoir.remove(reservoir.firstKey());
+    }
+    reservoir.put(score, valueType.getDetachedValue(candidate));
+  }
+
+  /**
+   * A filter that accepts each input independently with a fixed probability.
+   */
+  static class SampleFn<S> extends FilterFn<S> {
+
+    private final Long seed;
+    private final double acceptanceProbability;
+    private transient Random r;
+
+    /**
+     * @param acceptanceProbability The acceptance probability, strictly between 0.0 and 1.0
+     * @param seed The random seed; if {@code null}, the current time in milliseconds at
+     *        construction is used
+     * @throws IllegalArgumentException if the probability is outside the open interval (0.0, 1.0)
+     */
+    public SampleFn(double acceptanceProbability, Long seed) {
+      Preconditions.checkArgument(0.0 < acceptanceProbability && acceptanceProbability < 1.0,
+          "acceptanceProbability must be in (0.0, 1.0), got %s", acceptanceProbability);
+      this.seed = seed == null ? System.currentTimeMillis() : seed;
+      this.acceptanceProbability = acceptanceProbability;
+    }
+
+    @Override
+    public void initialize() {
+      // The generator is transient, so it is created here rather than in the constructor.
+      if (r == null) {
+        r = new Random(seed);
+      }
+    }
+
+    @Override
+    public boolean accept(S input) {
+      return r.nextDouble() < acceptanceProbability;
+    }
+  }
+
+
+  /**
+   * Scores each weighted observation as {@code log(u) / weight} for a uniform random {@code u}
+   * and keeps, per group ID, the {@code sampleSizes[id]} highest-scoring observations. The
+   * retained (score, observation) pairs are emitted from {@link #cleanup(Emitter)}.
+   * Observations whose weight is not greater than zero are ignored.
+   */
+  static class ReservoirSampleFn<T, N extends Number>
+      extends DoFn<Pair<Integer, Pair<T, N>>, Pair<Integer, Pair<Double, T>>> {
+
+    private int[] sampleSizes;
+    private Long seed;
+    private PType<T> valueType;
+    private transient List<SortedMap<Double, T>> reservoirs;
+    private transient Random random;
+
+    public ReservoirSampleFn(int[] sampleSizes, Long seed, PType<T> valueType) {
+      this.sampleSizes = sampleSizes;
+      this.seed = seed;
+      this.valueType = valueType;
+    }
+
+    @Override
+    public void initialize() {
+      this.reservoirs = Lists.newArrayList();
+      this.valueType.initialize(getConfiguration());
+      // One reservoir per group; the group ID is the index into this list.
+      for (int i = 0; i < sampleSizes.length; i++) {
+        reservoirs.add(Maps.<Double, T>newTreeMap());
+      }
+      if (random == null) {
+        if (seed == null) {
+          this.random = new Random();
+        } else {
+          this.random = new Random(seed);
+        }
+      }
+    }
+
+    @Override
+    public void process(Pair<Integer, Pair<T, N>> input,
+        Emitter<Pair<Integer, Pair<Double, T>>> emitter) {
+      int id = input.first();
+      Pair<T, N> p = input.second();
+      double weight = p.second().doubleValue();
+      if (weight > 0.0) {
+        double score = Math.log(random.nextDouble()) / weight;
+        SampleUtils.offer(reservoirs.get(id), sampleSizes[id], score, p.first(), valueType);
+      }
+    }
+
+    @Override
+    public void cleanup(Emitter<Pair<Integer, Pair<Double, T>>> emitter) {
+      for (int id = 0; id < reservoirs.size(); id++) {
+        SortedMap<Double, T> reservoir = reservoirs.get(id);
+        for (Map.Entry<Double, T> e : reservoir.entrySet()) {
+          emitter.emit(Pair.of(id, Pair.of(e.getKey(), e.getValue())));
+        }
+      }
+    }
+  }
+
+  /**
+   * Merges scored candidates per group ID, again keeping only the {@code sampleSizes[id]}
+   * highest-scoring entries for each group. Results are emitted from {@link #cleanup(Emitter)}
+   * rather than from {@code process}.
+   */
+  static class WRSCombineFn<T> extends CombineFn<Integer, Pair<Double, T>> {
+
+    private int[] sampleSizes;
+    private PType<T> valueType;
+    private List<SortedMap<Double, T>> reservoirs;
+
+    public WRSCombineFn(int[] sampleSizes, PType<T> valueType) {
+      this.sampleSizes = sampleSizes;
+      this.valueType = valueType;
+    }
+
+    @Override
+    public void initialize() {
+      this.reservoirs = Lists.newArrayList();
+      for (int i = 0; i < sampleSizes.length; i++) {
+        reservoirs.add(Maps.<Double, T>newTreeMap());
+      }
+      this.valueType.initialize(getConfiguration());
+    }
+
+    @Override
+    public void process(Pair<Integer, Iterable<Pair<Double, T>>> input,
+        Emitter<Pair<Integer, Pair<Double, T>>> emitter) {
+      int id = input.first();
+      SortedMap<Double, T> reservoir = reservoirs.get(id);
+      for (Pair<Double, T> p : input.second()) {
+        SampleUtils.offer(reservoir, sampleSizes[id], p.first(), p.second(), valueType);
+      }
+    }
+
+    @Override
+    public void cleanup(Emitter<Pair<Integer, Pair<Double, T>>> emitter) {
+      for (int i = 0; i < reservoirs.size(); i++) {
+        SortedMap<Double, T> reservoir = reservoirs.get(i);
+        for (Map.Entry<Double, T> e : reservoir.entrySet()) {
+          emitter.emit(Pair.of(i, Pair.of(e.getKey(), e.getValue())));
+        }
+      }
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/lib/SecondarySort.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/lib/SecondarySort.java b/crunch-core/src/main/java/org/apache/crunch/lib/SecondarySort.java
new file mode 100644
index 0000000..54b4396
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/lib/SecondarySort.java
@@ -0,0 +1,118 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib;
+
+import java.util.Collection;
+
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.GroupingOptions;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PGroupedTable;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.lib.join.JoinUtils;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Utilities for performing a secondary sort on a {@code PTable<K, Pair<V1, V2>>} collection.
+ * <p>
+ * Secondary sorts are usually performed during sessionization: given a collection
+ * of events, we want to group them by a key (such as a user ID), then sort the grouped
+ * records by an auxiliary key (such as a timestamp), and then perform some additional
+ * processing on the sorted records.
+ */
+public class SecondarySort {
+
+  /**
+   * Perform a secondary sort on the given {@code PTable} instance and then apply a
+   * {@code DoFn} to the resulting sorted data to yield an output {@code PCollection<T>}.
+   *
+   * @param input The table to sort; the first element of each value pair is the secondary key
+   * @param doFn The function applied to each key together with its grouped values
+   * @param ptype The type of the output collection
+   * @return The collection produced by {@code doFn}
+   */
+  public static <K, V1, V2, T> PCollection<T> sortAndApply(PTable<K, Pair<V1, V2>> input,
+      DoFn<Pair<K, Iterable<Pair<V1, V2>>>, T> doFn, PType<T> ptype) {
+    return prepare(input)
+        .parallelDo("SecondarySort.apply", new SSWrapFn<K, V1, V2, T>(doFn), ptype);
+  }
+
+  /**
+   * Perform a secondary sort on the given {@code PTable} instance and then apply a
+   * {@code DoFn} to the resulting sorted data to yield an output {@code PTable<U, V>}.
+   *
+   * @param input The table to sort; the first element of each value pair is the secondary key
+   * @param doFn The function applied to each key together with its grouped values
+   * @param ptype The type of the output table
+   * @return The table produced by {@code doFn}
+   */
+  public static <K, V1, V2, U, V> PTable<U, V> sortAndApply(PTable<K, Pair<V1, V2>> input,
+      DoFn<Pair<K, Iterable<Pair<V1, V2>>>, Pair<U, V>> doFn, PTableType<U, V> ptype) {
+    return prepare(input)
+        .parallelDo("SecondarySort.apply", new SSWrapFn<K, V1, V2, Pair<U, V>>(doFn), ptype);
+  }
+
+  /**
+   * Re-keys the input by the composite {@code (K, V1)} and groups it using the grouping
+   * comparator and partitioner that {@link JoinUtils} supplies for the input's type family.
+   */
+  private static <K, V1, V2> PGroupedTable<Pair<K, V1>, Pair<V1, V2>> prepare(
+      PTable<K, Pair<V1, V2>> input) {
+    PTypeFamily ptf = input.getTypeFamily();
+    PType<Pair<V1, V2>> valueType = input.getValueType();
+    // Intermediate table type: key is (original key, first value element), value is unchanged.
+    PTableType<Pair<K, V1>, Pair<V1, V2>> inter = ptf.tableOf(
+        ptf.pairs(input.getKeyType(), valueType.getSubTypes().get(0)),
+        valueType);
+    return input.parallelDo("SecondarySort.format", new SSFormatFn<K, V1, V2>(), inter)
+        .groupByKey(
+            GroupingOptions.builder()
+                .groupingComparatorClass(JoinUtils.getGroupingComparator(ptf))
+                .partitionerClass(JoinUtils.getPartitionerClass(ptf))
+                .build());
+  }
+
+  /** Maps {@code (k, (v1, v2))} to {@code ((k, v1), (v1, v2))}. */
+  private static class SSFormatFn<K, V1, V2> extends MapFn<Pair<K, Pair<V1, V2>>, Pair<Pair<K, V1>, Pair<V1, V2>>> {
+    @Override
+    public Pair<Pair<K, V1>, Pair<V1, V2>> map(Pair<K, Pair<V1, V2>> input) {
+      return Pair.of(Pair.of(input.first(), input.second().first()), input.second());
+    }
+  }
+
+  /**
+   * Adapts a caller-supplied {@code DoFn} keyed by {@code K} to the composite-keyed grouped
+   * table by dropping the secondary component of the key, and forwards the lifecycle calls
+   * (configure, initialize, cleanup) to the wrapped function.
+   */
+  private static class SSWrapFn<K, V1, V2, T> extends DoFn<Pair<Pair<K, V1>, Iterable<Pair<V1, V2>>>, T> {
+    private final DoFn<Pair<K, Iterable<Pair<V1, V2>>>, T> intern;
+
+    public SSWrapFn(DoFn<Pair<K, Iterable<Pair<V1, V2>>>, T> intern) {
+      this.intern = intern;
+    }
+
+    @Override
+    public void configure(Configuration conf) {
+      intern.configure(conf);
+    }
+
+    @Override
+    public void initialize() {
+      // Hand this function's context to the wrapped function before initializing it.
+      intern.setContext(getContext());
+      intern.initialize();
+    }
+
+    @Override
+    public void process(Pair<Pair<K, V1>, Iterable<Pair<V1, V2>>> input, Emitter<T> emitter) {
+      intern.process(Pair.of(input.first().first(), input.second()), emitter);
+    }
+
+    @Override
+    public void cleanup(Emitter<T> emitter) {
+      intern.cleanup(emitter);
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/lib/Set.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/lib/Set.java b/crunch-core/src/main/java/org/apache/crunch/lib/Set.java
new file mode 100644
index 0000000..0ba879c
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/lib/Set.java
@@ -0,0 +1,118 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib;
+
+import java.util.Collection;
+
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Tuple3;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.PTypeFamily;
+
+/**
+ * Set-style operations (difference, intersection, and a {@code comm}-like three-way split)
+ * on {@code PCollection} instances.
+ */
+public class Set {
+
+  /**
+   * Compute the set difference between two sets of elements.
+   *
+   * @return a collection containing the elements of <code>coll1</code> that do
+   *         not occur in <code>coll2</code>
+   */
+  public static <T> PCollection<T> difference(PCollection<T> coll1, PCollection<T> coll2) {
+    PTable<T, Pair<Collection<Boolean>, Collection<Boolean>>> cogrouped =
+        Cogroup.cogroup(toTable(coll1), toTable(coll2));
+    return cogrouped.parallelDo(
+        new DoFn<Pair<T, Pair<Collection<Boolean>, Collection<Boolean>>>, T>() {
+          @Override
+          public void process(Pair<T, Pair<Collection<Boolean>, Collection<Boolean>>> entry, Emitter<T> out) {
+            Pair<Collection<Boolean>, Collection<Boolean>> markers = entry.second();
+            if (markers.first().isEmpty()) {
+              return;
+            }
+            // Present on the first side; keep it only when the second side has no marker.
+            if (markers.second().isEmpty()) {
+              out.emit(entry.first());
+            }
+          }
+        }, coll1.getPType());
+  }
+
+  /**
+   * Compute the intersection of two sets of elements.
+   *
+   * @return a collection containing the elements that are common to both
+   *         <code>coll1</code> and <code>coll2</code>
+   */
+  public static <T> PCollection<T> intersection(PCollection<T> coll1, PCollection<T> coll2) {
+    PTable<T, Pair<Collection<Boolean>, Collection<Boolean>>> cogrouped =
+        Cogroup.cogroup(toTable(coll1), toTable(coll2));
+    return cogrouped.parallelDo(
+        new DoFn<Pair<T, Pair<Collection<Boolean>, Collection<Boolean>>>, T>() {
+          @Override
+          public void process(Pair<T, Pair<Collection<Boolean>, Collection<Boolean>>> entry, Emitter<T> out) {
+            Pair<Collection<Boolean>, Collection<Boolean>> markers = entry.second();
+            if (markers.first().isEmpty()) {
+              return;
+            }
+            // Present on the first side; keep it only when the second side has a marker too.
+            if (!markers.second().isEmpty()) {
+              out.emit(entry.first());
+            }
+          }
+        }, coll1.getPType());
+  }
+
+  /**
+   * Find the elements that are common to two sets, like the Unix
+   * <code>comm</code> utility. The result is a {@link PCollection} of
+   * {@link Tuple3} objects in which the slot an element occupies tells which
+   * collections contain it:
+   * <ol>
+   * <li>elements only in <code>coll1</code>,</li>
+   * <li>elements only in <code>coll2</code>, or</li>
+   * <li>elements in both collections</li>
+   * </ol>
+   * The remaining slots of each tuple are <code>null</code>.
+   *
+   * @return a collection of {@link Tuple3} objects
+   */
+  public static <T> PCollection<Tuple3<T, T, T>> comm(PCollection<T> coll1, PCollection<T> coll2) {
+    PTypeFamily family = coll1.getTypeFamily();
+    PType<T> elementType = coll1.getPType();
+    PTable<T, Pair<Collection<Boolean>, Collection<Boolean>>> cogrouped =
+        Cogroup.cogroup(toTable(coll1), toTable(coll2));
+    return cogrouped.parallelDo(
+        new DoFn<Pair<T, Pair<Collection<Boolean>, Collection<Boolean>>>, Tuple3<T, T, T>>() {
+          @Override
+          public void process(Pair<T, Pair<Collection<Boolean>, Collection<Boolean>>> entry,
+              Emitter<Tuple3<T, T, T>> out) {
+            Pair<Collection<Boolean>, Collection<Boolean>> markers = entry.second();
+            boolean inFirst = !markers.first().isEmpty();
+            boolean inSecond = !markers.second().isEmpty();
+            T element = entry.first();
+
+            // At most one slot receives the element; the others stay null.
+            T onlyFirst = null;
+            T onlySecond = null;
+            T both = null;
+            if (inFirst && inSecond) {
+              both = element;
+            } else if (inFirst) {
+              onlyFirst = element;
+            } else if (inSecond) {
+              onlySecond = element;
+            }
+            out.emit(Tuple3.of(onlyFirst, onlySecond, both));
+          }
+        }, family.triples(elementType, elementType, elementType));
+  }
+
+  /**
+   * Keys every element of the collection by itself with a constant {@code TRUE} value,
+   * producing the table form used as cogroup input by the operations above.
+   */
+  private static <T> PTable<T, Boolean> toTable(PCollection<T> coll) {
+    PTypeFamily family = coll.getTypeFamily();
+    DoFn<T, Pair<T, Boolean>> markPresent = new DoFn<T, Pair<T, Boolean>>() {
+      @Override
+      public void process(T element, Emitter<Pair<T, Boolean>> out) {
+        out.emit(Pair.of(element, Boolean.TRUE));
+      }
+    };
+    return coll.parallelDo(markPresent, family.tableOf(coll.getPType(), family.booleans()));
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/lib/Sort.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/lib/Sort.java b/crunch-core/src/main/java/org/apache/crunch/lib/Sort.java
new file mode 100644
index 0000000..23bcaee
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/lib/Sort.java
@@ -0,0 +1,294 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib;
+
+import static org.apache.crunch.lib.sort.Comparators.*;
+import static org.apache.crunch.lib.sort.SortFns.*;
+
+import org.apache.avro.Schema;
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.GroupingOptions;
+import org.apache.crunch.GroupingOptions.Builder;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.SourceTarget;
+import org.apache.crunch.Tuple;
+import org.apache.crunch.Tuple3;
+import org.apache.crunch.Tuple4;
+import org.apache.crunch.TupleN;
+import org.apache.crunch.lib.sort.TotalOrderPartitioner;
+import org.apache.crunch.materialize.MaterializableIterable;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.crunch.types.avro.AvroType;
+import org.apache.crunch.types.avro.AvroTypeFamily;
+import org.apache.crunch.types.writable.WritableTypeFamily;
+import org.apache.crunch.util.PartitionUtils;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Utilities for sorting {@code PCollection} instances.
+ */
+public class Sort {
+
+ /**
+ * For signaling the order in which a sort should be done.
+ * <p>
+ * In the single-order {@code buildGroupingOptions} of this class, {@link #DESCENDING} is the
+ * only value for which a reversed sort comparator is installed; for the other values no sort
+ * comparator is set there.
+ */
+ public enum Order {
+ /** The order used by the {@code sort} overloads that do not take an {@code Order}. */
+ ASCENDING,
+ /** Requests the reverse comparator for the key's type family. */
+ DESCENDING,
+ /**
+ * NOTE(review): presumably marks a column that should not take part in the ordering;
+ * confirm against the column-based sorting code.
+ */
+ IGNORE
+ }
+
+ /**
+ * To sort by column 2 ascending then column 1 descending, you would use:
+ * <code>
+ * sortPairs(coll, by(2, ASCENDING), by(1, DESCENDING))
+ * </code> Column numbering is 1-based.
+ */
+ public static class ColumnOrder {
+ private int column;
+ private Order order;
+
+ public ColumnOrder(int column, Order order) {
+ this.column = column;
+ this.order = order;
+ }
+
+ public static ColumnOrder by(int column, Order order) {
+ return new ColumnOrder(column, order);
+ }
+
+ public int column() {
+ return column;
+ }
+
+ public Order order() {
+ return order;
+ }
+
+ @Override
+ public String toString() {
+ return "ColumnOrder: column:" + column + ", Order: " + order;
+ }
+ }
+
+  /**
+   * Sorts the elements of the {@code PCollection} by their natural ordering, ascending.
+   * Shorthand for {@code sort(collection, Order.ASCENDING)}.
+   *
+   * @return a {@code PCollection} representing the sorted collection.
+   */
+  public static <T> PCollection<T> sort(PCollection<T> collection) {
+    final Order defaultOrder = Order.ASCENDING;
+    return sort(collection, defaultOrder);
+  }
+
+  /**
+   * Sorts the elements of the {@code PCollection} by their natural ordering in the requested
+   * {@code Order}. Delegates to the three-argument overload with a reducer count of -1.
+   *
+   * @return a {@code PCollection} representing the sorted collection.
+   */
+  public static <T> PCollection<T> sort(PCollection<T> collection, Order order) {
+    // NOTE(review): -1 presumably means "no explicit reducer count"; confirm in configureReducers.
+    final int numReducers = -1;
+    return sort(collection, numReducers, order);
+  }
+
+  /**
+   * Sorts the elements of the {@code PCollection} by their natural ordering, in the
+   * requested order and with the given number of reducers.
+   * <p>
+   * Each element becomes the key of a table entry with a {@code null} value; the table is
+   * grouped with the options built for the requested order, ungrouped, and its keys returned.
+   *
+   * @return a {@code PCollection} representing the sorted collection.
+   */
+  public static <T> PCollection<T> sort(PCollection<T> collection, int numReducers, Order order) {
+    PTypeFamily family = collection.getTypeFamily();
+    PTableType<T, Void> keyedType = family.tableOf(collection.getPType(), family.nulls());
+    Configuration conf = collection.getPipeline().getConfiguration();
+    DoFn<T, Pair<T, Void>> keyByElement = new DoFn<T, Pair<T, Void>>() {
+      @Override
+      public void process(T element, Emitter<Pair<T, Void>> out) {
+        out.emit(Pair.of(element, (Void) null));
+      }
+    };
+    PTable<T, Void> keyed = collection.parallelDo("sort-pre", keyByElement, keyedType);
+    return keyed.groupByKey(buildGroupingOptions(keyed, conf, numReducers, order))
+        .ungroup()
+        .keys();
+  }
+
+  /**
+   * Sorts the {@code PTable} by the natural ordering of its keys, ascending.
+   * Shorthand for {@code sort(table, Order.ASCENDING)}.
+   *
+   * @return a {@code PTable} representing the sorted table.
+   */
+  public static <K, V> PTable<K, V> sort(PTable<K, V> table) {
+    final Order defaultOrder = Order.ASCENDING;
+    return sort(table, defaultOrder);
+  }
+
+  /**
+   * Sorts the {@code PTable} by the natural ordering of its keys in the requested
+   * {@code Order}. Delegates to the three-argument overload with a reducer count of -1.
+   *
+   * @return a {@code PTable} representing the sorted table.
+   */
+  public static <K, V> PTable<K, V> sort(PTable<K, V> table, Order key) {
+    // NOTE(review): -1 presumably means "no explicit reducer count"; confirm in configureReducers.
+    final int numReducers = -1;
+    return sort(table, numReducers, key);
+  }
+
+  /**
+   * Sorts the {@code PTable} by the natural ordering of its keys, in the requested order
+   * and with a client-specified number of reducers. The table is grouped by key with the
+   * options built for that order and then ungrouped again.
+   *
+   * @return a {@code PTable} representing the sorted collection.
+   */
+  public static <K, V> PTable<K, V> sort(PTable<K, V> table, int numReducers, Order key) {
+    Configuration pipelineConf = table.getPipeline().getConfiguration();
+    return table
+        .groupByKey(buildGroupingOptions(table, pipelineConf, numReducers, key))
+        .ungroup();
+  }
+
+
+ /**
+ * Sorts the {@code PCollection} of {@code Pair}s using the specified column
+ * ordering.
+ *
+ * @return a {@code PCollection} representing the sorted collection.
+ */
+ public static <U, V> PCollection<Pair<U, V>> sortPairs(PCollection<Pair<U, V>> collection,
+ ColumnOrder... columnOrders) {
+ return sortTuples(collection, columnOrders);
+ }
+
+ /**
+ * Sorts the {@code PCollection} of {@code Tuple3}s using the specified column
+ * ordering.
+ *
+ * @return a {@code PCollection} representing the sorted collection.
+ */
+ public static <V1, V2, V3> PCollection<Tuple3<V1, V2, V3>> sortTriples(PCollection<Tuple3<V1, V2, V3>> collection,
+ ColumnOrder... columnOrders) {
+ return sortTuples(collection, columnOrders);
+ }
+
+  /**
+   * Sorts a {@code PCollection} of {@code Tuple4}s according to the given column orderings.
+   * Equivalent to calling {@code sortTuples} with the same arguments.
+   *
+   * @return a {@code PCollection} representing the sorted collection.
+   */
+  public static <V1, V2, V3, V4> PCollection<Tuple4<V1, V2, V3, V4>> sortQuads(
+      PCollection<Tuple4<V1, V2, V3, V4>> collection, ColumnOrder... columnOrders) {
+    PCollection<Tuple4<V1, V2, V3, V4>> sorted = sortTuples(collection, columnOrders);
+    return sorted;
+  }
+
+  /**
+   * Sorts a {@code PCollection} of tuples according to the given column orderings.
+   * Delegates to the overload that takes a reducer count, passing -1.
+   *
+   * @return a {@code PCollection} representing the sorted collection.
+   */
+  public static <T extends Tuple> PCollection<T> sortTuples(PCollection<T> collection,
+      ColumnOrder... columnOrders) {
+    // NOTE(review): -1 presumably means "no explicit reducer count"; confirm in configureReducers.
+    final int numReducers = -1;
+    return sortTuples(collection, numReducers, columnOrders);
+  }
+
+  /**
+   * Sorts a {@code PCollection} of tuples (any {@link Tuple} subtype, including
+   * {@link TupleN}) according to the given column orderings, using a client-specified
+   * number of reducers.
+   * <p>
+   * A sort key is derived from each tuple via {@code KeyExtraction}; the keyed table is
+   * grouped with the options built for the column orderings, ungrouped, and its values
+   * (the original tuples) returned.
+   *
+   * @return a {@code PCollection} representing the sorted collection.
+   */
+  public static <T extends Tuple> PCollection<T> sortTuples(PCollection<T> collection, int numReducers,
+      ColumnOrder... columnOrders) {
+    KeyExtraction<T> extraction = new KeyExtraction<T>(collection.getPType(), columnOrders);
+    PTable<Object, T> keyed = collection.by(extraction.getByFn(), extraction.getKeyType());
+    Configuration pipelineConf = collection.getPipeline().getConfiguration();
+    return keyed
+        .groupByKey(buildGroupingOptions(keyed, pipelineConf, numReducers, columnOrders))
+        .ungroup()
+        .values();
+  }
+
+ // TODO: move to type family?
+ private static <K, V> GroupingOptions buildGroupingOptions(PTable<K, V> ptable, Configuration conf,
+ int numReducers, Order order) {
+ PType<K> ptype = ptable.getKeyType();
+ PTypeFamily tf = ptable.getTypeFamily();
+ Builder builder = GroupingOptions.builder();
+ if (order == Order.DESCENDING) {
+ if (tf == WritableTypeFamily.getInstance()) {
+ builder.sortComparatorClass(ReverseWritableComparator.class);
+ } else if (tf == AvroTypeFamily.getInstance()) {
+ AvroType<K> avroType = (AvroType<K>) ptype;
+ Schema schema = avroType.getSchema();
+ builder.conf("crunch.schema", schema.toString());
+ builder.sortComparatorClass(ReverseAvroComparator.class);
+ } else {
+ throw new RuntimeException("Unrecognized type family: " + tf);
+ }
+ } else if (tf == AvroTypeFamily.getInstance()) {
+ builder.conf("crunch.schema", ((AvroType<K>) ptype).getSchema().toString());
+ }
+ configureReducers(builder, ptable, conf, numReducers);
+ return builder.build();
+ }
+
+ private static <K, V> GroupingOptions buildGroupingOptions(PTable<K, V> ptable, Configuration conf,
+ int numReducers, ColumnOrder[] columnOrders) {
+ PTypeFamily tf = ptable.getTypeFamily();
+ PType<K> keyType = ptable.getKeyType();
+ Builder builder = GroupingOptions.builder();
+ if (tf == WritableTypeFamily.getInstance()) {
+ if (columnOrders.length == 1 && columnOrders[0].order == Order.DESCENDING) {
+ builder.sortComparatorClass(ReverseWritableComparator.class);
+ } else {
+ TupleWritableComparator.configureOrdering(conf, columnOrders);
+ builder.sortComparatorClass(TupleWritableComparator.class);
+ }
+ } else if (tf == AvroTypeFamily.getInstance()) {
+ AvroType<K> avroType = (AvroType<K>) keyType;
+ Schema schema = avroType.getSchema();
+ builder.conf("crunch.schema", schema.toString());
+ if (columnOrders.length == 1 && columnOrders[0].order == Order.DESCENDING) {
+ builder.sortComparatorClass(ReverseAvroComparator.class);
+ }
+ } else {
+ throw new RuntimeException("Unrecognized type family: " + tf);
+ }
+ configureReducers(builder, ptable, conf, numReducers);
+ return builder.build();
+ }
+
+ private static <K, V> void configureReducers(GroupingOptions.Builder builder,
+ PTable<K, V> ptable, Configuration conf, int numReducers) {
+ if (numReducers <= 0) {
+ numReducers = PartitionUtils.getRecommendedPartitions(ptable, conf);
+ if (numReducers < 5) {
+ // Not worth the overhead, force it to 1
+ numReducers = 1;
+ }
+ }
+ builder.numReducers(numReducers);
+ if (numReducers > 1) {
+ Iterable<K> iter = Sample.reservoirSample(ptable.keys(), numReducers - 1).materialize();
+ MaterializableIterable<K> mi = (MaterializableIterable<K>) iter;
+ if (mi.isSourceTarget()) {
+ builder.sourceTarget((SourceTarget) mi.getSource());
+ }
+ builder.partitionerClass(TotalOrderPartitioner.class);
+ builder.conf(TotalOrderPartitioner.PARTITIONER_PATH, mi.getPath().toString());
+ //TODO: distcache handling
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/lib/join/FullOuterJoinFn.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/lib/join/FullOuterJoinFn.java b/crunch-core/src/main/java/org/apache/crunch/lib/join/FullOuterJoinFn.java
new file mode 100644
index 0000000..c0ce727
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/lib/join/FullOuterJoinFn.java
@@ -0,0 +1,102 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib.join;
+
+import java.util.List;
+
+import org.apache.crunch.Emitter;
+import org.apache.crunch.Pair;
+import org.apache.crunch.types.PType;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Used to perform the last step of a full outer join.
+ *
+ * @param <K> Type of the keys.
+ * @param <U> Type of the first {@link org.apache.crunch.PTable}'s values
+ * @param <V> Type of the second {@link org.apache.crunch.PTable}'s values
+ */
+public class FullOuterJoinFn<K, U, V> extends JoinFn<K, U, V> {
+
+ private transient int lastId;
+ private transient K lastKey;
+ private transient List<U> leftValues;
+
+ public FullOuterJoinFn(PType<K> keyType, PType<U> leftValueType) {
+ super(keyType, leftValueType);
+ }
+
+ /** {@inheritDoc} */
+ @Override
+ public void initialize() {
+ super.initialize();
+ lastId = 1;
+ lastKey = null;
+ this.leftValues = Lists.newArrayList();
+ }
+
+ /** {@inheritDoc} */
+ @Override
+ public void join(K key, int id, Iterable<Pair<U, V>> pairs, Emitter<Pair<K, Pair<U, V>>> emitter) {
+ if (!key.equals(lastKey)) {
+ // Make sure that left side gets emitted.
+ if (0 == lastId) {
+ for (U u : leftValues) {
+ emitter.emit(Pair.of(lastKey, Pair.of(u, (V) null)));
+ }
+ }
+ lastKey = keyType.getDetachedValue(key);
+ leftValues.clear();
+ }
+ if (id == 0) {
+ for (Pair<U, V> pair : pairs) {
+ if (pair.first() != null)
+ leftValues.add(leftValueType.getDetachedValue(pair.first()));
+ }
+ } else {
+ for (Pair<U, V> pair : pairs) {
+ // Make sure that right side gets emitted.
+ if (leftValues.isEmpty()) {
+ leftValues.add(null);
+ }
+ for (U u : leftValues) {
+ emitter.emit(Pair.of(lastKey, Pair.of(u, pair.second())));
+ }
+ }
+ }
+
+ lastId = id;
+ }
+
+ /** {@inheritDoc} */
+ @Override
+ public void cleanup(Emitter<Pair<K, Pair<U, V>>> emitter) {
+ if (0 == lastId) {
+ for (U u : leftValues) {
+ emitter.emit(Pair.of(lastKey, Pair.of(u, (V) null)));
+ }
+ }
+ }
+
+ /** {@inheritDoc} */
+ @Override
+ public String getJoinType() {
+ return "fullOuterJoin";
+ }
+}
[40/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/lib/SpecificAvroGroupByIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/SpecificAvroGroupByIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/SpecificAvroGroupByIT.java
new file mode 100644
index 0000000..5292353
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/SpecificAvroGroupByIT.java
@@ -0,0 +1,119 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib;
+
+import static junit.framework.Assert.assertEquals;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.List;
+
+import org.apache.avro.file.DataFileWriter;
+import org.apache.avro.specific.SpecificDatumWriter;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.io.At;
+import org.apache.crunch.test.Person;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.avro.Avros;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Test {@link org.apache.crunch.types.avro.SafeAvroSerialization} with Specific Avro types
+ */
+public class SpecificAvroGroupByIT implements Serializable {
+
+ private static final long serialVersionUID = 1344118240353796561L;
+
+ private transient File avroFile;
+ @Rule
+ public transient TemporaryPath tmpDir = TemporaryPaths.create();
+
+
+ @Before
+ public void setUp() throws IOException {
+ avroFile = File.createTempFile("avrotest", ".avro");
+ }
+
+ @After
+ public void tearDown() {
+ avroFile.delete();
+ }
+
+ @Test
+ public void testGrouByWithSpecificAvroType() throws Exception {
+ MRPipeline pipeline = new MRPipeline(SpecificAvroGroupByIT.class, tmpDir.getDefaultConfiguration());
+ testSpecificAvro(pipeline);
+ }
+
+ public void testSpecificAvro(MRPipeline pipeline) throws Exception {
+
+ createPersonAvroFile(avroFile);
+
+ PCollection<Person> unsorted = pipeline.read(At.avroFile(avroFile.getAbsolutePath(), Avros.records(Person.class)));
+
+ PTable<String, Person> sorted = unsorted.parallelDo(new MapFn<Person, Pair<String, Person>>() {
+
+ @Override
+ public Pair<String, Person> map(Person input) {
+ String key = input.name.toString();
+ return Pair.of(key, input);
+
+ }
+ }, Avros.tableOf(Avros.strings(), Avros.records(Person.class))).groupByKey().ungroup();
+
+ List<Pair<String, Person>> outputPersonList = Lists.newArrayList(sorted.materialize());
+
+ assertEquals(1, outputPersonList.size());
+ assertEquals(String.class, outputPersonList.get(0).first().getClass());
+ assertEquals(Person.class, outputPersonList.get(0).second().getClass());
+
+ pipeline.done();
+ }
+
+ private void createPersonAvroFile(File avroFile) throws IOException {
+
+ Person person = new Person();
+ person.age = 40;
+ person.name = "Bob";
+ List<CharSequence> siblingNames = Lists.newArrayList();
+ siblingNames.add("Bob" + "1");
+ siblingNames.add("Bob" + "2");
+ person.siblingnames = siblingNames;
+
+ FileOutputStream outputStream = new FileOutputStream(avroFile);
+ SpecificDatumWriter<Person> writer = new SpecificDatumWriter<Person>(Person.class);
+
+ DataFileWriter<Person> dataFileWriter = new DataFileWriter<Person>(writer);
+ dataFileWriter.create(Person.SCHEMA$, outputStream);
+ dataFileWriter.append(person);
+ dataFileWriter.close();
+ outputStream.close();
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/lib/join/FullOuterJoinIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/join/FullOuterJoinIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/join/FullOuterJoinIT.java
new file mode 100644
index 0000000..63d594d
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/join/FullOuterJoinIT.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib.join;
+
+import static org.junit.Assert.assertTrue;
+
+import org.apache.crunch.Pair;
+import org.apache.crunch.types.PTypeFamily;
+
+public class FullOuterJoinIT extends JoinTester {
+ @Override
+ public void assertPassed(Iterable<Pair<String, Long>> lines) {
+ boolean passed1 = false;
+ boolean passed2 = false;
+ boolean passed3 = false;
+ for (Pair<String, Long> line : lines) {
+ if ("wretched".equals(line.first()) && 24 == line.second()) {
+ passed1 = true;
+ }
+ if ("againe".equals(line.first()) && 10 == line.second()) {
+ passed2 = true;
+ }
+ if ("Montparnasse.".equals(line.first()) && 2 == line.second()) {
+ passed3 = true;
+ }
+ }
+ assertTrue(passed1);
+ assertTrue(passed2);
+ assertTrue(passed3);
+ }
+
+ @Override
+ protected JoinFn<String, Long, Long> getJoinFn(PTypeFamily typeFamily) {
+ return new FullOuterJoinFn<String, Long, Long>(typeFamily.strings(), typeFamily.longs());
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/lib/join/InnerJoinIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/join/InnerJoinIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/join/InnerJoinIT.java
new file mode 100644
index 0000000..4759050
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/join/InnerJoinIT.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib.join;
+
+import static org.junit.Assert.assertTrue;
+
+import org.apache.crunch.Pair;
+import org.apache.crunch.types.PTypeFamily;
+
+public class InnerJoinIT extends JoinTester {
+ @Override
+ public void assertPassed(Iterable<Pair<String, Long>> lines) {
+ boolean passed1 = false;
+ boolean passed2 = true;
+ boolean passed3 = true;
+ for (Pair<String, Long> line : lines) {
+ if ("wretched".equals(line.first()) && 24 == line.second()) {
+ passed1 = true;
+ }
+ if ("againe".equals(line.first())) {
+ passed2 = false;
+ }
+ if ("Montparnasse.".equals(line.first())) {
+ passed3 = false;
+ }
+ }
+ assertTrue(passed1);
+ assertTrue(passed2);
+ assertTrue(passed3);
+ }
+
+ @Override
+ protected JoinFn<String, Long, Long> getJoinFn(PTypeFamily typeFamily) {
+ return new InnerJoinFn<String, Long, Long>(typeFamily.strings(), typeFamily.longs());
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/lib/join/JoinTester.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/join/JoinTester.java b/crunch-core/src/it/java/org/apache/crunch/lib/join/JoinTester.java
new file mode 100644
index 0000000..3e8ffda
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/join/JoinTester.java
@@ -0,0 +1,108 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib.join;
+
+import java.io.IOException;
+import java.io.Serializable;
+
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.lib.Aggregate;
+import org.apache.crunch.lib.Join;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.crunch.types.avro.AvroTypeFamily;
+import org.apache.crunch.types.writable.WritableTypeFamily;
+import org.junit.Rule;
+import org.junit.Test;
+
+public abstract class JoinTester implements Serializable {
+ private static class WordSplit extends DoFn<String, String> {
+ @Override
+ public void process(String input, Emitter<String> emitter) {
+ for (String word : input.split("\\s+")) {
+ emitter.emit(word);
+ }
+ }
+ }
+
+ protected PTable<String, Long> join(PCollection<String> w1, PCollection<String> w2, PTypeFamily ptf) {
+ PTableType<String, Long> ntt = ptf.tableOf(ptf.strings(), ptf.longs());
+ PTable<String, Long> ws1 = Aggregate.count(w1.parallelDo("ws1", new WordSplit(), ptf.strings()));
+ PTable<String, Long> ws2 = Aggregate.count(w2.parallelDo("ws2", new WordSplit(), ptf.strings()));
+
+ PTable<String, Pair<Long, Long>> join = Join.join(ws1, ws2, getJoinFn(ptf));
+
+ PTable<String, Long> sums = join.parallelDo("cnt", new DoFn<Pair<String, Pair<Long, Long>>, Pair<String, Long>>() {
+ @Override
+ public void process(Pair<String, Pair<Long, Long>> input, Emitter<Pair<String, Long>> emitter) {
+ Pair<Long, Long> pair = input.second();
+ long sum = (pair.first() != null ? pair.first() : 0) + (pair.second() != null ? pair.second() : 0);
+ emitter.emit(Pair.of(input.first(), sum));
+ }
+ }, ntt);
+
+ return sums;
+ }
+
+ protected void run(Pipeline pipeline, PTypeFamily typeFamily) throws IOException {
+ String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
+ String maughamInputPath = tmpDir.copyResourceFileName("maugham.txt");
+
+ PCollection<String> shakespeare = pipeline.readTextFile(shakesInputPath);
+ PCollection<String> maugham = pipeline.readTextFile(maughamInputPath);
+ PTable<String, Long> joined = join(shakespeare, maugham, typeFamily);
+ Iterable<Pair<String, Long>> lines = joined.materialize();
+
+ assertPassed(lines);
+
+ pipeline.done();
+ }
+ @Rule
+ public transient TemporaryPath tmpDir = TemporaryPaths.create();
+
+ @Test
+ public void testWritableJoin() throws Exception {
+ run(new MRPipeline(InnerJoinIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance());
+ }
+
+ @Test
+ public void testAvroJoin() throws Exception {
+ run(new MRPipeline(InnerJoinIT.class, tmpDir.getDefaultConfiguration()), AvroTypeFamily.getInstance());
+ }
+
+ /**
+ * Used to check that the result of the join makes sense.
+ *
+ * @param lines
+ * The result of the join.
+ */
+ public abstract void assertPassed(Iterable<Pair<String, Long>> lines);
+
+ /**
+ * @return The JoinFn to use.
+ */
+ protected abstract JoinFn<String, Long, Long> getJoinFn(PTypeFamily typeFamily);
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/lib/join/LeftOuterJoinIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/join/LeftOuterJoinIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/join/LeftOuterJoinIT.java
new file mode 100644
index 0000000..4ad2a81
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/join/LeftOuterJoinIT.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib.join;
+
+import static org.junit.Assert.assertTrue;
+
+import org.apache.crunch.Pair;
+import org.apache.crunch.types.PTypeFamily;
+
+public class LeftOuterJoinIT extends JoinTester {
+ @Override
+ public void assertPassed(Iterable<Pair<String, Long>> lines) {
+ boolean passed1 = false;
+ boolean passed2 = false;
+ boolean passed3 = true;
+ for (Pair<String, Long> line : lines) {
+ if ("wretched".equals(line.first()) && 24 == line.second()) {
+ passed1 = true;
+ }
+ if ("againe".equals(line.first()) && 10 == line.second()) {
+ passed2 = true;
+ }
+ if ("Montparnasse.".equals(line.first())) {
+ passed3 = false;
+ }
+ }
+ assertTrue(passed1);
+ assertTrue(passed2);
+ assertTrue(passed3);
+ }
+
+ @Override
+ protected JoinFn<String, Long, Long> getJoinFn(PTypeFamily typeFamily) {
+ return new LeftOuterJoinFn<String, Long, Long>(typeFamily.strings(), typeFamily.longs());
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/lib/join/MapsideJoinIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/join/MapsideJoinIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/join/MapsideJoinIT.java
new file mode 100644
index 0000000..8bb5586
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/join/MapsideJoinIT.java
@@ -0,0 +1,158 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib.join;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.crunch.MapFn;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.PipelineResult;
+import org.apache.crunch.fn.FilterFns;
+import org.apache.crunch.fn.MapValuesFn;
+import org.apache.crunch.impl.mem.MemPipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.writable.Writables;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+
+public class MapsideJoinIT {
+
+ private static String saveTempDir;
+
+ @BeforeClass
+ public static void setUpClass(){
+
+ // Ensure a consistent temporary directory for use of the DistributedCache.
+
+ // The DistributedCache technically isn't supported when running in local mode, and the default
+ // temporary directory "/tmp" is used as its location. This typically only causes an issue when
+ // running integration tests on Mac OS X, as OS X doesn't use "/tmp" as its default temporary
+ // directory. The following call ensures that "/tmp" is used as the temporary directory on all platforms.
+ saveTempDir = System.setProperty("java.io.tmpdir", "/tmp");
+ }
+
+ @AfterClass
+ public static void tearDownClass(){
+ System.setProperty("java.io.tmpdir", saveTempDir);
+ }
+
+ private static class LineSplitter extends MapFn<String, Pair<Integer, String>> {
+ @Override
+ public Pair<Integer, String> map(String input) {
+ String[] fields = input.split("\\|");
+ return Pair.of(Integer.parseInt(fields[0]), fields[1]);
+ }
+ }
+
+ private static class CapOrdersFn extends MapValuesFn<Integer, String, String> {
+ @Override
+ public String map(String v) {
+ return v.toUpperCase();
+ }
+ }
+
+ private static class ConcatValuesFn extends MapValuesFn<Integer, Pair<String, String>, String> {
+ @Override
+ public String map(Pair<String, String> v) {
+ return v.toString();
+ }
+ }
+
+ @Rule
+ public TemporaryPath tmpDir = TemporaryPaths.create();
+
+ @Test
+ public void testMapSideJoin_MemPipeline() {
+ runMapsideJoin(MemPipeline.getInstance(), true);
+ }
+
+ @Test
+ public void testMapsideJoin_RightSideIsEmpty() throws IOException {
+ MRPipeline pipeline = new MRPipeline(MapsideJoinIT.class, tmpDir.getDefaultConfiguration());
+ PTable<Integer, String> customerTable = readTable(pipeline, "customers.txt");
+ PTable<Integer, String> orderTable = readTable(pipeline, "orders.txt");
+
+ PTable<Integer, String> filteredOrderTable = orderTable
+ .parallelDo(FilterFns.<Pair<Integer, String>>REJECT_ALL(), orderTable.getPTableType());
+
+ PTable<Integer, Pair<String, String>> joined = MapsideJoin.join(customerTable, filteredOrderTable);
+
+ List<Pair<Integer, Pair<String, String>>> materializedJoin = Lists.newArrayList(joined.materialize());
+
+ assertTrue(materializedJoin.isEmpty());
+ }
+
+ @Test
+ public void testMapsideJoin() throws IOException {
+ runMapsideJoin(new MRPipeline(MapsideJoinIT.class, tmpDir.getDefaultConfiguration()), false);
+ }
+
+ private void runMapsideJoin(Pipeline pipeline, boolean inMemory) {
+ PTable<Integer, String> customerTable = readTable(pipeline, "customers.txt");
+ PTable<Integer, String> orderTable = readTable(pipeline, "orders.txt");
+
+ PTable<Integer, String> custOrders = MapsideJoin.join(customerTable, orderTable)
+ .parallelDo("concat", new ConcatValuesFn(), Writables.tableOf(Writables.ints(), Writables.strings()));
+
+ PTable<Integer, String> ORDER_TABLE = orderTable.parallelDo(new CapOrdersFn(), orderTable.getPTableType());
+
+ PTable<Integer, Pair<String, String>> joined = MapsideJoin.join(custOrders, ORDER_TABLE);
+
+ List<Pair<Integer, Pair<String, String>>> expectedJoinResult = Lists.newArrayList();
+ expectedJoinResult.add(Pair.of(111, Pair.of("[John Doe,Corn flakes]", "CORN FLAKES")));
+ expectedJoinResult.add(Pair.of(222, Pair.of("[Jane Doe,Toilet paper]", "TOILET PAPER")));
+ expectedJoinResult.add(Pair.of(222, Pair.of("[Jane Doe,Toilet paper]", "TOILET PLUNGER")));
+ expectedJoinResult.add(Pair.of(222, Pair.of("[Jane Doe,Toilet plunger]", "TOILET PAPER")));
+ expectedJoinResult.add(Pair.of(222, Pair.of("[Jane Doe,Toilet plunger]", "TOILET PLUNGER")));
+ expectedJoinResult.add(Pair.of(333, Pair.of("[Someone Else,Toilet brush]", "TOILET BRUSH")));
+ Iterable<Pair<Integer, Pair<String, String>>> iter = joined.materialize();
+
+ PipelineResult res = pipeline.run();
+ if (!inMemory) {
+ assertEquals(2, res.getStageResults().size());
+ }
+
+ List<Pair<Integer, Pair<String, String>>> joinedResultList = Lists.newArrayList(iter);
+ Collections.sort(joinedResultList);
+
+ assertEquals(expectedJoinResult, joinedResultList);
+ }
+
+ private PTable<Integer, String> readTable(Pipeline pipeline, String filename) {
+ try {
+ return pipeline.readTextFile(tmpDir.copyResourceFileName(filename)).parallelDo("asTable", new LineSplitter(),
+ Writables.tableOf(Writables.ints(), Writables.strings()));
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/lib/join/MultiAvroSchemaJoinIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/join/MultiAvroSchemaJoinIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/join/MultiAvroSchemaJoinIT.java
new file mode 100644
index 0000000..f1ca770
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/join/MultiAvroSchemaJoinIT.java
@@ -0,0 +1,121 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib.join;
+
+import static org.apache.crunch.types.avro.Avros.records;
+import static org.apache.crunch.types.avro.Avros.strings;
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.util.List;
+
+import org.apache.avro.Schema;
+import org.apache.avro.file.DataFileWriter;
+import org.apache.avro.io.DatumWriter;
+import org.apache.avro.specific.SpecificDatumWriter;
+import org.apache.avro.specific.SpecificRecord;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.io.From;
+import org.apache.crunch.test.Employee;
+import org.apache.crunch.test.Person;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+
+ /**
+  * Integration test that joins {@link Person} and {@link Employee} Avro records, which use
+  * different schemas, on the value of their {@code name} field.
+  */
+ public class MultiAvroSchemaJoinIT {
+
+   private File personFile;
+   private File employeeFile;
+   @Rule
+   public TemporaryPath tmpDir = TemporaryPaths.create();
+
+   /** Creates the two temporary Avro input files and fills them with the fixture records. */
+   @Before
+   public void setUp() throws Exception {
+     this.personFile = File.createTempFile("person", ".avro");
+     this.employeeFile = File.createTempFile("employee", ".avro");
+
+     writePeople(personFile);
+     writeEmployees(employeeFile);
+   }
+
+   /** Writes three {@link Person} records (Josh, Kate and Mike) to {@code target}. */
+   private static void writePeople(File target) throws Exception {
+     DatumWriter<Person> pdw = new SpecificDatumWriter<Person>();
+     DataFileWriter<Person> pfw = new DataFileWriter<Person>(pdw);
+     pfw.create(Person.SCHEMA$, target);
+     try {
+       pfw.append(newPerson("Josh", 19, ImmutableList.<CharSequence> of("Kate", "Mike")));
+       pfw.append(newPerson("Kate", 17, ImmutableList.<CharSequence> of("Josh", "Mike")));
+       pfw.append(newPerson("Mike", 12, ImmutableList.<CharSequence> of("Josh", "Kate")));
+     } finally {
+       // Close even when an append fails so the underlying file handle is not leaked.
+       pfw.close();
+     }
+   }
+
+   /** Writes a single {@link Employee} record (Kate) to {@code target}. */
+   private static void writeEmployees(File target) throws Exception {
+     DatumWriter<Employee> edw = new SpecificDatumWriter<Employee>();
+     DataFileWriter<Employee> efw = new DataFileWriter<Employee>(edw);
+     efw.create(Employee.SCHEMA$, target);
+     try {
+       Employee e1 = new Employee();
+       e1.name = "Kate";
+       e1.salary = 100000;
+       e1.department = "Marketing";
+       efw.append(e1);
+     } finally {
+       // Close even when the append fails so the underlying file handle is not leaked.
+       efw.close();
+     }
+   }
+
+   /** Builds a {@link Person} with the given name, age and sibling names. */
+   private static Person newPerson(String name, int age, List<CharSequence> siblingNames) {
+     Person person = new Person();
+     person.name = name;
+     person.age = age;
+     person.siblingnames = siblingNames;
+     return person;
+   }
+
+   /** Deletes the temporary input files created by {@link #setUp()}. */
+   @After
+   public void tearDown() throws Exception {
+     // JUnit still runs @After methods when @Before fails part-way, so either file may be null.
+     if (personFile != null) {
+       personFile.delete();
+     }
+     if (employeeFile != null) {
+       employeeFile.delete();
+     }
+   }
+
+   /**
+    * Extracts the string value of the {@code name} field from any Avro specific record.
+    *
+    * The field is looked up through the record's own schema, so the same function can be
+    * applied to records of different types as long as they declare a {@code name} field.
+    */
+   public static class NameFn<K extends SpecificRecord> extends MapFn<K, String> {
+     @Override
+     public String map(K input) {
+       Schema s = input.getSchema();
+       Schema.Field f = s.getField("name");
+       return input.get(f.pos()).toString();
+     }
+   }
+
+   /**
+    * Joins people and employees by name and checks that the only joined pair is "Kate",
+    * the single name written to both input files.
+    */
+   @Test
+   public void testJoin() throws Exception {
+     Pipeline p = new MRPipeline(MultiAvroSchemaJoinIT.class, tmpDir.getDefaultConfiguration());
+     PCollection<Person> people = p.read(From.avroFile(personFile.getAbsolutePath(), records(Person.class)));
+     PCollection<Employee> employees = p.read(From.avroFile(employeeFile.getAbsolutePath(), records(Employee.class)));
+
+     Iterable<Pair<Person, Employee>> result = people.by(new NameFn<Person>(), strings())
+         .join(employees.by(new NameFn<Employee>(), strings())).values().materialize();
+     List<Pair<Person, Employee>> v = Lists.newArrayList(result);
+     assertEquals(1, v.size());
+     assertEquals("Kate", v.get(0).first().name.toString());
+     assertEquals("Kate", v.get(0).second().name.toString());
+   }
+ }
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/lib/join/RightOuterJoinIT.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/lib/join/RightOuterJoinIT.java b/crunch-core/src/it/java/org/apache/crunch/lib/join/RightOuterJoinIT.java
new file mode 100644
index 0000000..d889b61
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/lib/join/RightOuterJoinIT.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib.join;
+
+import static org.junit.Assert.assertTrue;
+
+import org.apache.crunch.Pair;
+import org.apache.crunch.types.PTypeFamily;
+
+public class RightOuterJoinIT extends JoinTester {
+ @Override
+ public void assertPassed(Iterable<Pair<String, Long>> lines) {
+ boolean passed1 = false;
+ boolean passed2 = true;
+ boolean passed3 = false;
+ for (Pair<String, Long> line : lines) {
+ if ("wretched".equals(line.first()) && 24 == line.second()) {
+ passed1 = true;
+ }
+ if ("againe".equals(line.first())) {
+ passed2 = false;
+ }
+ if ("Montparnasse.".equals(line.first()) && 2 == line.second()) {
+ passed3 = true;
+ }
+ }
+ assertTrue(passed1);
+ assertTrue(passed2);
+ assertTrue(passed3);
+ }
+
+ @Override
+ protected JoinFn<String, Long, Long> getJoinFn(PTypeFamily typeFamily) {
+ return new RightOuterJoinFn<String, Long, Long>(typeFamily.strings(), typeFamily.longs());
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/test/TemporaryPaths.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/test/TemporaryPaths.java b/crunch-core/src/it/java/org/apache/crunch/test/TemporaryPaths.java
new file mode 100644
index 0000000..97cf0de
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/test/TemporaryPaths.java
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.test;
+
+import org.apache.crunch.impl.mr.run.RuntimeParameters;
+import org.apache.hadoop.conf.Configuration;
+
+
+/**
+ * Utilities for working with {@link TemporaryPath}.
+ */
+public final class TemporaryPaths {
+
+ /**
+ * Static factory returning a {@link TemporaryPath} with adjusted
+ * {@link Configuration} properties.
+ */
+ public static TemporaryPath create() {
+ return new TemporaryPath(RuntimeParameters.TMP_DIR, "hadoop.tmp.dir");
+ }
+
+ private TemporaryPaths() {
+ // nothing
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/java/org/apache/crunch/test/Tests.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/java/org/apache/crunch/test/Tests.java b/crunch-core/src/it/java/org/apache/crunch/test/Tests.java
new file mode 100644
index 0000000..e381c1a
--- /dev/null
+++ b/crunch-core/src/it/java/org/apache/crunch/test/Tests.java
@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.test;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+
+import java.io.IOException;
+import java.util.Collection;
+
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.impl.mem.MemPipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.hadoop.io.Writable;
+import org.junit.runners.Parameterized.Parameters;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.io.ByteArrayDataOutput;
+import com.google.common.io.ByteStreams;
+import com.google.common.io.Resources;
+
+
+/**
+ * Utilities for integration tests.
+ */
+public final class Tests {
+
+ private Tests() {
+ // nothing
+ }
+
+ /**
+ * Get the path to and integration test resource file, as per naming convention.
+ *
+ * @param testCase The executing test case instance
+ * @param resourceName The file name of the resource
+ * @return The path to the resource (never null)
+ * @throws IllegalArgumentException Thrown if the resource doesn't exist
+ */
+ public static String pathTo(Object testCase, String resourceName) {
+ String qualifiedName = resource(testCase, resourceName);
+ return Resources.getResource(qualifiedName).getFile();
+ }
+
+ /**
+ * This doesn't check whether the resource exists!
+ *
+ * @param testCase
+ * @param resourceName
+ * @return The path to the resource (never null)
+ */
+ public static String resource(Object testCase, String resourceName) {
+ checkNotNull(testCase);
+ checkNotNull(resourceName);
+
+ // Note: We append "Data" because otherwise Eclipse would complain about the
+ // the case's class name clashing with the resource directory's name.
+ return testCase.getClass().getName().replaceAll("\\.", "/") + "Data/" + resourceName;
+ }
+
+ /**
+ * Return our two types of {@link Pipeline}s for a JUnit Parameterized test.
+ *
+ * @param testCase The executing test case's class
+ * @return The collection to return from a {@link Parameters} provider method
+ */
+ public static Collection<Object[]> pipelinesParams(Class<?> testCase) {
+ return ImmutableList.copyOf(
+ new Object[][] { { MemPipeline.getInstance() }, { new MRPipeline(testCase) }
+ });
+ }
+
+ /**
+ * Serialize the given Writable into a byte array.
+ *
+ * @param value The instance to serialize
+ * @return The serialized data
+ */
+ public static byte[] serialize(Writable value) {
+ checkNotNull(value);
+ try {
+ ByteArrayDataOutput out = ByteStreams.newDataOutput();
+ value.write(out);
+ return out.toByteArray();
+ } catch (IOException e) {
+ throw new IllegalStateException("cannot serialize", e);
+ }
+ }
+
+ /**
+ * Serialize the src Writable into a byte array, then deserialize it into dest.
+ * @param src The instance to serialize
+ * @param dest The instance to deserialize into
+ * @return dest, for convenience
+ */
+ public static <T extends Writable> T roundtrip(Writable src, T dest) {
+ checkNotNull(src);
+ checkNotNull(dest);
+ checkArgument(src != dest, "src and dest may not be the same instance");
+
+ try {
+ byte[] data = serialize(src);
+ dest.readFields(ByteStreams.newDataInput(data));
+ } catch (IOException e) {
+ throw new IllegalStateException("cannot deserialize", e);
+ }
+ return dest;
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/resources/customers.txt
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/resources/customers.txt b/crunch-core/src/it/resources/customers.txt
new file mode 100644
index 0000000..98f3f3d
--- /dev/null
+++ b/crunch-core/src/it/resources/customers.txt
@@ -0,0 +1,4 @@
+111|John Doe
+222|Jane Doe
+333|Someone Else
+444|Has No Orders
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/resources/docs.txt
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/resources/docs.txt b/crunch-core/src/it/resources/docs.txt
new file mode 100644
index 0000000..90a3f65
--- /dev/null
+++ b/crunch-core/src/it/resources/docs.txt
@@ -0,0 +1,6 @@
+A this doc has this text
+A and this text as well
+A but also this
+B this doc has some text
+B but not as much as the last
+B doc
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/resources/emptyTextFile.txt
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/resources/emptyTextFile.txt b/crunch-core/src/it/resources/emptyTextFile.txt
new file mode 100644
index 0000000..e69de29
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/resources/letters.txt
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/resources/letters.txt b/crunch-core/src/it/resources/letters.txt
new file mode 100644
index 0000000..916bfc9
--- /dev/null
+++ b/crunch-core/src/it/resources/letters.txt
@@ -0,0 +1,2 @@
+a
+bb
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/resources/log4j.properties b/crunch-core/src/it/resources/log4j.properties
new file mode 100644
index 0000000..5d144a0
--- /dev/null
+++ b/crunch-core/src/it/resources/log4j.properties
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ***** Set the org.apache.crunch logger level to INFO and its only appender to A.
+log4j.logger.org.apache.crunch=info, A
+
+# Log warnings on Hadoop for the local runner when testing
+log4j.logger.org.apache.hadoop=warn, A
+# Except for Configuration, which is chatty.
+log4j.logger.org.apache.hadoop.conf.Configuration=error, A
+
+# ***** A is set to be a ConsoleAppender.
+log4j.appender.A=org.apache.log4j.ConsoleAppender
+# ***** A uses PatternLayout.
+log4j.appender.A.layout=org.apache.log4j.PatternLayout
+log4j.appender.A.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
[18/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/resources/maugham.txt
----------------------------------------------------------------------
diff --git a/crunch/src/it/resources/maugham.txt b/crunch/src/it/resources/maugham.txt
deleted file mode 100644
index 16c45e8..0000000
--- a/crunch/src/it/resources/maugham.txt
+++ /dev/null
@@ -1,29112 +0,0 @@
-The Project Gutenberg EBook of Of Human Bondage, by W. Somerset Maugham
-
-This eBook is for the use of anyone anywhere at no cost and with
-almost no restrictions whatsoever. You may copy it, give it away or
-re-use it under the terms of the Project Gutenberg License included
-with this eBook or online at www.gutenberg.net
-
-
-Title: Of Human Bondage
-
-Author: W. Somerset Maugham
-
-Release Date: May 6, 2008 [EBook #351]
-
-Language: English
-
-
-*** START OF THIS PROJECT GUTENBERG EBOOK OF HUMAN BONDAGE ***
-
-
-
-
-
-
-
-
-
-
-
-
-OF HUMAN BONDAGE
-
-
-BY
-
-W. SOMERSET MAUGHAM
-
-
-
-
-I
-
-The day broke gray and dull. The clouds hung heavily, and there was a
-rawness in the air that suggested snow. A woman servant came into a room
-in which a child was sleeping and drew the curtains. She glanced
-mechanically at the house opposite, a stucco house with a portico, and
-went to the child's bed.
-
-"Wake up, Philip," she said.
-
-She pulled down the bed-clothes, took him in her arms, and carried him
-downstairs. He was only half awake.
-
-"Your mother wants you," she said.
-
-She opened the door of a room on the floor below and took the child over
-to a bed in which a woman was lying. It was his mother. She stretched out
-her arms, and the child nestled by her side. He did not ask why he had
-been awakened. The woman kissed his eyes, and with thin, small hands felt
-the warm body through his white flannel nightgown. She pressed him closer
-to herself.
-
-"Are you sleepy, darling?" she said.
-
-Her voice was so weak that it seemed to come already from a great
-distance. The child did not answer, but smiled comfortably. He was very
-happy in the large, warm bed, with those soft arms about him. He tried to
-make himself smaller still as he cuddled up against his mother, and he
-kissed her sleepily. In a moment he closed his eyes and was fast asleep.
-The doctor came forwards and stood by the bed-side.
-
-"Oh, don't take him away yet," she moaned.
-
-The doctor, without answering, looked at her gravely. Knowing she would
-not be allowed to keep the child much longer, the woman kissed him again;
-and she passed her hand down his body till she came to his feet; she held
-the right foot in her hand and felt the five small toes; and then slowly
-passed her hand over the left one. She gave a sob.
-
-"What's the matter?" said the doctor. "You're tired."
-
-She shook her head, unable to speak, and the tears rolled down her cheeks.
-The doctor bent down.
-
-"Let me take him."
-
-She was too weak to resist his wish, and she gave the child up. The doctor
-handed him back to his nurse.
-
-"You'd better put him back in his own bed."
-
-"Very well, sir." The little boy, still sleeping, was taken away. His
-mother sobbed now broken-heartedly.
-
-"What will happen to him, poor child?"
-
-The monthly nurse tried to quiet her, and presently, from exhaustion, the
-crying ceased. The doctor walked to a table on the other side of the room,
-upon which, under a towel, lay the body of a still-born child. He lifted
-the towel and looked. He was hidden from the bed by a screen, but the
-woman guessed what he was doing.
-
-"Was it a girl or a boy?" she whispered to the nurse.
-
-"Another boy."
-
-The woman did not answer. In a moment the child's nurse came back. She
-approached the bed.
-
-"Master Philip never woke up," she said. There was a pause. Then the
-doctor felt his patient's pulse once more.
-
-"I don't think there's anything I can do just now," he said. "I'll call
-again after breakfast."
-
-"I'll show you out, sir," said the child's nurse.
-
-They walked downstairs in silence. In the hall the doctor stopped.
-
-"You've sent for Mrs. Carey's brother-in-law, haven't you?"
-
-"Yes, sir."
-
-"D'you know at what time he'll be here?"
-
-"No, sir, I'm expecting a telegram."
-
-"What about the little boy? I should think he'd be better out of the way."
-
-"Miss Watkin said she'd take him, sir."
-
-"Who's she?"
-
-"She's his godmother, sir. D'you think Mrs. Carey will get over it, sir?"
-
-The doctor shook his head.
-
-
-
-II
-
-It was a week later. Philip was sitting on the floor in the drawing-room
-at Miss Watkin's house in Onslow gardens. He was an only child and used to
-amusing himself. The room was filled with massive furniture, and on each
-of the sofas were three big cushions. There was a cushion too in each
-arm-chair. All these he had taken and, with the help of the gilt rout
-chairs, light and easy to move, had made an elaborate cave in which he
-could hide himself from the Red Indians who were lurking behind the
-curtains. He put his ear to the floor and listened to the herd of
-buffaloes that raced across the prairie. Presently, hearing the door open,
-he held his breath so that he might not be discovered; but a violent hand
-piled away a chair and the cushions fell down.
-
-"You naughty boy, Miss Watkin WILL be cross with you."
-
-"Hulloa, Emma!" he said.
-
-The nurse bent down and kissed him, then began to shake out the cushions,
-and put them back in their places.
-
-"Am I to come home?" he asked.
-
-"Yes, I've come to fetch you."
-
-"You've got a new dress on."
-
-It was in eighteen-eighty-five, and she wore a bustle. Her gown was of
-black velvet, with tight sleeves and sloping shoulders, and the skirt had
-three large flounces. She wore a black bonnet with velvet strings. She
-hesitated. The question she had expected did not come, and so she could
-not give the answer she had prepared.
-
-"Aren't you going to ask how your mamma is?" she said at length.
-
-"Oh, I forgot. How is mamma?"
-
-Now she was ready.
-
-"Your mamma is quite well and happy."
-
-"Oh, I am glad."
-
-"Your mamma's gone away. You won't ever see her any more." Philip did not
-know what she meant.
-
-"Why not?"
-
-"Your mamma's in heaven."
-
-She began to cry, and Philip, though he did not quite understand, cried
-too. Emma was a tall, big-boned woman, with fair hair and large features.
-She came from Devonshire and, notwithstanding her many years of service in
-London, had never lost the breadth of her accent. Her tears increased her
-emotion, and she pressed the little boy to her heart. She felt vaguely the
-pity of that child deprived of the only love in the world that is quite
-unselfish. It seemed dreadful that he must be handed over to strangers.
-But in a little while she pulled herself together.
-
-"Your Uncle William is waiting in to see you," she said. "Go and say
-good-bye to Miss Watkin, and we'll go home."
-
-"I don't want to say good-bye," he answered, instinctively anxious to hide
-his tears.
-
-"Very well, run upstairs and get your hat."
-
-He fetched it, and when he came down Emma was waiting for him in the hall.
-He heard the sound of voices in the study behind the dining-room. He
-paused. He knew that Miss Watkin and her sister were talking to friends,
-and it seemed to him--he was nine years old--that if he went in they would
-be sorry for him.
-
-"I think I'll go and say good-bye to Miss Watkin."
-
-"I think you'd better," said Emma.
-
-"Go in and tell them I'm coming," he said.
-
-He wished to make the most of his opportunity. Emma knocked at the door
-and walked in. He heard her speak.
-
-"Master Philip wants to say good-bye to you, miss."
-
-There was a sudden hush of the conversation, and Philip limped in.
-Henrietta Watkin was a stout woman, with a red face and dyed hair. In
-those days to dye the hair excited comment, and Philip had heard much
-gossip at home when his godmother's changed colour. She lived with an
-elder sister, who had resigned herself contentedly to old age. Two ladies,
-whom Philip did not know, were calling, and they looked at him curiously.
-
-"My poor child," said Miss Watkin, opening her arms.
-
-She began to cry. Philip understood now why she had not been in to
-luncheon and why she wore a black dress. She could not speak.
-
-"I've got to go home," said Philip, at last.
-
-He disengaged himself from Miss Watkin's arms, and she kissed him again.
-Then he went to her sister and bade her good-bye too. One of the strange
-ladies asked if she might kiss him, and he gravely gave her permission.
-Though crying, he keenly enjoyed the sensation he was causing; he would
-have been glad to stay a little longer to be made much of, but felt they
-expected him to go, so he said that Emma was waiting for him. He went out
-of the room. Emma had gone downstairs to speak with a friend in the
-basement, and he waited for her on the landing. He heard Henrietta
-Watkin's voice.
-
-"His mother was my greatest friend. I can't bear to think that she's
-dead."
-
-"You oughtn't to have gone to the funeral, Henrietta," said her sister. "I
-knew it would upset you."
-
-Then one of the strangers spoke.
-
-"Poor little boy, it's dreadful to think of him quite alone in the world.
-I see he limps."
-
-"Yes, he's got a club-foot. It was such a grief to his mother."
-
-Then Emma came back. They called a hansom, and she told the driver where
-to go.
-
-
-
-III
-
-
-When they reached the house Mrs. Carey had died in--it was in a dreary,
-respectable street between Notting Hill Gate and High Street,
-Kensington--Emma led Philip into the drawing-room. His uncle was writing
-letters of thanks for the wreaths which had been sent. One of them, which
-had arrived too late for the funeral, lay in its cardboard box on the
-hall-table.
-
-"Here's Master Philip," said Emma.
-
-Mr. Carey stood up slowly and shook hands with the little boy. Then on
-second thoughts he bent down and kissed his forehead. He was a man of
-somewhat less than average height, inclined to corpulence, with his hair,
-worn long, arranged over the scalp so as to conceal his baldness. He was
-clean-shaven. His features were regular, and it was possible to imagine
-that in his youth he had been good-looking. On his watch-chain he wore a
-gold cross.
-
-"You're going to live with me now, Philip," said Mr. Carey. "Shall you
-like that?"
-
-Two years before Philip had been sent down to stay at the vicarage after
-an attack of chicken-pox; but there remained with him a recollection of an
-attic and a large garden rather than of his uncle and aunt.
-
-"Yes."
-
-"You must look upon me and your Aunt Louisa as your father and mother."
-
-The child's mouth trembled a little, he reddened, but did not answer.
-
-"Your dear mother left you in my charge."
-
-Mr. Carey had no great ease in expressing himself. When the news came that
-his sister-in-law was dying, he set off at once for London, but on the way
-thought of nothing but the disturbance in his life that would be caused if
-her death forced him to undertake the care of her son. He was well over
-fifty, and his wife, to whom he had been married for thirty years, was
-childless; he did not look forward with any pleasure to the presence of a
-small boy who might be noisy and rough. He had never much liked his
-sister-in-law.
-
-"I'm going to take you down to Blackstable tomorrow," he said.
-
-"With Emma?"
-
-The child put his hand in hers, and she pressed it.
-
-"I'm afraid Emma must go away," said Mr. Carey.
-
-"But I want Emma to come with me."
-
-Philip began to cry, and the nurse could not help crying too. Mr. Carey
-looked at them helplessly.
-
-"I think you'd better leave me alone with Master Philip for a moment."
-
-"Very good, sir."
-
-Though Philip clung to her, she released herself gently. Mr. Carey took
-the boy on his knee and put his arm round him.
-
-"You mustn't cry," he said. "You're too old to have a nurse now. We must
-see about sending you to school."
-
-"I want Emma to come with me," the child repeated.
-
-"It costs too much money, Philip. Your father didn't leave very much, and
-I don't know what's become of it. You must look at every penny you spend."
-
-Mr. Carey had called the day before on the family solicitor. Philip's
-father was a surgeon in good practice, and his hospital appointments
-suggested an established position; so that it was a surprise on his sudden
-death from blood-poisoning to find that he had left his widow little more
-than his life insurance and what could be got for the lease of their house
-in Bruton Street. This was six months ago; and Mrs. Carey, already in
-delicate health, finding herself with child, had lost her head and
-accepted for the lease the first offer that was made. She stored her
-furniture, and, at a rent which the parson thought outrageous, took a
-furnished house for a year, so that she might suffer from no inconvenience
-till her child was born. But she had never been used to the management of
-money, and was unable to adapt her expenditure to her altered
-circumstances. The little she had slipped through her fingers in one way
-and another, so that now, when all expenses were paid, not much more than
-two thousand pounds remained to support the boy till he was able to earn
-his own living. It was impossible to explain all this to Philip and he was
-sobbing still.
-
-"You'd better go to Emma," Mr. Carey said, feeling that she could console
-the child better than anyone.
-
-Without a word Philip slipped off his uncle's knee, but Mr. Carey stopped
-him.
-
-"We must go tomorrow, because on Saturday I've got to prepare my sermon,
-and you must tell Emma to get your things ready today. You can bring all
-your toys. And if you want anything to remember your father and mother by
-you can take one thing for each of them. Everything else is going to be
-sold."
-
-The boy slipped out of the room. Mr. Carey was unused to work, and he
-turned to his correspondence with resentment. On one side of the desk was
-a bundle of bills, and these filled him with irritation. One especially
-seemed preposterous. Immediately after Mrs. Carey's death Emma had ordered
-from the florist masses of white flowers for the room in which the dead
-woman lay. It was sheer waste of money. Emma took far too much upon
-herself. Even if there had been no financial necessity, he would have
-dismissed her.
-
-But Philip went to her, and hid his face in her bosom, and wept as though
-his heart would break. And she, feeling that he was almost her own
-son--she had taken him when he was a month old--consoled him with soft
-words. She promised that she would come and see him sometimes, and that
-she would never forget him; and she told him about the country he was
-going to and about her own home in Devonshire--her father kept a turnpike
-on the high-road that led to Exeter, and there were pigs in the sty, and
-there was a cow, and the cow had just had a calf--till Philip forgot his
-tears and grew excited at the thought of his approaching journey.
-Presently she put him down, for there was much to be done, and he helped
-her to lay out his clothes on the bed. She sent him into the nursery to
-gather up his toys, and in a little while he was playing happily.
-
-But at last he grew tired of being alone and went back to the bed-room, in
-which Emma was now putting his things into a big tin box; he remembered
-then that his uncle had said he might take something to remember his
-father and mother by. He told Emma and asked her what he should take.
-
-"You'd better go into the drawing-room and see what you fancy."
-
-"Uncle William's there."
-
-"Never mind that. They're your own things now."
-
-Philip went downstairs slowly and found the door open. Mr. Carey had left
-the room. Philip walked slowly round. They had been in the house so short
-a time that there was little in it that had a particular interest to him.
-It was a stranger's room, and Philip saw nothing that struck his fancy.
-But he knew which were his mother's things and which belonged to the
-landlord, and presently fixed on a little clock that he had once heard his
-mother say she liked. With this he walked again rather disconsolately
-upstairs. Outside the door of his mother's bed-room he stopped and
-listened. Though no one had told him not to go in, he had a feeling that
-it would be wrong to do so; he was a little frightened, and his heart beat
-uncomfortably; but at the same time something impelled him to turn the
-handle. He turned it very gently, as if to prevent anyone within from
-hearing, and then slowly pushed the door open. He stood on the threshold
-for a moment before he had the courage to enter. He was not frightened
-now, but it seemed strange. He closed the door behind him. The blinds were
-drawn, and the room, in the cold light of a January afternoon, was dark.
-On the dressing-table were Mrs. Carey's brushes and the hand mirror. In a
-little tray were hairpins. There was a photograph of himself on the
-chimney-piece and one of his father. He had often been in the room when
-his mother was not in it, but now it seemed different. There was something
-curious in the look of the chairs. The bed was made as though someone were
-going to sleep in it that night, and in a case on the pillow was a
-night-dress.
-
-Philip opened a large cupboard filled with dresses and, stepping in, took
-as many of them as he could in his arms and buried his face in them. They
-smelt of the scent his mother used. Then he pulled open the drawers,
-filled with his mother's things, and looked at them: there were lavender
-bags among the linen, and their scent was fresh and pleasant. The
-strangeness of the room left it, and it seemed to him that his mother had
-just gone out for a walk. She would be in presently and would come
-upstairs to have nursery tea with him. And he seemed to feel her kiss on
-his lips.
-
-It was not true that he would never see her again. It was not true simply
-because it was impossible. He climbed up on the bed and put his head on
-the pillow. He lay there quite still.
-
-
-
-IV
-
-
-Philip parted from Emma with tears, but the journey to Blackstable amused
-him, and, when they arrived, he was resigned and cheerful. Blackstable was
-sixty miles from London. Giving their luggage to a porter, Mr. Carey set
-out to walk with Philip to the vicarage; it took them little more than
-five minutes, and, when they reached it, Philip suddenly remembered the
-gate. It was red and five-barred: it swung both ways on easy hinges; and
-it was possible, though forbidden, to swing backwards and forwards on it.
-They walked through the garden to the front-door. This was only used by
-visitors and on Sundays, and on special occasions, as when the Vicar went
-up to London or came back. The traffic of the house took place through a
-side-door, and there was a back door as well for the gardener and for
-beggars and tramps. It was a fairly large house of yellow brick, with a
-red roof, built about five and twenty years before in an ecclesiastical
-style. The front-door was like a church porch, and the drawing-room
-windows were gothic.
-
-Mrs. Carey, knowing by what train they were coming, waited in the
-drawing-room and listened for the click of the gate. When she heard it she
-went to the door.
-
-"There's Aunt Louisa," said Mr. Carey, when he saw her. "Run and give her
-a kiss."
-
-Philip started to run, awkwardly, trailing his club-foot, and then
-stopped. Mrs. Carey was a little, shrivelled woman of the same age as her
-husband, with a face extraordinarily filled with deep wrinkles, and pale
-blue eyes. Her gray hair was arranged in ringlets according to the fashion
-of her youth. She wore a black dress, and her only ornament was a gold
-chain, from which hung a cross. She had a shy manner and a gentle voice.
-
-"Did you walk, William?" she said, almost reproachfully, as she kissed her
-husband.
-
-"I didn't think of it," he answered, with a glance at his nephew.
-
-"It didn't hurt you to walk, Philip, did it?" she asked the child.
-
-"No. I always walk."
-
-He was a little surprised at their conversation. Aunt Louisa told him to
-come in, and they entered the hall. It was paved with red and yellow
-tiles, on which alternately were a Greek Cross and the Lamb of God. An
-imposing staircase led out of the hall. It was of polished pine, with a
-peculiar smell, and had been put in because fortunately, when the church
-was reseated, enough wood remained over. The balusters were decorated with
-emblems of the Four Evangelists.
-
-"I've had the stove lighted as I thought you'd be cold after your
-journey," said Mrs. Carey.
-
-It was a large black stove that stood in the hall and was only lighted if
-the weather was very bad and the Vicar had a cold. It was not lighted if
-Mrs. Carey had a cold. Coal was expensive. Besides, Mary Ann, the maid,
-didn't like fires all over the place. If they wanted all them fires they
-must keep a second girl. In the winter Mr. and Mrs. Carey lived in the
-dining-room so that one fire should do, and in the summer they could not
-get out of the habit, so the drawing-room was used only by Mr. Carey on
-Sunday afternoons for his nap. But every Saturday he had a fire in the
-study so that he could write his sermon.
-
-Aunt Louisa took Philip upstairs and showed him into a tiny bed-room that
-looked out on the drive. Immediately in front of the window was a large
-tree, which Philip remembered now because the branches were so low that it
-was possible to climb quite high up it.
-
-"A small room for a small boy," said Mrs. Carey. "You won't be frightened
-at sleeping alone?"
-
-"Oh, no."
-
-On his first visit to the vicarage he had come with his nurse, and Mrs.
-Carey had had little to do with him. She looked at him now with some
-uncertainty.
-
-"Can you wash your own hands, or shall I wash them for you?"
-
-"I can wash myself," he answered firmly.
-
-"Well, I shall look at them when you come down to tea," said Mrs. Carey.
-
-She knew nothing about children. After it was settled that Philip should
-come down to Blackstable, Mrs. Carey had thought much how she should treat
-him; she was anxious to do her duty; but now he was there she found
-herself just as shy of him as he was of her. She hoped he would not be
-noisy and rough, because her husband did not like rough and noisy boys.
-Mrs. Carey made an excuse to leave Philip alone, but in a moment came back
-and knocked at the door; she asked him, without coming in, if he could
-pour out the water himself. Then she went downstairs and rang the bell for
-tea.
-
-The dining-room, large and well-proportioned, had windows on two sides of
-it, with heavy curtains of red rep; there was a big table in the middle;
-and at one end an imposing mahogany sideboard with a looking-glass in it.
-In one corner stood a harmonium. On each side of the fireplace were chairs
-covered in stamped leather, each with an antimacassar; one had arms and
-was called the husband, and the other had none and was called the wife.
-Mrs. Carey never sat in the arm-chair: she said she preferred a chair that
-was not too comfortable; there was always a lot to do, and if her chair
-had had arms she might not be so ready to leave it.
-
-Mr. Carey was making up the fire when Philip came in, and he pointed out
-to his nephew that there were two pokers. One was large and bright and
-polished and unused, and was called the Vicar; and the other, which was
-much smaller and had evidently passed through many fires, was called the
-Curate.
-
-"What are we waiting for?" said Mr. Carey.
-
-"I told Mary Ann to make you an egg. I thought you'd be hungry after your
-journey."
-
-Mrs. Carey thought the journey from London to Blackstable very tiring. She
-seldom travelled herself, for the living was only three hundred a year,
-and, when her husband wanted a holiday, since there was not money for two,
-he went by himself. He was very fond of Church Congresses and usually
-managed to go up to London once a year; and once he had been to Paris for
-the exhibition, and two or three times to Switzerland. Mary Ann brought in
-the egg, and they sat down. The chair was much too low for Philip, and for
-a moment neither Mr. Carey nor his wife knew what to do.
-
-"I'll put some books under him," said Mary Ann.
-
-She took from the top of the harmonium the large Bible and the prayer-book
-from which the Vicar was accustomed to read prayers, and put them on
-Philip's chair.
-
-"Oh, William, he can't sit on the Bible," said Mrs. Carey, in a shocked
-tone. "Couldn't you get him some books out of the study?"
-
-Mr. Carey considered the question for an instant.
-
-"I don't think it matters this once if you put the prayer-book on the top,
-Mary Ann," he said. "The book of Common Prayer is the composition of men
-like ourselves. It has no claim to divine authorship."
-
-"I hadn't thought of that, William," said Aunt Louisa.
-
-Philip perched himself on the books, and the Vicar, having said grace, cut
-the top off his egg.
-
-"There," he said, handing it to Philip, "you can eat my top if you like."
-
-Philip would have liked an egg to himself, but he was not offered one, so
-took what he could.
-
-"How have the chickens been laying since I went away?" asked the Vicar.
-
-"Oh, they've been dreadful, only one or two a day."
-
-"How did you like that top, Philip?" asked his uncle.
-
-"Very much, thank you."
-
-"You shall have another one on Sunday afternoon."
-
-Mr. Carey always had a boiled egg at tea on Sunday, so that he might be
-fortified for the evening service.
-
-
-
-V
-
-
-Philip came gradually to know the people he was to live with, and by
-fragments of conversation, some of it not meant for his ears, learned a
-good deal both about himself and about his dead parents. Philip's father
-had been much younger than the Vicar of Blackstable. After a brilliant
-career at St. Luke's Hospital he was put on the staff, and presently began
-to earn money in considerable sums. He spent it freely. When the parson
-set about restoring his church and asked his brother for a subscription,
-he was surprised by receiving a couple of hundred pounds: Mr. Carey,
-thrifty by inclination and economical by necessity, accepted it with
-mingled feelings; he was envious of his brother because he could afford to
-give so much, pleased for the sake of his church, and vaguely irritated by
-a generosity which seemed almost ostentatious. Then Henry Carey married a
-patient, a beautiful girl but penniless, an orphan with no near relations,
-but of good family; and there was an array of fine friends at the wedding.
-The parson, on his visits to her when he came to London, held himself with
-reserve. He felt shy with her and in his heart he resented her great
-beauty: she dressed more magnificently than became the wife of a
-hardworking surgeon; and the charming furniture of her house, the flowers
-among which she lived even in winter, suggested an extravagance which he
-deplored. He heard her talk of entertainments she was going to; and, as he
-told his wife on getting home again, it was impossible to accept
-hospitality without making some return. He had seen grapes in the
-dining-room that must have cost at least eight shillings a pound; and at
-luncheon he had been given asparagus two months before it was ready in the
-vicarage garden. Now all he had anticipated was come to pass: the Vicar
-felt the satisfaction of the prophet who saw fire and brimstone consume
-the city which would not mend its way to his warning. Poor Philip was
-practically penniless, and what was the good of his mother's fine friends
-now? He heard that his father's extravagance was really criminal, and it
-was a mercy that Providence had seen fit to take his dear mother to
-itself: she had no more idea of money than a child.
-
-When Philip had been a week at Blackstable an incident happened which
-seemed to irritate his uncle very much. One morning he found on the
-breakfast table a small packet which had been sent on by post from the
-late Mrs. Carey's house in London. It was addressed to her. When the
-parson opened it he found a dozen photographs of Mrs. Carey. They showed
-the head and shoulders only, and her hair was more plainly done than
-usual, low on the forehead, which gave her an unusual look; the face was
-thin and worn, but no illness could impair the beauty of her features.
-There was in the large dark eyes a sadness which Philip did not remember.
-The first sight of the dead woman gave Mr. Carey a little shock, but this
-was quickly followed by perplexity. The photographs seemed quite recent,
-and he could not imagine who had ordered them.
-
-"D'you know anything about these, Philip?" he asked.
-
-"I remember mamma said she'd been taken," he answered. "Miss Watkin
-scolded her.... She said: I wanted the boy to have something to remember
-me by when he grows up."
-
-Mr. Carey looked at Philip for an instant. The child spoke in a clear
-treble. He recalled the words, but they meant nothing to him.
-
-"You'd better take one of the photographs and keep it in your room," said
-Mr. Carey. "I'll put the others away."
-
-He sent one to Miss Watkin, and she wrote and explained how they came to
-be taken.
-
-One day Mrs. Carey was lying in bed, but she was feeling a little better
-than usual, and the doctor in the morning had seemed hopeful; Emma had
-taken the child out, and the maids were downstairs in the basement:
-suddenly Mrs. Carey felt desperately alone in the world. A great fear
-seized her that she would not recover from the confinement which she was
-expecting in a fortnight. Her son was nine years old. How could he be
-expected to remember her? She could not bear to think that he would grow
-up and forget, forget her utterly; and she had loved him so passionately,
-because he was weakly and deformed, and because he was her child. She had
-no photographs of herself taken since her marriage, and that was ten years
-before. She wanted her son to know what she looked like at the end. He
-could not forget her then, not forget utterly. She knew that if she called
-her maid and told her she wanted to get up, the maid would prevent her,
-and perhaps send for the doctor, and she had not the strength now to
-struggle or argue. She got out of bed and began to dress herself. She had
-been on her back so long that her legs gave way beneath her, and then the
-soles of her feet tingled so that she could hardly bear to put them to the
-ground. But she went on. She was unused to doing her own hair and, when
-she raised her arms and began to brush it, she felt faint. She could never
-do it as her maid did. It was beautiful hair, very fine, and of a deep
-rich gold. Her eyebrows were straight and dark. She put on a black skirt,
-but chose the bodice of the evening dress which she liked best: it was of
-a white damask which was fashionable in those days. She looked at herself
-in the glass. Her face was very pale, but her skin was clear: she had
-never had much colour, and this had always made the redness of her
-beautiful mouth emphatic. She could not restrain a sob. But she could not
-afford to be sorry for herself; she was feeling already desperately tired;
-and she put on the furs which Henry had given her the Christmas
-before--she had been so proud of them and so happy then--and slipped
-downstairs with beating heart. She got safely out of the house and drove
-to a photographer. She paid for a dozen photographs. She was obliged to
-ask for a glass of water in the middle of the sitting; and the assistant,
-seeing she was ill, suggested that she should come another day, but she
-insisted on staying till the end. At last it was finished, and she drove
-back again to the dingy little house in Kensington which she hated with
-all her heart. It was a horrible house to die in.
-
-She found the front door open, and when she drove up the maid and Emma ran
-down the steps to help her. They had been frightened when they found her
-room empty. At first they thought she must have gone to Miss Watkin, and
-the cook was sent round. Miss Watkin came back with her and was waiting
-anxiously in the drawing-room. She came downstairs now full of anxiety and
-reproaches; but the exertion had been more than Mrs. Carey was fit for,
-and when the occasion for firmness no longer existed she gave way. She
-fell heavily into Emma's arms and was carried upstairs. She remained
-unconscious for a time that seemed incredibly long to those that watched
-her, and the doctor, hurriedly sent for, did not come. It was next day,
-when she was a little better, that Miss Watkin got some explanation out of
-her. Philip was playing on the floor of his mother's bed-room, and neither
-of the ladies paid attention to him. He only understood vaguely what they
-were talking about, and he could not have said why those words remained in
-his memory.
-
-"I wanted the boy to have something to remember me by when he grows up."
-
-"I can't make out why she ordered a dozen," said Mr. Carey. "Two would
-have done."
-
-
-
-VI
-
-
-One day was very like another at the vicarage.
-
-Soon after breakfast Mary Ann brought in The Times. Mr. Carey shared it
-with two neighbours. He had it from ten till one, when the gardener took
-it over to Mr. Ellis at the Limes, with whom it remained till seven; then
-it was taken to Miss Brooks at the Manor House, who, since she got it
-late, had the advantage of keeping it. In summer Mrs. Carey, when she was
-making jam, often asked her for a copy to cover the pots with. When the
-Vicar settled down to his paper his wife put on her bonnet and went out to
-do the shopping. Philip accompanied her. Blackstable was a fishing
-village. It consisted of a high street in which were the shops, the bank,
-the doctor's house, and the houses of two or three coalship owners; round
-the little harbor were shabby streets in which lived fishermen and poor
-people; but since they went to chapel they were of no account. When Mrs.
-Carey passed the dissenting ministers in the street she stepped over to
-the other side to avoid meeting them, but if there was not time for this
-fixed her eyes on the pavement. It was a scandal to which the Vicar had
-never resigned himself that there were three chapels in the High Street:
-he could not help feeling that the law should have stepped in to prevent
-their erection. Shopping in Blackstable was not a simple matter; for
-dissent, helped by the fact that the parish church was two miles from the
-town, was very common; and it was necessary to deal only with churchgoers;
-Mrs. Carey knew perfectly that the vicarage custom might make all the
-difference to a tradesman's faith. There were two butchers who went to
-church, and they would not understand that the Vicar could not deal with
-both of them at once; nor were they satisfied with his simple plan of
-going for six months to one and for six months to the other. The butcher
-who was not sending meat to the vicarage constantly threatened not to come
-to church, and the Vicar was sometimes obliged to make a threat: it was
-very wrong of him not to come to church, but if he carried iniquity
-further and actually went to chapel, then of course, excellent as his meat
-was, Mr. Carey would be forced to leave him for ever. Mrs. Carey often
-stopped at the bank to deliver a message to Josiah Graves, the manager,
-who was choir-master, treasurer, and churchwarden. He was a tall, thin man
-with a sallow face and a long nose; his hair was very white, and to Philip
-he seemed extremely old. He kept the parish accounts, arranged the treats
-for the choir and the schools; though there was no organ in the parish
-church, it was generally considered (in Blackstable) that the choir he led
-was the best in Kent; and when there was any ceremony, such as a visit
-from the Bishop for confirmation or from the Rural Dean to preach at the
-Harvest Thanksgiving, he made the necessary preparations. But he had no
-hesitation in doing all manner of things without more than a perfunctory
-consultation with the Vicar, and the Vicar, though always ready to be
-saved trouble, much resented the churchwarden's managing ways. He really
-seemed to look upon himself as the most important person in the parish.
-Mr. Carey constantly told his wife that if Josiah Graves did not take care
-he would give him a good rap over the knuckles one day; but Mrs. Carey
-advised him to bear with Josiah Graves: he meant well, and it was not his
-fault if he was not quite a gentleman. The Vicar, finding his comfort in
-the practice of a Christian virtue, exercised forbearance; but he revenged
-himself by calling the churchwarden Bismarck behind his back.
-
-Once there had been a serious quarrel between the pair, and Mrs. Carey
-still thought of that anxious time with dismay. The Conservative candidate
-had announced his intention of addressing a meeting at Blackstable; and
-Josiah Graves, having arranged that it should take place in the Mission
-Hall, went to Mr. Carey and told him that he hoped he would say a few
-words. It appeared that the candidate had asked Josiah Graves to take the
-chair. This was more than Mr. Carey could put up with. He had firm views
-upon the respect which was due to the cloth, and it was ridiculous for a
-churchwarden to take the chair at a meeting when the Vicar was there. He
-reminded Josiah Graves that parson meant person, that is, the vicar was
-the person of the parish. Josiah Graves answered that he was the first to
-recognise the dignity of the church, but this was a matter of politics,
-and in his turn he reminded the Vicar that their Blessed Saviour had
-enjoined upon them to render unto Caesar the things that were Caesar's. To
-this Mr. Carey replied that the devil could quote scripture to his
-purpose, himself had sole authority over the Mission Hall, and if he were
-not asked to be chairman he would refuse the use of it for a political
-meeting. Josiah Graves told Mr. Carey that he might do as he chose, and
-for his part he thought the Wesleyan Chapel would be an equally suitable
-place. Then Mr. Carey said that if Josiah Graves set foot in what was
-little better than a heathen temple he was not fit to be churchwarden in
-a Christian parish. Josiah Graves thereupon resigned all his offices, and
-that very evening sent to the church for his cassock and surplice. His
-sister, Miss Graves, who kept house for him, gave up her secretaryship of
-the Maternity Club, which provided the pregnant poor with flannel, baby
-linen, coals, and five shillings. Mr. Carey said he was at last master in
-his own house. But soon he found that he was obliged to see to all sorts
-of things that he knew nothing about; and Josiah Graves, after the first
-moment of irritation, discovered that he had lost his chief interest in
-life. Mrs. Carey and Miss Graves were much distressed by the quarrel; they
-met after a discreet exchange of letters, and made up their minds to put
-the matter right: they talked, one to her husband, the other to her
-brother, from morning till night; and since they were persuading these
-gentlemen to do what in their hearts they wanted, after three weeks of
-anxiety a reconciliation was effected. It was to both their interests, but
-they ascribed it to a common love for their Redeemer. The meeting was held
-at the Mission Hall, and the doctor was asked to be chairman. Mr. Carey
-and Josiah Graves both made speeches.
-
-When Mrs. Carey had finished her business with the banker, she generally
-went upstairs to have a little chat with his sister; and while the ladies
-talked of parish matters, the curate or the new bonnet of Mrs. Wilson--Mr.
-Wilson was the richest man in Blackstable, he was thought to have at least
-five hundred a year, and he had married his cook--Philip sat demurely in
-the stiff parlour, used only to receive visitors, and busied himself with
-the restless movements of goldfish in a bowl. The windows were never
-opened except to air the room for a few minutes in the morning, and it had
-a stuffy smell which seemed to Philip to have a mysterious connection with
-banking.
-
-Then Mrs. Carey remembered that she had to go to the grocer, and they
-continued their way. When the shopping was done they often went down a
-side street of little houses, mostly of wood, in which fishermen dwelt
-(and here and there a fisherman sat on his doorstep mending his nets, and
-nets hung to dry upon the doors), till they came to a small beach, shut in
-on each side by warehouses, but with a view of the sea. Mrs. Carey stood
-for a few minutes and looked at it, it was turbid and yellow, [and who
-knows what thoughts passed through her mind?] while Philip searched for
-flat stones to play ducks and drakes. Then they walked slowly back. They
-looked into the post office to get the right time, nodded to Mrs. Wigram,
-the doctor's wife, who sat at her window sewing, and so got home.
-
-Dinner was at one o'clock; and on Monday, Tuesday, and Wednesday it
-consisted of beef, roast, hashed, and minced, and on Thursday, Friday, and
-Saturday of mutton. On Sunday they ate one of their own chickens. In the
-afternoon Philip did his lessons. He was taught Latin and mathematics by
-his uncle who knew neither, and French and the piano by his aunt. Of
-French she was ignorant, but she knew the piano well enough to accompany
-the old-fashioned songs she had sung for thirty years. Uncle William used
-to tell Philip that when he was a curate his wife had known twelve songs
-by heart, which she could sing at a moment's notice whenever she was
-asked. She often sang still when there was a tea-party at the vicarage.
-There were few people whom the Careys cared to ask there, and their
-parties consisted always of the curate, Josiah Graves with his sister, Dr.
-Wigram and his wife. After tea Miss Graves played one or two of
-Mendelssohn's "Songs without Words", and Mrs. Carey sang "When the
-Swallows Homeward Fly", or "Trot, Trot, My Pony".
-
-But the Careys did not give tea-parties often; the preparations upset
-them, and when their guests were gone they felt themselves exhausted. They
-preferred to have tea by themselves, and after tea they played backgammon.
-Mrs. Carey arranged that her husband should win, because he did not like
-losing. They had cold supper at eight. It was a scrappy meal because Mary
-Ann resented getting anything ready after tea, and Mrs. Carey helped to
-clear away. Mrs. Carey seldom ate more than bread and butter, with a
-little stewed fruit to follow, but the Vicar had a slice of cold meat.
-Immediately after supper Mrs. Carey rang the bell for prayers, and then
-Philip went to bed. He rebelled against being undressed by Mary Ann and
-after a while succeeded in establishing his right to dress and undress
-himself. At nine o'clock Mary Ann brought in the eggs and the plate. Mrs.
-Carey wrote the date on each egg and put the number down in a book. She
-then took the plate-basket on her arm and went upstairs. Mr. Carey
-continued to read one of his old books, but as the clock struck ten he got
-up, put out the lamps, and followed his wife to bed.
-
-When Philip arrived there was some difficulty in deciding on which evening
-he should have his bath. It was never easy to get plenty of hot water,
-since the kitchen boiler did not work, and it was impossible for two
-persons to have a bath on the same day. The only man who had a bathroom in
-Blackstable was Mr. Wilson, and it was thought ostentatious of him. Mary
-Ann had her bath in the kitchen on Monday night, because she liked to
-begin the week clean. Uncle William could not have his on Saturday,
-because he had a heavy day before him and he was always a little tired
-after a bath, so he had it on Friday. Mrs. Carey had hers on Thursday for
-the same reason. It looked as though Saturday were naturally indicated for
-Philip, but Mary Ann said she couldn't keep the fire up on Saturday night:
-what with all the cooking on Sunday, having to make pastry and she didn't
-know what all, she did not feel up to giving the boy his bath on Saturday
-night; and it was quite clear that he could not bath himself. Mrs. Carey
-was shy about bathing a boy, and of course the Vicar had his sermon. But
-the Vicar insisted that Philip should be clean and sweet for the Lord's
-Day. Mary Ann said she would rather go than be put upon--and after
-eighteen years she didn't expect to have more work given her, and they
-might show some consideration--and Philip said he didn't want anyone to
-bath him, but could very well bath himself. This settled it. Mary Ann said
-she was quite sure he wouldn't bath himself properly, and rather than he
-should go dirty--and not because he was going into the presence of the
-Lord, but because she couldn't abide a boy who wasn't properly
-washed--she'd work herself to the bone even if it was Saturday night.
-
-
-
-VII
-
-
-Sunday was a day crowded with incident. Mr. Carey was accustomed to say
-that he was the only man in his parish who worked seven days a week.
-
-The household got up half an hour earlier than usual. No lying abed for a
-poor parson on the day of rest, Mr. Carey remarked as Mary Ann knocked at
-the door punctually at eight. It took Mrs. Carey longer to dress, and she
-got down to breakfast at nine, a little breathless, only just before her
-husband. Mr. Carey's boots stood in front of the fire to warm. Prayers
-were longer than usual, and the breakfast more substantial. After
-breakfast the Vicar cut thin slices of bread for the communion, and Philip
-was privileged to cut off the crust. He was sent to the study to fetch a
-marble paperweight, with which Mr. Carey pressed the bread till it was
-thin and pulpy, and then it was cut into small squares. The amount was
-regulated by the weather. On a very bad day few people came to church, and
-on a very fine one, though many came, few stayed for communion. There were
-most when it was dry enough to make the walk to church pleasant, but not
-so fine that people wanted to hurry away.
-
-Then Mrs. Carey brought the communion plate out of the safe, which stood
-in the pantry, and the Vicar polished it with a chamois leather. At ten
-the fly drove up, and Mr. Carey got into his boots. Mrs. Carey took
-several minutes to put on her bonnet, during which the Vicar, in a
-voluminous cloak, stood in the hall with just such an expression on his
-face as would have become an early Christian about to be led into the
-arena. It was extraordinary that after thirty years of marriage his wife
-could not be ready in time on Sunday morning. At last she came, in black
-satin; the Vicar did not like colours in a clergyman's wife at any time,
-but on Sundays he was determined that she should wear black; now and then,
-in conspiracy with Miss Graves, she ventured a white feather or a pink
-rose in her bonnet, but the Vicar insisted that it should disappear; he
-said he would not go to church with the scarlet woman: Mrs. Carey sighed
-as a woman but obeyed as a wife. They were about to step into the carriage
-when the Vicar remembered that no one had given him his egg. They knew
-that he must have an egg for his voice, there were two women in the house,
-and no one had the least regard for his comfort. Mrs. Carey scolded Mary
-Ann, and Mary Ann answered that she could not think of everything. She
-hurried away to fetch an egg, and Mrs. Carey beat it up in a glass of
-sherry. The Vicar swallowed it at a gulp. The communion plate was stowed
-in the carriage, and they set off.
-
-The fly came from The Red Lion and had a peculiar smell of stale straw.
-They drove with both windows closed so that the Vicar should not catch
-cold. The sexton was waiting at the porch to take the communion plate, and
-while the Vicar went to the vestry Mrs. Carey and Philip settled
-themselves in the vicarage pew. Mrs. Carey placed in front of her the
-sixpenny bit she was accustomed to put in the plate, and gave Philip
-threepence for the same purpose. The church filled up gradually and the
-service began.
-
-Philip grew bored during the sermon, but if he fidgeted Mrs. Carey put a
-gentle hand on his arm and looked at him reproachfully. He regained
-interest when the final hymn was sung and Mr. Graves passed round with the
-plate.
-
-When everyone had gone Mrs. Carey went into Miss Graves' pew to have a few
-words with her while they were waiting for the gentlemen, and Philip went
-to the vestry. His uncle, the curate, and Mr. Graves were still in their
-surplices. Mr. Carey gave him the remains of the consecrated bread and
-told him he might eat it. He had been accustomed to eat it himself, as it
-seemed blasphemous to throw it away, but Philip's keen appetite relieved
-him from the duty. Then they counted the money. It consisted of pennies,
-sixpences and threepenny bits. There were always two single shillings, one
-put in the plate by the Vicar and the other by Mr. Graves; and sometimes
-there was a florin. Mr. Graves told the Vicar who had given this. It was
-always a stranger to Blackstable, and Mr. Carey wondered who he was. But
-Miss Graves had observed the rash act and was able to tell Mrs. Carey that
-the stranger came from London, was married and had children. During the
-drive home Mrs. Carey passed the information on, and the Vicar made up his
-mind to call on him and ask for a subscription to the Additional Curates
-Society. Mr. Carey asked if Philip had behaved properly; and Mrs. Carey
-remarked that Mrs. Wigram had a new mantle, Mr. Cox was not in church, and
-somebody thought that Miss Phillips was engaged. When they reached the
-vicarage they all felt that they deserved a substantial dinner.
-
-When this was over Mrs. Carey went to her room to rest, and Mr. Carey lay
-down on the sofa in the drawing-room for forty winks.
-
-They had tea at five, and the Vicar ate an egg to support himself for
-evensong. Mrs. Carey did not go to this so that Mary Ann might, but she
-read the service through and the hymns. Mr. Carey walked to church in the
-evening, and Philip limped along by his side. The walk through the
-darkness along the country road strangely impressed him, and the church
-with all its lights in the distance, coming gradually nearer, seemed very
-friendly. At first he was shy with his uncle, but little by little grew
-used to him, and he would slip his hand in his uncle's and walk more
-easily for the feeling of protection.
-
-They had supper when they got home. Mr. Carey's slippers were waiting for
-him on a footstool in front of the fire and by their side Philip's, one
-the shoe of a small boy, the other misshapen and odd. He was dreadfully
-tired when he went up to bed, and he did not resist when Mary Ann
-undressed him. She kissed him after she tucked him up, and he began to
-love her.
-
-
-
-VIII
-
-
-Philip had led always the solitary life of an only child, and his
-loneliness at the vicarage was no greater than it had been when his mother
-lived. He made friends with Mary Ann. She was a chubby little person of
-thirty-five, the daughter of a fisherman, and had come to the vicarage at
-eighteen; it was her first place and she had no intention of leaving it;
-but she held a possible marriage as a rod over the timid heads of her
-master and mistress. Her father and mother lived in a little house off
-Harbour Street, and she went to see them on her evenings out. Her stories
-of the sea touched Philip's imagination, and the narrow alleys round the
-harbour grew rich with the romance which his young fancy lent them. One
-evening he asked whether he might go home with her; but his aunt was
-afraid that he might catch something, and his uncle said that evil
-communications corrupted good manners. He disliked the fisher folk, who
-were rough, uncouth, and went to chapel. But Philip was more comfortable
-in the kitchen than in the dining-room, and, whenever he could, he took
-his toys and played there. His aunt was not sorry. She did not like
-disorder, and though she recognised that boys must be expected to be
-untidy she preferred that he should make a mess in the kitchen. If he
-fidgeted his uncle was apt to grow restless and say it was high time he
-went to school. Mrs. Carey thought Philip very young for this, and her
-heart went out to the motherless child; but her attempts to gain his
-affection were awkward, and the boy, feeling shy, received her
-demonstrations with so much sullenness that she was mortified. Sometimes
-she heard his shrill voice raised in laughter in the kitchen, but when she
-went in, he grew suddenly silent, and he flushed darkly when Mary Ann
-explained the joke. Mrs. Carey could not see anything amusing in what she
-heard, and she smiled with constraint.
-
-"He seems happier with Mary Ann than with us, William," she said, when she
-returned to her sewing.
-
-"One can see he's been very badly brought up. He wants licking into
-shape."
-
-On the second Sunday after Philip arrived an unlucky incident occurred.
-Mr. Carey had retired as usual after dinner for a little snooze in the
-drawing-room, but he was in an irritable mood and could not sleep. Josiah
-Graves that morning had objected strongly to some candlesticks with which
-the Vicar had adorned the altar. He had bought them second-hand in
-Tercanbury, and he thought they looked very well. But Josiah Graves said
-they were popish. This was a taunt that always aroused the Vicar. He had
-been at Oxford during the movement which ended in the secession from the
-Established Church of Edward Manning, and he felt a certain sympathy for
-the Church of Rome. He would willingly have made the service more ornate
-than had been usual in the low-church parish of Blackstable, and in his
-secret soul he yearned for processions and lighted candles. He drew the
-line at incense. He hated the word protestant. He called himself a
-Catholic. He was accustomed to say that Papists required an epithet, they
-were Roman Catholic; but the Church of England was Catholic in the best,
-the fullest, and the noblest sense of the term. He was pleased to think
-that his shaven face gave him the look of a priest, and in his youth he
-had possessed an ascetic air which added to the impression. He often
-related that on one of his holidays in Boulogne, one of those holidays
-upon which his wife for economy's sake did not accompany him, when he was
-sitting in a church, the curé had come up to him and invited him to
-preach a sermon. He dismissed his curates when they married, having
-decided views on the celibacy of the unbeneficed clergy. But when at an
-election the Liberals had written on his garden fence in large blue
-letters: This way to Rome, he had been very angry, and threatened to
-prosecute the leaders of the Liberal party in Blackstable. He made up his
-mind now that nothing Josiah Graves said would induce him to remove the
-candlesticks from the altar, and he muttered Bismarck to himself once or
-twice irritably.
-
-Suddenly he heard an unexpected noise. He pulled the handkerchief off his
-face, got up from the sofa on which he was lying, and went into the
-dining-room. Philip was seated on the table with all his bricks around
-him. He had built a monstrous castle, and some defect in the foundation
-had just brought the structure down in noisy ruin.
-
-"What are you doing with those bricks, Philip? You know you're not allowed
-to play games on Sunday."
-
-Philip stared at him for a moment with frightened eyes, and, as his habit
-was, flushed deeply.
-
-"I always used to play at home," he answered.
-
-"I'm sure your dear mamma never allowed you to do such a wicked thing as
-that."
-
-Philip did not know it was wicked; but if it was, he did not wish it to be
-supposed that his mother had consented to it. He hung his head and did not
-answer.
-
-"Don't you know it's very, very wicked to play on Sunday? What d'you
-suppose it's called the day of rest for? You're going to church tonight,
-and how can you face your Maker when you've been breaking one of His laws
-in the afternoon?"
-
-Mr. Carey told him to put the bricks away at once, and stood over him
-while Philip did so.
-
-"You're a very naughty boy," he repeated. "Think of the grief you're
-causing your poor mother in heaven."
-
-Philip felt inclined to cry, but he had an instinctive disinclination to
-letting other people see his tears, and he clenched his teeth to prevent
-the sobs from escaping. Mr. Carey sat down in his arm-chair and began to
-turn over the pages of a book. Philip stood at the window. The vicarage
-was set back from the highroad to Tercanbury, and from the dining-room one
-saw a semicircular strip of lawn and then as far as the horizon green
-fields. Sheep were grazing in them. The sky was forlorn and gray. Philip
-felt infinitely unhappy.
-
-Presently Mary Ann came in to lay the tea, and Aunt Louisa descended the
-stairs.
-
-"Have you had a nice little nap, William?" she asked.
-
-"No," he answered. "Philip made so much noise that I couldn't sleep a
-wink."
-
-This was not quite accurate, for he had been kept awake by his own
-thoughts; and Philip, listening sullenly, reflected that he had only made
-a noise once, and there was no reason why his uncle should not have slept
-before or after. When Mrs. Carey asked for an explanation the Vicar
-narrated the facts.
-
-"He hasn't even said he was sorry," he finished.
-
-"Oh, Philip, I'm sure you're sorry," said Mrs. Carey, anxious that the
-child should not seem wickeder to his uncle than need be.
-
-Philip did not reply. He went on munching his bread and butter. He did not
-know what power it was in him that prevented him from making any
-expression of regret. He felt his ears tingling, he was a little inclined
-to cry, but no word would issue from his lips.
-
-"You needn't make it worse by sulking," said Mr. Carey.
-
-Tea was finished in silence. Mrs. Carey looked at Philip surreptitiously
-now and then, but the Vicar elaborately ignored him. When Philip saw his
-uncle go upstairs to get ready for church he went into the hall and got
-his hat and coat, but when the Vicar came downstairs and saw him, he said:
-
-"I don't wish you to go to church tonight, Philip. I don't think you're in
-a proper frame of mind to enter the House of God."
-
-Philip did not say a word. He felt it was a deep humiliation that was
-placed upon him, and his cheeks reddened. He stood silently watching his
-uncle put on his broad hat and his voluminous cloak. Mrs. Carey as usual
-went to the door to see him off. Then she turned to Philip.
-
-"Never mind, Philip, you won't be a naughty boy next Sunday, will you, and
-then your uncle will take you to church with him in the evening."
-
-She took off his hat and coat, and led him into the dining-room.
-
-"Shall you and I read the service together, Philip, and we'll sing the
-hymns at the harmonium. Would you like that?"
-
-Philip shook his head decidedly. Mrs. Carey was taken aback. If he would
-not read the evening service with her she did not know what to do with
-him.
-
-"Then what would you like to do until your uncle comes back?" she asked
-helplessly.
-
-Philip broke his silence at last.
-
-"I want to be left alone," he said.
-
-"Philip, how can you say anything so unkind? Don't you know that your
-uncle and I only want your good? Don't you love me at all?"
-
-"I hate you. I wish you was dead."
-
-Mrs. Carey gasped. He said the words so savagely that it gave her quite a
-start. She had nothing to say. She sat down in her husband's chair; and as
-she thought of her desire to love the friendless, crippled boy and her
-eager wish that he should love her--she was a barren woman and, even
-though it was clearly God's will that she should be childless, she could
-scarcely bear to look at little children sometimes, her heart ached
-so--the tears rose to her eyes and one by one, slowly, rolled down her
-cheeks. Philip watched her in amazement. She took out her handkerchief,
-and now she cried without restraint. Suddenly Philip realised that she was
-crying because of what he had said, and he was sorry. He went up to her
-silently and kissed her. It was the first kiss he had ever given her
-without being asked. And the poor lady, so small in her black satin,
-shrivelled up and sallow, with her funny corkscrew curls, took the little
-boy on her lap and put her arms around him and wept as though her heart
-would break. But her tears were partly tears of happiness, for she felt
-that the strangeness between them was gone. She loved him now with a new
-love because he had made her suffer.
-
-
-
-IX
-
-
-On the following Sunday, when the Vicar was making his preparations to go
-into the drawing-room for his nap--all the actions of his life were
-conducted with ceremony--and Mrs. Carey was about to go upstairs, Philip
-asked:
-
-"What shall I do if I'm not allowed to play?"
-
-"Can't you sit still for once and be quiet?"
-
-"I can't sit still till tea-time."
-
-Mr. Carey looked out of the window, but it was cold and raw, and he could
-not suggest that Philip should go into the garden.
-
-"I know what you can do. You can learn by heart the collect for the day."
-
-He took the prayer-book which was used for prayers from the harmonium, and
-turned the pages till he came to the place he wanted.
-
-"It's not a long one. If you can say it without a mistake when I come in
-to tea you shall have the top of my egg."
-
-Mrs. Carey drew up Philip's chair to the dining-room table--they had
-bought him a high chair by now--and placed the book in front of him.
-
-"The devil finds work for idle hands to do," said Mr. Carey.
-
-He put some more coals on the fire so that there should be a cheerful
-blaze when he came in to tea, and went into the drawing-room. He loosened
-his collar, arranged the cushions, and settled himself comfortably on the
-sofa. But thinking the drawing-room a little chilly, Mrs. Carey brought
-him a rug from the hall; she put it over his legs and tucked it round his
-feet. She drew the blinds so that the light should not offend his eyes,
-and since he had closed them already went out of the room on tiptoe. The
-Vicar was at peace with himself today, and in ten minutes he was asleep.
-He snored softly.
-
-It was the Sixth Sunday after Epiphany, and the collect began with the
-words: O God, whose blessed Son was manifested that he might destroy the
-works of the devil, and make us the sons of God, and heirs of Eternal
-life. Philip read it through. He could make no sense of it. He began
-saying the words aloud to himself, but many of them were unknown to him,
-and the construction of the sentence was strange. He could not get more
-than two lines in his head. And his attention was constantly wandering:
-there were fruit trees trained on the walls of the vicarage, and a long
-twig beat now and then against the windowpane; sheep grazed stolidly in
-the field beyond the garden. It seemed as though there were knots inside
-his brain. Then panic seized him that he would not know the words by
-tea-time, and he kept on whispering them to himself quickly; he did not
-try to understand, but merely to get them parrot-like into his memory.
-
-Mrs. Carey could not sleep that afternoon, and by four o'clock she was so
-wide awake that she came downstairs. She thought she would hear Philip his
-collect so that he should make no mistakes when he said it to his uncle.
-His uncle then would be pleased; he would see that the boy's heart was in
-the right place. But when Mrs. Carey came to the dining-room and was about
-to go in, she heard a sound that made her stop suddenly. Her heart gave a
-little jump. She turned away and quietly slipped out of the front-door.
-She walked round the house till she came to the dining-room window and
-then cautiously looked in. Philip was still sitting on the chair she had
-put him in, but his head was on the table buried in his arms, and he was
-sobbing desperately. She saw the convulsive movement of his shoulders.
-Mrs. Carey was frightened. A thing that had always struck her about the
-child was that he seemed so collected. She had never seen him cry. And now
-she realised that his calmness was some instinctive shame of showing his
-feelings: he hid himself to weep.
-
-Without thinking that her husband disliked being wakened suddenly, she
-burst into the drawing-room.
-
-"William, William," she said. "The boy's crying as though his heart would
-break."
-
-Mr. Carey sat up and disentangled himself from the rug about his legs.
-
-"What's he got to cry about?"
-
-"I don't know.... Oh, William, we can't let the boy be unhappy. D'you
-think it's our fault? If we'd had children we'd have known what to do."
-
-Mr. Carey looked at her in perplexity. He felt extraordinarily helpless.
-
-"He can't be crying because I gave him the collect to learn. It's not more
-than ten lines."
-
-"Don't you think I might take him some picture books to look at, William?
-There are some of the Holy Land. There couldn't be anything wrong in
-that."
-
-"Very well, I don't mind."
-
-Mrs. Carey went into the study. To collect books was Mr. Carey's only
-passion, and he never went into Tercanbury without spending an hour or two
-in the second-hand shop; he always brought back four or five musty
-volumes. He never read them, for he had long lost the habit of reading,
-but he liked to turn the pages, look at the illustrations if they were
-illustrated, and mend the bindings. He welcomed wet days because on them
-he could stay at home without pangs of conscience and spend the afternoon
-with white of egg and a glue-pot, patching up the Russia leather of some
-battered quarto. He had many volumes of old travels, with steel
-engravings, and Mrs. Carey quickly found two which described Palestine.
-She coughed elaborately at the door so that Philip should have time to
-compose himself, she felt that he would be humiliated if she came upon him
-in the midst of his tears, then she rattled the door handle. When she went
-in Philip was poring over the prayer-book, hiding his eyes with his hands
-so that she might not see he had been crying.
-
-"Do you know the collect yet?" she said.
-
-He did not answer for a moment, and she felt that he did not trust his
-voice. She was oddly embarrassed.
-
-"I can't learn it by heart," he said at last, with a gasp.
-
-"Oh, well, never mind," she said. "You needn't. I've got some picture
-books for you to look at. Come and sit on my lap, and we'll look at them
-together."
-
-Philip slipped off his chair and limped over to her. He looked down so
-that she should not see his eyes. She put her arms round him.
-
-"Look," she said, "that's the place where our blessed Lord was born."
-
-She showed him an Eastern town with flat roofs and cupolas and minarets.
-In the foreground was a group of palm-trees, and under them were resting
-two Arabs and some camels. Philip passed his hand over the picture as if
-he wanted to feel the houses and the loose habiliments of the nomads.
-
-"Read what it says," he asked.
-
-Mrs. Carey in her even voice read the opposite page. It was a romantic
-narrative of some Eastern traveller of the thirties, pompous maybe, but
-fragrant with the emotion with which the East came to the generation that
-followed Byron and Chateaubriand. In a moment or two Philip interrupted
-her.
-
-"I want to see another picture."
-
-When Mary Ann came in and Mrs. Carey rose to help her lay the cloth,
-Philip took the book in his hands and hurried through the illustrations.
-It was with difficulty that his aunt induced him to put the book down for
-tea. He had forgotten his horrible struggle to get the collect by heart;
-he had forgotten his tears. Next day it was raining, and he asked for the
-book again. Mrs. Carey gave it him joyfully. Talking over his future with
-her husband she had found that both desired him to take orders, and this
-eagerness for the book which described places hallowed by the presence of
-Jesus seemed a good sign. It looked as though the boy's mind addressed
-itself naturally to holy things. But in a day or two he asked for more
-books. Mr. Carey took him into his study, showed him the shelf in which he
-kept illustrated works, and chose for him one that dealt with Rome. Philip
-took it greedily. The pictures led him to a new amusement. He began to
-read the page before and the page after each engraving to find out what it
-was about, and soon he lost all interest in his toys.
-
-Then, when no one was near, he took out books for himself; and perhaps
-because the first impression on his mind was made by an Eastern town, he
-found his chief amusement in those which described the Levant. His heart
-beat with excitement at the pictures of mosques and rich palaces; but
-there was one, in a book on Constantinople, which peculiarly stirred his
-imagination. It was called the Hall of the Thousand Columns. It was a
-Byzantine cistern, which the popular fancy had endowed with fantastic
-vastness; and the legend which he read told that a boat was always moored
-at the entrance to tempt the unwary, but no traveller venturing into the
-darkness had ever been seen again. And Philip wondered whether the boat
-went on for ever through one pillared alley after another or came at last
-to some strange mansion.
-
-One day a good fortune befell him, for he hit upon Lane's translation of
-The Thousand Nights and a Night. He was captured first by the
-illustrations, and then he began to read, to start with, the stories that
-dealt with magic, and then the others; and those he liked he read again
-and again. He could think of nothing else. He forgot the life about him.
-He had to be called two or three times before he would come to his dinner.
-Insensibly he formed the most delightful habit in the world, the habit of
-reading: he did not know that thus he was providing himself with a refuge
-from all the distress of life; he did not know either that he was creating
-for himself an unreal world which would make the real world of every day
-a source of bitter disappointment. Presently he began to read other
-things. His brain was precocious. His uncle and aunt, seeing that he
-occupied himself and neither worried nor made a noise, ceased to trouble
-themselves about him. Mr. Carey had so many books that he did not know
-them, and as he read little he forgot the odd lots he had bought at one
-time and another because they were cheap. Haphazard among the sermons and
-homilies, the travels, the lives of the Saints, the Fathers, the histories
-of the church, were old-fashioned novels; and these Philip at last
-discovered. He chose them by their titles, and the first he read was The
-Lancashire Witches, and then he read The Admirable Crichton, and then
-many more. Whenever he started a book with two solitary travellers riding
-along the brink of a desperate ravine he knew he was safe.
-
-The summer was come now, and the gardener, an old sailor, made him a
-hammock and fixed it up for him in the branches of a weeping willow. And
-here for long hours he lay, hidden from anyone who might come to the
-vicarage, reading, reading passionately. Time passed and it was July;
-August came: on Sundays the church was crowded with strangers, and the
-collection at the offertory often amounted to two pounds. Neither the
-Vicar nor Mrs. Carey went out of the garden much during this period; for
-they disliked strange faces, and they looked upon the visitors from London
-with aversion. The house opposite was taken for six weeks by a gentleman
-who had two little boys, and he sent in to ask if Philip would like to go
-and play with them; but Mrs. Carey returned a polite refusal. She was
-afraid that Philip would be corrupted by little boys from London. He was
-going to be a clergyman, and it was necessary that he should be preserved
-from contamination. She liked to see in him an infant Samuel.
-
-
-
-X
-
-
-The Careys made up their minds to send Philip to King's School at
-Tercanbury. The neighbouring clergy sent their sons there. It was united
-by long tradition to the Cathedral: its headmaster was an honorary Canon,
-and a past headmaster was the Archdeacon. Boys were encouraged there to
-aspire to Holy Orders, and the education was such as might prepare an
-honest lad to spend his life in God's service. A preparatory school was
-attached to it, and to this it was arranged that Philip should go. Mr.
-Carey took him into Tercanbury one Thursday afternoon towards the end of
-September. All day Philip had been excited and rather frightened. He knew
-little of school life but what he had read in the stories of The Boy's
-Own Paper. He had also read Eric, or Little by Little.
-
-When they got out of the train at Tercanbury, Philip felt sick with
-apprehension, and during the drive in to the town sat pale and silent. The
-high brick wall in front of the school gave it the look of a prison. There
-was a little door in it, which opened on their ringing; and a clumsy,
-untidy man came out and fetched Philip's tin trunk and his play-box. They
-were shown into the drawing-room; it was filled with massive, ugly
-furniture, and the chairs of the suite were placed round the walls with a
-forbidding rigidity. They waited for the headmaster.
-
-"What's Mr. Watson like?" asked Philip, after a while.
-
-"You'll see for yourself."
-
-There was another pause. Mr. Carey wondered why the headmaster did not
-come. Presently Philip made an effort and spoke again.
-
-"Tell him I've got a club-foot," he said.
-
-Before Mr. Carey could speak the door burst open and Mr. Watson swept into
-the room. To Philip he seemed gigantic. He was a man of over six feet
-high, and broad, with enormous hands and a great red beard; he talked
-loudly in a jovial manner; but his aggressive cheerfulness struck terror
-in Philip's heart. He shook hands with Mr. Carey, and then took Philip's
-small hand in his.
-
-"Well, young fellow, are you glad to come to school?" he shouted.
-
-Philip reddened and found no word to answer.
-
-"How old are you?"
-
-"Nine," said Philip.
-
-"You must say sir," said his uncle.
-
-"I expect you've got a good lot to learn," the headmaster bellowed
-cheerily.
-
-To give the boy confidence he began to tickle him with rough fingers.
-Philip, feeling shy and uncomfortable, squirmed under his touch.
-
-"I've put him in the small dormitory for the present.... You'll like that,
-won't you?" he added to Philip. "Only eight of you in there. You won't
-feel so strange."
-
-Then the door opened, and Mrs. Watson came in. She was a dark woman with
-black hair, neatly parted in the middle. She had curiously thick lips and
-a small round nose. Her eyes were large and black. There was a singular
-coldness in her appearance. She seldom spoke and smiled more seldom still.
-Her husband introduced Mr. Carey to her, and then gave Philip a friendly
-push towards her.
-
-"This is a new boy, Helen. His name's Carey."
-
-Without a word she shook hands with Philip and then sat down, not
-speaking, while the headmaster asked Mr. Carey how much Philip knew and
-what books he had been working with. The Vicar of Blackstable was a little
-embarrassed by Mr. Watson's boisterous heartiness, and in a moment or two
-got up.
-
-"I think I'd better leave Philip with you now."
-
-"That's all right," said Mr. Watson. "He'll be safe with me. He'll get on
-like a house on fire. Won't you, young fellow?"
-
-Without waiting for an answer from Philip the big man burst into a great
-bellow of laughter. Mr. Carey kissed Philip on the forehead and went away.
-
-"Come along, young fellow," shouted Mr. Watson. "I'll show you the
-school-room."
-
-He swept out of the drawing-room with giant strides, and Philip hurriedly
-limped behind him. He was taken into a long, bare room with two tables
-that ran along its whole length; on each side of them were wooden forms.
-
-"Nobody much here yet," said Mr. Watson. "I'll just show you the
-playground, and then I'll leave you to shift for yourself."
-
-Mr. Watson led the way. Philip found himself in a large playground with
-high brick walls on three sides of it. On the fourth side was an iron
-railing through which you saw a vast lawn and beyond this some of the
-buildings of King's School. One small boy was wandering disconsolately,
-kicking up the gravel as he walked.
-
-"Hulloa, Venning," shouted Mr. Watson. "When did you turn up?"
-
-The small boy came forward and shook hands.
-
-"Here's a new boy. He's older and bigger than you, so don't you bully
-him."
-
-The headmaster glared amicably at the two children, filling them with fear
-by the roar of his voice, and then with a guffaw left them.
-
-"What's your name?"
-
-"Carey."
-
-"What's your father?"
-
-"He's dead."
-
-"Oh! Does your mother wash?"
-
-"My mother's dead, too."
-
-Philip thought this answer would cause the boy a certain awkwardness, but
-Venning was not to be turned from his facetiousness for so little.
-
-"Well, did she wash?" he went on.
-
-"Yes," said Philip indignantly.
-
-"She was a washerwoman then?"
-
-"No, she wasn't."
-
-"Then she didn't wash."
-
-The little boy crowed with delight at the success of his dialectic. Then
-he caught sight of Philip's feet.
-
-"What's the matter with your foot?"
-
-Philip instinctively tried to withdraw it from sight. He hid it behind the
-one which was whole.
-
-"I've got a club-foot," he answered.
-
-"How did you get it?"
-
-"I've always had it."
-
-"Let's have a look."
-
-"No."
-
-"Don't then."
-
-The little boy accompanied the words with a sharp kick on Philip's shin,
-which Philip did not expect and thus could not guard against. The pain was
-so great that it made him gasp, but greater than the pain was the
-surprise. He did not know why Venning kicked him. He had not the presence
-of mind to give him a black eye. Besides, the boy was smaller than he, and
-he had read in The Boy's Own Paper that it was a mean thing to hit
-anyone smaller than yourself. While Philip was nursing his shin a third
-boy appeared, and his tormentor left him. In a little while he noticed
-that the pair were talking about him, and he felt they were looking at his
-feet. He grew hot and uncomfortable.
-
-But others arrived, a dozen together, and then more, and they began to
-talk about their doings during the holidays, where they had been, and what
-wonderful cricket they had played. A few new boys appeared, and with these
-presently Philip found himself talking. He was shy and nervous. He was
-anxious to make himself pleasant, but he could not think of anything to
-say. He was asked a great many questions and answered them all quite
-willingly. One boy asked him whether he could play cricket.
-
-"No," answered Philip. "I've got a club-foot."
-
-The boy looked down quickly and reddened. Philip saw that he felt he had
-asked an unseemly question. He was too shy to apologise and looked at
-Philip awkwardly.
-
-
-
-XI
-
-
-Next morning when the clanging of a bell awoke Philip he looked round his
-cubicle in astonishment. Then a voice sang out, and he remembered where he
-was.
-
-"Are you awake, Singer?"
-
-The partitions of the cubicle were of polished pitch-pine, and there was
-a green curtain in front. In those days there was little thought of
-ventilation, and the windows were closed except when the dormitory was
-aired in the morning.
-
-Philip got up and knelt down to say his prayers. It was a cold morning,
-and he shivered a little; but he had been taught by his uncle that his
-prayers were more acceptable to God if he said them in his nightshirt than
-if he waited till he was dressed. This did not surprise him, for he was
-beginning to realise that he was the creature of a God who appreciated the
-discomfort of his worshippers. Then he washed. There were two baths for
-the fifty boarders, and each boy had a bath once a week. The rest of his
-washing was done in a small basin on a wash-stand, which with the bed and
-a chair, made up the furniture of each cubicle. The boys chatted gaily
-while they dressed. Philip was all ears. Then another bell sounded, and
-they ran downstairs. They took their seats on the forms on each side of
-the two long tables in the school-room; and Mr. Watson, followed by his
-wife and the servants, came in and sat down. Mr. Watson read prayers in an
-impressive manner, and the supplications thundered out in his loud voice
-as though they were threats personally addressed to each boy. Philip
-listened with anxiety. Then Mr. Watson read a chapter from the Bible, and
-the servants trooped out. In a moment the untidy youth brought in two
-large pots of tea and on a second journey immense dishes of bread and
-butter.
-
-Philip had a squeamish appetite, and the thick slabs of poor butter on the
-bread turned his stomach, but he saw other boys scraping it off and
-followed their example. They all had potted meats and such like, which
-they had brought in their play-boxes; and some had 'extras,' eggs or
-bacon, upon which Mr. Watson made a profit. When he had asked Mr. Carey
-whether Philip was to have these, Mr. Carey replied that he did not think
-boys should be spoilt. Mr. Watson quite agreed with him--he considered
-nothing was better than bread and butter for growing lads--but some
-parents, unduly pampering their offspring, insisted on it.
-
-Philip noticed that 'extras' gave boys a certain consideration and made up
-his mind, when he wrote to Aunt Louisa, to ask for them.
-
-After breakfast the boys wandered out into the play-ground. Here the
-day-boys were gradually assembling. They were sons of the local clergy, of
-the officers at the Depot, and of such manufacturers or men of business as
-the old town possessed. Presently a bell rang, and they all trooped into
-school. This consisted of a large, long room at opposite ends of which two
-under-masters conducted the second and third forms, and of a smaller one,
-leading out of it, used by Mr. Watson, who taught the first form. To
-attach the preparatory to the senior school these three classes were known
-officially, on speech days and in reports, as upper, middle, and lower
-second. Philip was put in the last. The master, a red-faced man with a
-pleasant voice, was called Rice; he had a jolly manner with boys, and the
-time passed quickly. Philip was surprised when it was a quarter to eleven
-and they were let out for ten minutes' rest.
-
-The whole school rushed noisily into the play-ground. The new boys were
-told to go into the middle, while the others stationed themselves along
-opposite walls. They began to play Pig in the Middle. The old boys ran
-from wall to wall while the new boys tried to catch them: when one was
-seized and the mystic words said--one, two, three, and a pig for me--he
-became a prisoner and, turning sides, helped to catch those who were still
-free. Philip saw a boy running past and tried to catch him, but his limp
-gave him no chance; and the runners, taking their opportunity, made
-straight for the ground he covered. Then one of them had the brilliant
-idea of imitating Philip's clumsy run. Other boys saw it and began to
-laugh; then they all copied the first; and they ran round Philip, limping
-grotesquely, screaming in their treble voices with shrill laughter. They
-lost their heads with the delight of their new amusement, and choked with
-helpless merriment. One of them tripped Philip up and he fell, heavily as
-he always fell, and cut his knee. They laughed all the louder when he got
-up. A boy pushed him from behind, and he would have fallen again if
-another had not caught him. The game was forgotten in the entertainment of
-Philip's deformity. One of them invented an odd, rolling limp that struck
-the rest as supremely ridiculous, and several of the boys lay down on the
-ground and rolled about in laughter: Philip was completely scared. He
-could not make out why they were laughing at him. His heart beat so that
-he could hardly breathe, and he was more frightened than he had ever been
-in his life. He stood still stupidly while the boys ran round him,
-mimicking and laughing; they shouted to him to try and catch them; but he
-did not move. He did not want them to see him run any more. He was using
-all his strength to prevent himself from crying.
-
-Suddenly the bell rang, and they all trooped back to school. Philip's knee
-was bleeding, and he was dusty and dishevelled. For some minutes Mr. Rice
-could not control his form. They were excited still by the strange
-novelty, and Philip saw one or two of them furtively looking down at his
-feet. He tucked them under the bench.
-
-In the afternoon they went up to play football, but Mr. Watson stopped
-Philip on the way out after dinner.
-
-"I suppose you can't play football, Carey?" he asked him.
-
-Philip blushed self-consciously.
-
-"No, sir."
-
-"Very well. You'd better go up to the field. You can walk as far as that,
-can't you?"
-
-Philip had no idea where the field was, but he answered all the same.
-
-"Yes, sir."
-
-The boys went in charge of Mr. Rice, who glanced at Philip and seeing he
-had not changed, asked why he was not going to play.
-
-"Mr. Watson said I needn't, sir," said Philip.
-
-"Why?"
-
-There were boys all round him, looking at him curiously, and a feeling of
-shame came over Philip. He looked down without answering. Others gave the
-reply.
-
-"He's got a club-foot, sir."
-
-"Oh, I see."
-
-Mr. Rice was quite young; he had only taken his degree a year before; and
-he was suddenly embarrassed. His instinct was to beg the boy's pardon, but
-he was too shy to do so. He made his voice gruff and loud.
-
-"Now then, you boys, what are you waiting about for? Get on with you."
-
-Some of them had already started and those that were left now set off, in
-groups of two or three.
-
-"You'd better come along with me, Carey," said the master "You don't know
-the way, do you?"
-
-Philip guessed the kindness, and a sob came to his throat.
-
-"I can't go very fast, sir."
-
-"Then I'll go very slow," said the master, with a smile.
-
-Philip's heart went out to the red-faced, commonplace young man who said
-a gentle word to him. He suddenly felt less unhappy.
-
-But at night when they went up to bed and were undressing, the boy who was
-called Singer came out of his cubicle and put his head in Philip's.
-
-"I say, let's look at your foot," he said.
-
-"No," answered Philip.
-
-He jumped into bed quickly.
-
-"Don't say no to me," said Singer. "Come on, Mason."
-
-The boy in the next cubicle was looking round the corner, and at the words
-he slipped in. They made for Philip and tried to tear the bed-clothes off
-him, but he held them tightly.
-
-"Why can't you leave me alone?" he cried.
-
-Singer seized a brush and with the back of it beat Philip's hands clenched
-on the blanket. Philip cried out.
-
-"Why don't you show us your foot quietly?"
-
-"I won't."
-
-In desperation Philip clenched his fist and hit the boy who tormented him,
-but he was at a disadvantage, and the boy seized his arm. He began to turn
-it.
-
-"Oh, don't, don't," said Philip. "You'll break my arm."
-
-"Stop still then and put out your foot."
-
-Philip gave a sob and a gasp. The boy gave the arm another wrench. The
-pain was unendurable.
-
-"All right. I'll do it," said Philip.
-
-He put out his foot. Singer still kept his hand on Philip's wrist. He
-looked curiously at the deformity.
-
-"Isn't it beastly?" said Mason.
-
-Another came in and looked too.
-
-"Ugh," he said, in disgust.
-
-"My word, it is rum," said Singer, making a face. "Is it hard?"
-
-He touched it with the tip of his forefinger, cautiously, as though it
<TRUNCATED>
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/resources/orders.txt
----------------------------------------------------------------------
diff --git a/crunch/src/it/resources/orders.txt b/crunch/src/it/resources/orders.txt
deleted file mode 100644
index 2f1383f..0000000
--- a/crunch/src/it/resources/orders.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-222|Toilet plunger
-333|Toilet brush
-222|Toilet paper
-111|Corn flakes
\ No newline at end of file
[34/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/hadoop/mapreduce/lib/jobcontrol/CrunchControlledJob.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/hadoop/mapreduce/lib/jobcontrol/CrunchControlledJob.java b/crunch-core/src/main/java/org/apache/crunch/hadoop/mapreduce/lib/jobcontrol/CrunchControlledJob.java
new file mode 100644
index 0000000..93926c1
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/hadoop/mapreduce/lib/jobcontrol/CrunchControlledJob.java
@@ -0,0 +1,325 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.hadoop.mapreduce.lib.jobcontrol;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.crunch.impl.mr.run.RuntimeParameters;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.JobID;
+import org.apache.hadoop.util.StringUtils;
+
+import com.google.common.base.Objects;
+import com.google.common.collect.Lists;
+
+/**
+ * Wraps a MapReduce {@link Job} together with the jobs it depends on and
+ * tracks its lifecycle. A job starts in the WAITING state. If it has no
+ * depending jobs, or all of its depending jobs are in the SUCCESS state, it
+ * becomes READY; if any depending job fails, it becomes DEPENDENT_FAILED. A
+ * READY job can be submitted to Hadoop for execution, which moves it to the
+ * RUNNING state (or to FAILED if the submission itself fails). From RUNNING
+ * the job ends up in SUCCESS or FAILED, depending on the status of the job
+ * execution.
+ */
+public class CrunchControlledJob {
+
+  /** The states a job can be in. */
+  public static enum State {
+    SUCCESS, WAITING, RUNNING, READY, FAILED, DEPENDENT_FAILED
+  }
+
+  /** A callback that is run right before a job is submitted or after it completed. */
+  public static interface Hook {
+    public void run() throws IOException;
+  }
+
+  private static final Log LOG = LogFactory.getLog(CrunchControlledJob.class);
+
+  private final int jobID;
+  private final Job job; // mapreduce job to be executed.
+  // the jobs the current job depends on
+  private final List<CrunchControlledJob> dependingJobs;
+  private final Hook prepareHook;
+  private final Hook completionHook;
+  private State state;
+  // some info for human consumption, e.g. the reason why the job failed
+  private String message;
+  // the progress line that was logged last; used to suppress duplicate log lines
+  private String lastKnownProgress;
+
+  /**
+   * Construct a job.
+   *
+   * @param jobID
+   *          an ID used to match with its {@link org.apache.crunch.impl.mr.plan.JobPrototype}.
+   * @param job
+   *          a mapreduce job to be executed.
+   * @param prepareHook
+   *          a piece of code that will run before this job is submitted.
+   * @param completionHook
+   *          a piece of code that will run after this job gets completed.
+   */
+  public CrunchControlledJob(int jobID, Job job, Hook prepareHook, Hook completionHook) {
+    this.jobID = jobID;
+    this.job = job;
+    this.dependingJobs = Lists.newArrayList();
+    this.prepareHook = prepareHook;
+    this.completionHook = completionHook;
+    this.state = State.WAITING;
+    this.message = "just initialized";
+  }
+
+  /**
+   * @return a multi-line, human-readable summary of this job: name, IDs,
+   *         state, message and the names of the jobs it depends on
+   */
+  @Override
+  public String toString() {
+    StringBuilder sb = new StringBuilder();
+    sb.append("job name:\t").append(this.job.getJobName()).append("\n");
+    sb.append("job id:\t").append(this.jobID).append("\n");
+    sb.append("job state:\t").append(this.state).append("\n");
+    sb.append("job mapred id:\t").append(this.job.getJobID()).append("\n");
+    sb.append("job message:\t").append(this.message).append("\n");
+
+    // dependingJobs is final and assigned in the constructor, so it is never null
+    if (this.dependingJobs.isEmpty()) {
+      sb.append("job has no depending job:\t").append("\n");
+    } else {
+      sb.append("job has ").append(this.dependingJobs.size())
+          .append(" depending jobs:\n");
+      for (int i = 0; i < this.dependingJobs.size(); i++) {
+        sb.append("\t depending job ").append(i).append(":\t");
+        sb.append(this.dependingJobs.get(i).getJobName()).append("\n");
+      }
+    }
+    return sb.toString();
+  }
+
+  /**
+   * @return the job name of this job
+   */
+  public String getJobName() {
+    return job.getJobName();
+  }
+
+  /**
+   * Set the job name for this job.
+   *
+   * @param jobName
+   *          the job name
+   */
+  public void setJobName(String jobName) {
+    job.setJobName(jobName);
+  }
+
+  /**
+   * @return the job ID of this job
+   */
+  public int getJobID() {
+    return this.jobID;
+  }
+
+  /**
+   * @return the mapred ID of this job as assigned by the mapred framework.
+   */
+  public JobID getMapredJobID() {
+    return this.job.getJobID();
+  }
+
+  /**
+   * @return the mapreduce job
+   */
+  public synchronized Job getJob() {
+    return this.job;
+  }
+
+  /**
+   * @return the state of this job
+   */
+  public synchronized State getJobState() {
+    return this.state;
+  }
+
+  /**
+   * Set the state for this job.
+   *
+   * @param state
+   *          the new state for this job.
+   */
+  protected synchronized void setJobState(State state) {
+    this.state = state;
+  }
+
+  /**
+   * @return the message of this job
+   */
+  public synchronized String getMessage() {
+    return this.message;
+  }
+
+  /**
+   * Set the message for this job.
+   *
+   * @param message
+   *          the message for this job.
+   */
+  public synchronized void setMessage(String message) {
+    this.message = message;
+  }
+
+  /**
+   * @return the jobs this job depends on. This is the internal list itself,
+   *         not a copy.
+   */
+  public List<CrunchControlledJob> getDependentJobs() {
+    return this.dependingJobs;
+  }
+
+  /**
+   * Add a job to this job's dependency list. Dependent jobs can only be added
+   * while a Job is waiting to run, not during or afterwards.
+   *
+   * @param dependingJob
+   *          Job that this Job depends on.
+   * @return <tt>true</tt> if the Job was added.
+   */
+  public synchronized boolean addDependingJob(CrunchControlledJob dependingJob) {
+    if (this.state != State.WAITING) {
+      // only allowed to add jobs when waiting
+      return false;
+    }
+    return this.dependingJobs.add(dependingJob);
+  }
+
+  /**
+   * @return true if this job is in a complete state, i.e. SUCCESS, FAILED or
+   *         DEPENDENT_FAILED
+   */
+  public synchronized boolean isCompleted() {
+    return this.state == State.FAILED || this.state == State.DEPENDENT_FAILED
+        || this.state == State.SUCCESS;
+  }
+
+  /**
+   * @return true if this job is in READY state
+   */
+  public synchronized boolean isReady() {
+    return this.state == State.READY;
+  }
+
+  /**
+   * Kill the underlying mapreduce job.
+   */
+  public void killJob() throws IOException, InterruptedException {
+    job.killJob();
+  }
+
+  /**
+   * Check the state of this running job. The state may remain the same, become
+   * SUCCESS or FAILED. If the status check itself fails with an
+   * {@link IOException}, the job is marked FAILED and a best-effort attempt is
+   * made to kill it. The completion hook is run once the job is in a complete
+   * state.
+   */
+  private void checkRunningState() throws IOException, InterruptedException {
+    try {
+      if (job.isComplete()) {
+        if (job.isSuccessful()) {
+          this.state = State.SUCCESS;
+        } else {
+          this.state = State.FAILED;
+          this.message = "Job failed!";
+        }
+      } else if (job.getConfiguration().getBoolean(RuntimeParameters.LOG_JOB_PROGRESS, false)) {
+        // still running
+        logJobProgress();
+      }
+    } catch (IOException ioe) {
+      this.state = State.FAILED;
+      this.message = StringUtils.stringifyException(ioe);
+      try {
+        job.killJob();
+      } catch (IOException e) {
+        // Best effort only: the job is already marked FAILED, so record the
+        // problem instead of silently dropping it.
+        LOG.warn("Could not kill job \"" + getJobName() + "\" after a failed status check", e);
+      }
+    }
+    if (isCompleted()) {
+      completionHook.run();
+    }
+  }
+
+  /**
+   * Check and update the state of this job. The state changes depending on its
+   * current state and the states of the depending jobs.
+   *
+   * @return the state of this job after the update
+   */
+  synchronized State checkState() throws IOException, InterruptedException {
+    if (this.state == State.RUNNING) {
+      checkRunningState();
+    }
+    if (this.state != State.WAITING) {
+      return this.state;
+    }
+    if (this.dependingJobs.isEmpty()) {
+      this.state = State.READY;
+      return this.state;
+    }
+    int n = this.dependingJobs.size();
+    for (int i = 0; i < n; i++) {
+      CrunchControlledJob pred = this.dependingJobs.get(i);
+      State s = pred.checkState();
+      if (s == State.WAITING || s == State.READY || s == State.RUNNING) {
+        // a pred is still not completed, continue in WAITING state
+        break;
+      }
+      if (s == State.FAILED || s == State.DEPENDENT_FAILED) {
+        this.state = State.DEPENDENT_FAILED;
+        this.message = "depending job " + i + " with jobID " + pred.getJobID()
+            + " failed. " + pred.getMessage();
+        break;
+      }
+      // pred must be in success state; only the last one makes this job READY
+      if (i == n - 1) {
+        this.state = State.READY;
+      }
+    }
+
+    return this.state;
+  }
+
+  /**
+   * Run the prepare hook and submit this job to mapred. The state becomes
+   * RUNNING if submission is successful, FAILED otherwise.
+   */
+  protected synchronized void submit() {
+    try {
+      prepareHook.run();
+      job.submit();
+      this.state = State.RUNNING;
+      LOG.info("Running job \"" + getJobName() + "\"");
+      LOG.info("Job status available at: " + job.getTrackingURL());
+    } catch (Exception e) {
+      this.state = State.FAILED;
+      this.message = StringUtils.stringifyException(e);
+      // a failed submission is an error, not routine information
+      LOG.error("Error occurred starting job \"" + getJobName() + "\":");
+      LOG.error(getMessage());
+    }
+  }
+
+  /**
+   * Log the map and reduce progress of the job, but only when it differs from
+   * the progress that was logged last.
+   */
+  private void logJobProgress() throws IOException, InterruptedException {
+    String progress = String.format("map %.0f%% reduce %.0f%%",
+        100.0 * job.mapProgress(), 100.0 * job.reduceProgress());
+    if (!Objects.equal(lastKnownProgress, progress)) {
+      LOG.info(job.getJobName() + " progress: " + progress);
+      lastKnownProgress = progress;
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/hadoop/mapreduce/lib/jobcontrol/CrunchJobControl.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/hadoop/mapreduce/lib/jobcontrol/CrunchJobControl.java b/crunch-core/src/main/java/org/apache/crunch/hadoop/mapreduce/lib/jobcontrol/CrunchJobControl.java
new file mode 100644
index 0000000..727ab6f
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/hadoop/mapreduce/lib/jobcontrol/CrunchJobControl.java
@@ -0,0 +1,211 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.hadoop.mapreduce.lib.jobcontrol;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Hashtable;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.crunch.hadoop.mapreduce.lib.jobcontrol.CrunchControlledJob.State;
+
+/**
+ * Keeps track of a group of MapReduce jobs and the dependencies between them.
+ *
+ * Every job lives in exactly one of several tables, chosen by the state the
+ * job is currently in, and is moved to another table when its state changes.
+ *
+ * Client code uses this class to add jobs to the group, to retrieve the jobs
+ * that are in a particular state, and to advance the group through
+ * {@link #pollJobStatusAndStartNewOnes()}.
+ */
+public class CrunchJobControl {
+
+  private Map<Integer, CrunchControlledJob> waitingJobs;
+  private Map<Integer, CrunchControlledJob> readyJobs;
+  private Map<Integer, CrunchControlledJob> runningJobs;
+  private Map<Integer, CrunchControlledJob> successfulJobs;
+  private Map<Integer, CrunchControlledJob> failedJobs;
+
+  private Log log = LogFactory.getLog(CrunchJobControl.class);
+
+  private final String groupName;
+
+  /**
+   * Create a job control for a group of jobs; all state tables start empty.
+   *
+   * @param groupName
+   *          a name identifying this group
+   */
+  public CrunchJobControl(String groupName) {
+    this.groupName = groupName;
+    this.waitingJobs = new Hashtable<Integer, CrunchControlledJob>();
+    this.readyJobs = new Hashtable<Integer, CrunchControlledJob>();
+    this.runningJobs = new Hashtable<Integer, CrunchControlledJob>();
+    this.successfulJobs = new Hashtable<Integer, CrunchControlledJob>();
+    this.failedJobs = new Hashtable<Integer, CrunchControlledJob>();
+  }
+
+  /**
+   * Copy the jobs of a state table into a new list while holding the table's
+   * monitor.
+   */
+  private static List<CrunchControlledJob> toList(Map<Integer, CrunchControlledJob> jobs) {
+    synchronized (jobs) {
+      return new ArrayList<CrunchControlledJob>(jobs.values());
+    }
+  }
+
+  /**
+   * @return a snapshot of the jobs in the waiting state
+   */
+  public List<CrunchControlledJob> getWaitingJobList() {
+    return toList(this.waitingJobs);
+  }
+
+  /**
+   * @return a snapshot of the jobs in the running state
+   */
+  public List<CrunchControlledJob> getRunningJobList() {
+    return toList(this.runningJobs);
+  }
+
+  /**
+   * @return a snapshot of the jobs in the ready state
+   */
+  public List<CrunchControlledJob> getReadyJobsList() {
+    return toList(this.readyJobs);
+  }
+
+  /**
+   * @return a snapshot of the jobs in the success state
+   */
+  public List<CrunchControlledJob> getSuccessfulJobList() {
+    return toList(this.successfulJobs);
+  }
+
+  /**
+   * @return a snapshot of the jobs in the failed table, which holds jobs in
+   *         the FAILED and DEPENDENT_FAILED states
+   */
+  public List<CrunchControlledJob> getFailedJobList() {
+    return toList(this.failedJobs);
+  }
+
+  /**
+   * Store a job in the given table under its job ID.
+   */
+  private static void addToQueue(CrunchControlledJob aJob,
+      Map<Integer, CrunchControlledJob> queue) {
+    synchronized (queue) {
+      queue.put(aJob.getJobID(), aJob);
+    }
+  }
+
+  /**
+   * Store a job in the table that matches its current state.
+   */
+  private void addToQueue(CrunchControlledJob aJob) {
+    addToQueue(aJob, getQueue(aJob.getJobState()));
+  }
+
+  /**
+   * Look up the table that holds the jobs in the given state.
+   *
+   * @return the matching table, or null if there is none for the state
+   */
+  private Map<Integer, CrunchControlledJob> getQueue(State state) {
+    if (state == null) {
+      // a switch would throw on null; no table matches a null state
+      return null;
+    }
+    switch (state) {
+    case WAITING:
+      return this.waitingJobs;
+    case READY:
+      return this.readyJobs;
+    case RUNNING:
+      return this.runningJobs;
+    case SUCCESS:
+      return this.successfulJobs;
+    case FAILED:
+    case DEPENDENT_FAILED:
+      return this.failedJobs;
+    default:
+      return null;
+    }
+  }
+
+  /**
+   * Add a new job. Its state is reset to WAITING before it is stored.
+   *
+   * @param aJob
+   *          the new job
+   */
+  public synchronized void addJob(CrunchControlledJob aJob) {
+    aJob.setJobState(State.WAITING);
+    addToQueue(aJob);
+  }
+
+  /**
+   * Re-check every running job and file it under the state it is in now.
+   */
+  private synchronized void checkRunningJobs() throws IOException,
+      InterruptedException {
+    // swap in a fresh table first so that re-filed jobs do not land in the
+    // table that is being iterated
+    Map<Integer, CrunchControlledJob> previouslyRunning = this.runningJobs;
+    this.runningJobs = new Hashtable<Integer, CrunchControlledJob>();
+
+    for (CrunchControlledJob candidate : previouslyRunning.values()) {
+      candidate.checkState();
+      addToQueue(candidate);
+    }
+  }
+
+  /**
+   * Re-check every waiting job and file it under the state it is in now.
+   */
+  private synchronized void checkWaitingJobs() throws IOException,
+      InterruptedException {
+    Map<Integer, CrunchControlledJob> previouslyWaiting = this.waitingJobs;
+    this.waitingJobs = new Hashtable<Integer, CrunchControlledJob>();
+
+    for (CrunchControlledJob candidate : previouslyWaiting.values()) {
+      candidate.checkState();
+      addToQueue(candidate);
+    }
+  }
+
+  /**
+   * Submit every ready job to Hadoop and file it under the state it is in
+   * after the submission.
+   */
+  private synchronized void startReadyJobs() {
+    Map<Integer, CrunchControlledJob> previouslyReady = this.readyJobs;
+    this.readyJobs = new Hashtable<Integer, CrunchControlledJob>();
+
+    for (CrunchControlledJob candidate : previouslyReady.values()) {
+      candidate.submit();
+      addToQueue(candidate);
+    }
+  }
+
+  /**
+   * Kill every job in the running table that is not completed yet. A failure
+   * to kill one job is logged and does not stop the remaining jobs from being
+   * killed.
+   */
+  public synchronized void killAllRunningJobs() {
+    for (CrunchControlledJob job : runningJobs.values()) {
+      if (job.isCompleted()) {
+        continue;
+      }
+      try {
+        job.killJob();
+      } catch (Exception e) {
+        log.error("Exception killing job: " + job.getJobName(), e);
+      }
+    }
+  }
+
+  /**
+   * @return true if no job is waiting, ready or running any more
+   */
+  public synchronized boolean allFinished() {
+    return this.waitingJobs.isEmpty() && this.readyJobs.isEmpty()
+        && this.runningJobs.isEmpty();
+  }
+
+  /**
+   * Checks the states of the running jobs, updates the states of the waiting
+   * jobs, and submits the jobs in ready state (i.e. whose dependencies all
+   * finished in success).
+   */
+  public void pollJobStatusAndStartNewOnes() throws IOException, InterruptedException {
+    checkRunningJobs();
+    checkWaitingJobs();
+    startReadyJobs();
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/SingleUseIterable.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/SingleUseIterable.java b/crunch-core/src/main/java/org/apache/crunch/impl/SingleUseIterable.java
new file mode 100644
index 0000000..98f982f
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/SingleUseIterable.java
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl;
+
+import java.util.Iterator;
+
+/**
+ * An {@link Iterable} decorator, intended for a Reducer's input values, that
+ * hands out the iterator of the wrapped Iterable at most once. Any further
+ * call to {@link #iterator()} fails with an {@link IllegalStateException}.
+ */
+public class SingleUseIterable<T> implements Iterable<T> {
+
+  // flips to true on the first iterator() call
+  private boolean consumed = false;
+  private Iterable<T> delegate;
+
+  /**
+   * Wrap an Iterable whose iterator may only be requested once.
+   *
+   * @param toWrap iterable to wrap
+   */
+  public SingleUseIterable(Iterable<T> toWrap) {
+    this.delegate = toWrap;
+  }
+
+  /**
+   * @return the iterator of the wrapped Iterable
+   * @throws IllegalStateException if this method has been called before
+   */
+  @Override
+  public Iterator<T> iterator() {
+    if (!consumed) {
+      // mark as consumed before delegating, so the instance counts as used
+      // even if the wrapped Iterable throws
+      consumed = true;
+      return delegate.iterator();
+    }
+    throw new IllegalStateException("iterator() can only be called once on this Iterable");
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mem/MemPipeline.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mem/MemPipeline.java b/crunch-core/src/main/java/org/apache/crunch/impl/mem/MemPipeline.java
new file mode 100644
index 0000000..272b2af
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mem/MemPipeline.java
@@ -0,0 +1,275 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mem;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.crunch.CrunchRuntimeException;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.PipelineExecution;
+import org.apache.crunch.PipelineResult;
+import org.apache.crunch.Source;
+import org.apache.crunch.TableSource;
+import org.apache.crunch.Target;
+import org.apache.crunch.Target.WriteMode;
+import org.apache.crunch.impl.mem.collect.MemCollection;
+import org.apache.crunch.impl.mem.collect.MemTable;
+import org.apache.crunch.io.At;
+import org.apache.crunch.io.PathTarget;
+import org.apache.crunch.io.ReadableSource;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.Counters;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+
+public class MemPipeline implements Pipeline {
+
+ private static final Log LOG = LogFactory.getLog(MemPipeline.class);
+ private static Counters COUNTERS = new Counters();
+ private static final MemPipeline INSTANCE = new MemPipeline();
+
+ private int outputIndex = 0;
+
+ public static Counters getCounters() {
+ return COUNTERS;
+ }
+
+ public static void clearCounters() {
+ COUNTERS = new Counters();
+ }
+
+ public static Pipeline getInstance() {
+ return INSTANCE;
+ }
+
+ public static <T> PCollection<T> collectionOf(T... ts) {
+ return new MemCollection<T>(ImmutableList.copyOf(ts));
+ }
+
+ public static <T> PCollection<T> collectionOf(Iterable<T> collect) {
+ return new MemCollection<T>(collect);
+ }
+
+ public static <T> PCollection<T> typedCollectionOf(PType<T> ptype, T... ts) {
+ return new MemCollection<T>(ImmutableList.copyOf(ts), ptype, null);
+ }
+
+ public static <T> PCollection<T> typedCollectionOf(PType<T> ptype, Iterable<T> collect) {
+ return new MemCollection<T>(collect, ptype, null);
+ }
+
+ public static <S, T> PTable<S, T> tableOf(S s, T t, Object... more) {
+ List<Pair<S, T>> pairs = Lists.newArrayList();
+ pairs.add(Pair.of(s, t));
+ for (int i = 0; i < more.length; i += 2) {
+ pairs.add(Pair.of((S) more[i], (T) more[i + 1]));
+ }
+ return new MemTable<S, T>(pairs);
+ }
+
+ public static <S, T> PTable<S, T> typedTableOf(PTableType<S, T> ptype, S s, T t, Object... more) {
+ List<Pair<S, T>> pairs = Lists.newArrayList();
+ pairs.add(Pair.of(s, t));
+ for (int i = 0; i < more.length; i += 2) {
+ pairs.add(Pair.of((S) more[i], (T) more[i + 1]));
+ }
+ return new MemTable<S, T>(pairs, ptype, null);
+ }
+
+ public static <S, T> PTable<S, T> tableOf(Iterable<Pair<S, T>> pairs) {
+ return new MemTable<S, T>(pairs);
+ }
+
+ public static <S, T> PTable<S, T> typedTableOf(PTableType<S, T> ptype, Iterable<Pair<S, T>> pairs) {
+ return new MemTable<S, T>(pairs, ptype, null);
+ }
+
+ private Configuration conf = new Configuration();
+ private Set<Target> activeTargets = Sets.newHashSet();
+
+ private MemPipeline() {
+ }
+
+ @Override
+ public void setConfiguration(Configuration conf) {
+ this.conf = conf;
+ }
+
+ @Override
+ public Configuration getConfiguration() {
+ return conf;
+ }
+
+ @Override
+ public <T> PCollection<T> read(Source<T> source) {
+ if (source instanceof ReadableSource) {
+ try {
+ Iterable<T> iterable = ((ReadableSource<T>) source).read(conf);
+ return new MemCollection<T>(iterable, source.getType(), source.toString());
+ } catch (IOException e) {
+ LOG.error("Exception reading source: " + source.toString(), e);
+ throw new IllegalStateException(e);
+ }
+ }
+ LOG.error("Source " + source + " is not readable");
+ throw new IllegalStateException("Source " + source + " is not readable");
+ }
+
+ @Override
+ public <K, V> PTable<K, V> read(TableSource<K, V> source) {
+ if (source instanceof ReadableSource) {
+ try {
+ Iterable<Pair<K, V>> iterable = ((ReadableSource<Pair<K, V>>) source).read(conf);
+ return new MemTable<K, V>(iterable, source.getTableType(), source.toString());
+ } catch (IOException e) {
+ LOG.error("Exception reading source: " + source.toString(), e);
+ throw new IllegalStateException(e);
+ }
+ }
+ LOG.error("Source " + source + " is not readable");
+ throw new IllegalStateException("Source " + source + " is not readable");
+ }
+
+ @Override
+ public void write(PCollection<?> collection, Target target) {
+ write(collection, target, Target.WriteMode.DEFAULT);
+ }
+
+ @Override
+ public void write(PCollection<?> collection, Target target,
+ Target.WriteMode writeMode) {
+ target.handleExisting(writeMode, getConfiguration());
+ if (writeMode != WriteMode.APPEND && activeTargets.contains(target)) {
+ throw new CrunchRuntimeException("Target " + target + " is already written in the current run." +
+ " Use WriteMode.APPEND in order to write additional data to it.");
+ }
+ activeTargets.add(target);
+ if (target instanceof PathTarget) {
+ Path path = ((PathTarget) target).getPath();
+ try {
+ FileSystem fs = path.getFileSystem(conf);
+ FSDataOutputStream os = fs.create(new Path(path, "out" + outputIndex));
+ outputIndex++;
+ if (collection instanceof PTable) {
+ for (Object o : collection.materialize()) {
+ Pair p = (Pair) o;
+ os.writeBytes(p.first().toString());
+ os.writeBytes("\t");
+ os.writeBytes(p.second().toString());
+ os.writeBytes("\r\n");
+ }
+ } else {
+ for (Object o : collection.materialize()) {
+ os.writeBytes(o.toString() + "\r\n");
+ }
+ }
+ os.close();
+ } catch (IOException e) {
+ LOG.error("Exception writing target: " + target, e);
+ }
+ } else {
+ LOG.error("Target " + target + " is not a PathTarget instance");
+ }
+ }
+
+ @Override
+ public PCollection<String> readTextFile(String pathName) {
+ return read(At.textFile(pathName));
+ }
+
+ @Override
+ public <T> void writeTextFile(PCollection<T> collection, String pathName) {
+ write(collection, At.textFile(pathName));
+ }
+
+ @Override
+ public <T> Iterable<T> materialize(PCollection<T> pcollection) {
+ return pcollection.materialize();
+ }
+
+ @Override
+ public PipelineExecution runAsync() {
+ activeTargets.clear();
+ return new PipelineExecution() {
+ @Override
+ public String getPlanDotFile() {
+ return "";
+ }
+
+ @Override
+ public void waitFor(long timeout, TimeUnit timeUnit) throws InterruptedException {
+ // no-op
+ }
+
+ @Override
+ public void waitUntilDone() throws InterruptedException {
+ // no-op
+ }
+
+ @Override
+ public Status getStatus() {
+ return Status.SUCCEEDED;
+ }
+
+ @Override
+ public PipelineResult getResult() {
+ return new PipelineResult(ImmutableList.of(new PipelineResult.StageResult("MemPipelineStage", COUNTERS)));
+ }
+
+ @Override
+ public void kill() {
+ }
+ };
+ }
+
+ @Override
+ public PipelineResult run() {
+ activeTargets.clear();
+ return new PipelineResult(ImmutableList.of(new PipelineResult.StageResult("MemPipelineStage", COUNTERS)));
+ }
+
+ @Override
+ public PipelineResult done() {
+ return run();
+ }
+
+ @Override
+ public void enableDebug() {
+ LOG.info("Note: in-memory pipelines do not have debug logging");
+ }
+
+ @Override
+ public String getName() {
+ return "Memory Pipeline";
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mem/collect/MemCollection.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mem/collect/MemCollection.java b/crunch-core/src/main/java/org/apache/crunch/impl/mem/collect/MemCollection.java
new file mode 100644
index 0000000..c97fac6
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mem/collect/MemCollection.java
@@ -0,0 +1,295 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mem.collect;
+
+import java.lang.reflect.Method;
+import java.util.Collection;
+
+import javassist.util.proxy.MethodFilter;
+import javassist.util.proxy.MethodHandler;
+import javassist.util.proxy.ProxyFactory;
+
+import org.apache.crunch.DoFn;
+import org.apache.crunch.FilterFn;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PObject;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.ParallelDoOptions;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.Target;
+import org.apache.crunch.fn.ExtractKeyFn;
+import org.apache.crunch.impl.mem.MemPipeline;
+import org.apache.crunch.impl.mem.emit.InMemoryEmitter;
+import org.apache.crunch.lib.Aggregate;
+import org.apache.crunch.materialize.pobject.CollectionPObject;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.OutputCommitter;
+import org.apache.hadoop.mapreduce.RecordWriter;
+import org.apache.hadoop.mapreduce.StatusReporter;
+import org.apache.hadoop.mapreduce.TaskAttemptID;
+import org.apache.hadoop.mapreduce.TaskInputOutputContext;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+
+public class MemCollection<S> implements PCollection<S> {
+
+ private final Collection<S> collect;
+ private final PType<S> ptype;
+ private String name;
+
+ public MemCollection(Iterable<S> collect) {
+ this(collect, null, null);
+ }
+
+ public MemCollection(Iterable<S> collect, PType<S> ptype) {
+ this(collect, ptype, null);
+ }
+
+ public MemCollection(Iterable<S> collect, PType<S> ptype, String name) {
+ this.collect = ImmutableList.copyOf(collect);
+ this.ptype = ptype;
+ this.name = name;
+ }
+
+ @Override
+ public Pipeline getPipeline() {
+ return MemPipeline.getInstance();
+ }
+
+ @Override
+ public PCollection<S> union(PCollection<S> other) {
+ return union(new PCollection[] { other });
+ }
+
+ @Override
+ public PCollection<S> union(PCollection<S>... collections) {
+ Collection<S> output = Lists.newArrayList();
+ for (PCollection<S> pcollect : collections) {
+ for (S s : pcollect.materialize()) {
+ output.add(s);
+ }
+ }
+ output.addAll(collect);
+ return new MemCollection<S>(output, collections[0].getPType());
+ }
+
+ @Override
+ public <T> PCollection<T> parallelDo(DoFn<S, T> doFn, PType<T> type) {
+ return parallelDo(null, doFn, type);
+ }
+
+ @Override
+ public <T> PCollection<T> parallelDo(String name, DoFn<S, T> doFn, PType<T> type) {
+ return parallelDo(name, doFn, type, ParallelDoOptions.builder().build());
+ }
+
+ @Override
+ public <T> PCollection<T> parallelDo(String name, DoFn<S, T> doFn, PType<T> type,
+ ParallelDoOptions options) {
+ InMemoryEmitter<T> emitter = new InMemoryEmitter<T>();
+ doFn.setContext(getInMemoryContext(getPipeline().getConfiguration()));
+ doFn.initialize();
+ for (S s : collect) {
+ doFn.process(s, emitter);
+ }
+ doFn.cleanup(emitter);
+ return new MemCollection<T>(emitter.getOutput(), type, name);
+ }
+
+ @Override
+ public <K, V> PTable<K, V> parallelDo(DoFn<S, Pair<K, V>> doFn, PTableType<K, V> type) {
+ return parallelDo(null, doFn, type);
+ }
+
+ @Override
+ public <K, V> PTable<K, V> parallelDo(String name, DoFn<S, Pair<K, V>> doFn, PTableType<K, V> type) {
+ return parallelDo(name, doFn, type, ParallelDoOptions.builder().build());
+ }
+
+ @Override
+ public <K, V> PTable<K, V> parallelDo(String name, DoFn<S, Pair<K, V>> doFn, PTableType<K, V> type,
+ ParallelDoOptions options) {
+ InMemoryEmitter<Pair<K, V>> emitter = new InMemoryEmitter<Pair<K, V>>();
+ doFn.setContext(getInMemoryContext(getPipeline().getConfiguration()));
+ doFn.initialize();
+ for (S s : collect) {
+ doFn.process(s, emitter);
+ }
+ doFn.cleanup(emitter);
+ return new MemTable<K, V>(emitter.getOutput(), type, name);
+ }
+
+ @Override
+ public PCollection<S> write(Target target) {
+ getPipeline().write(this, target);
+ return this;
+ }
+
+ @Override
+ public PCollection<S> write(Target target, Target.WriteMode writeMode) {
+ getPipeline().write(this, target, writeMode);
+ return this;
+ }
+
+ @Override
+ public Iterable<S> materialize() {
+ return collect;
+ }
+
+ /** {@inheritDoc} */
+ @Override
+ public PObject<Collection<S>> asCollection() {
+ return new CollectionPObject<S>(this);
+ }
+
+ public Collection<S> getCollection() {
+ return collect;
+ }
+
+ @Override
+ public PType<S> getPType() {
+ return ptype;
+ }
+
+ @Override
+ public PTypeFamily getTypeFamily() {
+ if (ptype != null) {
+ return ptype.getFamily();
+ }
+ return null;
+ }
+
+ @Override
+ public long getSize() {
+ return collect.isEmpty() ? 0 : 1; // getSize is only used for pipeline optimization in MR
+ }
+
+ @Override
+ public String getName() {
+ return name;
+ }
+
+ @Override
+ public String toString() {
+ return collect.toString();
+ }
+
+ @Override
+ public PTable<S, Long> count() {
+ return Aggregate.count(this);
+ }
+
+ @Override
+ public PObject<Long> length() {
+ return Aggregate.length(this);
+ }
+
+ @Override
+ public PObject<S> max() {
+ return Aggregate.max(this);
+ }
+
+ @Override
+ public PObject<S> min() {
+ return Aggregate.min(this);
+ }
+
+ @Override
+ public PCollection<S> filter(FilterFn<S> filterFn) {
+ return parallelDo(filterFn, getPType());
+ }
+
+ @Override
+ public PCollection<S> filter(String name, FilterFn<S> filterFn) {
+ return parallelDo(name, filterFn, getPType());
+ }
+
+ @Override
+ public <K> PTable<K, S> by(MapFn<S, K> mapFn, PType<K> keyType) {
+ return parallelDo(new ExtractKeyFn<K, S>(mapFn), getTypeFamily().tableOf(keyType, getPType()));
+ }
+
+ @Override
+ public <K> PTable<K, S> by(String name, MapFn<S, K> mapFn, PType<K> keyType) {
+ return parallelDo(name, new ExtractKeyFn<K, S>(mapFn), getTypeFamily().tableOf(keyType, getPType()));
+ }
+
+ /**
+ * The method creates a {@link TaskInputOutputContext} that will just provide
+ * {@linkplain Configuration}. The method has been implemented with Javassist
+ * because the API differs between versions of Hadoop: in Hadoop 1.0.3
+ * {@linkplain TaskInputOutputContext} is an abstract class, while in version 2
+ * it is an interface.
+ * <p>
+ * Note: The intention of this is to provide the bare essentials that are
+ * required to make the {@linkplain MemPipeline} work. It lacks even the basic
+ * things that could provide some support for unit testing a pipeline.
+ */
+ private static TaskInputOutputContext<?, ?, ?, ?> getInMemoryContext(final Configuration conf) {
+ ProxyFactory factory = new ProxyFactory();
+ Class<TaskInputOutputContext> superType = TaskInputOutputContext.class;
+ Class[] types = new Class[0];
+ Object[] args = new Object[0];
+ if (superType.isInterface()) {
+ factory.setInterfaces(new Class[] { superType });
+ } else {
+ types = new Class[] { Configuration.class, TaskAttemptID.class, RecordWriter.class, OutputCommitter.class,
+ StatusReporter.class };
+ args = new Object[] { conf, new TaskAttemptID(), null, null, null };
+ factory.setSuperclass(superType);
+ }
+ factory.setFilter(new MethodFilter() {
+ @Override
+ public boolean isHandled(Method m) {
+ String name = m.getName();
+ return "getConfiguration".equals(name) || "getCounter".equals(name) || "progress".equals(name);
+ }
+ });
+ MethodHandler handler = new MethodHandler() {
+ @Override
+ public Object invoke(Object arg0, Method m, Method arg2, Object[] args) throws Throwable {
+ String name = m.getName();
+ if ("getConfiguration".equals(name)) {
+ return conf;
+ } else if ("progress".equals(name)) {
+ // no-op
+ return null;
+ } else { // getCounter
+ if (args.length == 1) {
+ return MemPipeline.getCounters().findCounter((Enum<?>) args[0]);
+ } else {
+ return MemPipeline.getCounters().findCounter((String) args[0], (String) args[1]);
+ }
+ }
+ }
+ };
+ try {
+ Object newInstance = factory.create(types, args, handler);
+ return (TaskInputOutputContext<?, ?, ?, ?>) newInstance;
+ } catch (Exception e) {
+ e.printStackTrace();
+ throw new RuntimeException(e);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mem/collect/MemGroupedTable.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mem/collect/MemGroupedTable.java b/crunch-core/src/main/java/org/apache/crunch/impl/mem/collect/MemGroupedTable.java
new file mode 100644
index 0000000..d105bb4
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mem/collect/MemGroupedTable.java
@@ -0,0 +1,113 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mem.collect;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+
+import org.apache.crunch.Aggregator;
+import org.apache.crunch.CombineFn;
+import org.apache.crunch.GroupingOptions;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PGroupedTable;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.Target;
+import org.apache.crunch.fn.Aggregators;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.hadoop.io.RawComparator;
+import org.apache.hadoop.util.ReflectionUtils;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
+class MemGroupedTable<K, V> extends MemCollection<Pair<K, Iterable<V>>> implements PGroupedTable<K, V> {
+
+ private final MemTable<K, V> parent;
+
+ private static <S, T> Iterable<Pair<S, Iterable<T>>> buildMap(MemTable<S, T> parent, GroupingOptions options) {
+ PType<S> keyType = parent.getKeyType();
+ Shuffler<S, T> shuffler = Shuffler.create(keyType, options, parent.getPipeline());
+
+ for (Pair<S, T> pair : parent.materialize()) {
+ shuffler.add(pair);
+ }
+
+ return shuffler;
+ }
+
+ public MemGroupedTable(MemTable<K, V> parent, GroupingOptions options) {
+ super(buildMap(parent, options));
+ this.parent = parent;
+ }
+
+ @Override
+ public PCollection<Pair<K, Iterable<V>>> union(PCollection<Pair<K, Iterable<V>>>... collections) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public PCollection<Pair<K, Iterable<V>>> write(Target target) {
+ getPipeline().write(this.ungroup(), target);
+ return this;
+ }
+
+ @Override
+ public PType<Pair<K, Iterable<V>>> getPType() {
+ PTableType<K, V> parentType = parent.getPTableType();
+ if (parentType != null) {
+ return parentType.getGroupedTableType();
+ }
+ return null;
+ }
+
+ @Override
+ public PTypeFamily getTypeFamily() {
+ return parent.getTypeFamily();
+ }
+
+ @Override
+ public long getSize() {
+ return 1; // getSize is only used for pipeline optimization in MR
+ }
+
+ @Override
+ public String getName() {
+ return "MemGrouped(" + parent.getName() + ")";
+ }
+
+ @Override
+ public PTable<K, V> combineValues(CombineFn<K, V> combineFn) {
+ return parallelDo(combineFn, parent.getPTableType());
+ }
+
+ @Override
+ public PTable<K, V> combineValues(Aggregator<V> agg) {
+ return combineValues(Aggregators.<K, V>toCombineFn(agg));
+ }
+
+ @Override
+ public PTable<K, V> ungroup() {
+ return parent;
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mem/collect/MemTable.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mem/collect/MemTable.java b/crunch-core/src/main/java/org/apache/crunch/impl/mem/collect/MemTable.java
new file mode 100644
index 0000000..f8a5960
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mem/collect/MemTable.java
@@ -0,0 +1,177 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mem.collect;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.crunch.FilterFn;
+import org.apache.crunch.GroupingOptions;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PGroupedTable;
+import org.apache.crunch.PObject;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Target;
+import org.apache.crunch.lib.Aggregate;
+import org.apache.crunch.lib.Cogroup;
+import org.apache.crunch.lib.Join;
+import org.apache.crunch.lib.PTables;
+import org.apache.crunch.materialize.MaterializableMap;
+import org.apache.crunch.materialize.pobject.MapPObject;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+
+import com.google.common.collect.Lists;
+
+public class MemTable<K, V> extends MemCollection<Pair<K, V>> implements PTable<K, V> {
+
+ private PTableType<K, V> ptype;
+
+ public MemTable(Iterable<Pair<K, V>> collect) {
+ this(collect, null, null);
+ }
+
+ public MemTable(Iterable<Pair<K, V>> collect, PTableType<K, V> ptype, String name) {
+ super(collect, ptype, name);
+ this.ptype = ptype;
+ }
+
+ @Override
+ public PTable<K, V> union(PTable<K, V> other) {
+ return union(new PTable[] { other });
+ }
+
+ @Override
+ public PTable<K, V> union(PTable<K, V>... others) {
+ List<Pair<K, V>> values = Lists.newArrayList();
+ values.addAll(getCollection());
+ for (PTable<K, V> ptable : others) {
+ for (Pair<K, V> p : ptable.materialize()) {
+ values.add(p);
+ }
+ }
+ return new MemTable<K, V>(values, others[0].getPTableType(), null);
+ }
+
+ @Override
+ public PGroupedTable<K, V> groupByKey() {
+ return groupByKey(null);
+ }
+
+ @Override
+ public PGroupedTable<K, V> groupByKey(int numPartitions) {
+ return groupByKey(null);
+ }
+
+ @Override
+ public PGroupedTable<K, V> groupByKey(GroupingOptions options) {
+ return new MemGroupedTable<K, V>(this, options);
+ }
+
+ @Override
+ public PTable<K, V> write(Target target) {
+ super.write(target);
+ return this;
+ }
+
+ @Override
+ public PTable<K, V> write(Target target, Target.WriteMode writeMode) {
+ getPipeline().write(this, target, writeMode);
+ return this;
+ }
+
+ @Override
+ public PTableType<K, V> getPTableType() {
+ return ptype;
+ }
+
+ @Override
+ public PType<K> getKeyType() {
+ if (ptype != null) {
+ return ptype.getKeyType();
+ }
+ return null;
+ }
+
+ @Override
+ public PType<V> getValueType() {
+ if (ptype != null) {
+ return ptype.getValueType();
+ }
+ return null;
+ }
+
+ @Override
+ public PTable<K, V> filter(FilterFn<Pair<K, V>> filterFn) {
+ return parallelDo(filterFn, getPTableType());
+ }
+
+ @Override
+ public PTable<K, V> filter(String name, FilterFn<Pair<K, V>> filterFn) {
+ return parallelDo(name, filterFn, getPTableType());
+ }
+
+ @Override
+ public PTable<K, V> top(int count) {
+ return Aggregate.top(this, count, true);
+ }
+
+ @Override
+ public PTable<K, V> bottom(int count) {
+ return Aggregate.top(this, count, false);
+ }
+
+ @Override
+ public PTable<K, Collection<V>> collectValues() {
+ return Aggregate.collectValues(this);
+ }
+
+ @Override
+ public <U> PTable<K, Pair<V, U>> join(PTable<K, U> other) {
+ return Join.join(this, other);
+ }
+
+ @Override
+ public <U> PTable<K, Pair<Collection<V>, Collection<U>>> cogroup(PTable<K, U> other) {
+ return Cogroup.cogroup(this, other);
+ }
+
+ @Override
+ public PCollection<K> keys() {
+ return PTables.keys(this);
+ }
+
+ @Override
+ public PCollection<V> values() {
+ return PTables.values(this);
+ }
+
+ @Override
+ public Map<K, V> materializeToMap() {
+ return new MaterializableMap<K, V>(this.materialize());
+ }
+
+ /** {@inheritDoc} */
+ @Override
+ public PObject<Map<K, V>> asMap() {
+ return new MapPObject<K, V>(this);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mem/collect/Shuffler.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mem/collect/Shuffler.java b/crunch-core/src/main/java/org/apache/crunch/impl/mem/collect/Shuffler.java
new file mode 100644
index 0000000..2e8f9eb
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mem/collect/Shuffler.java
@@ -0,0 +1,149 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mem.collect;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.TreeMap;
+
+import org.apache.crunch.GroupingOptions;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.impl.SingleUseIterable;
+import org.apache.crunch.types.PType;
+import org.apache.hadoop.io.RawComparator;
+import org.apache.hadoop.util.ReflectionUtils;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Iterators;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
+/**
+ * In-memory versions of common MapReduce patterns for aggregating key-value data.
+ */
+abstract class Shuffler<K, V> implements Iterable<Pair<K, Iterable<V>>> {
+
+ public abstract void add(Pair<K, V> record);
+
+ private static <K, V> Map<K, V> getMapForKeyType(PType<?> ptype) {
+ if (ptype != null && Comparable.class.isAssignableFrom(ptype.getTypeClass())) {
+ return new TreeMap<K, V>();
+ } else {
+ return Maps.newHashMap();
+ }
+ }
+
+ public static <S, T> Shuffler<S, T> create(PType<S> keyType, GroupingOptions options,
+ Pipeline pipeline) {
+ Map<S, Collection<T>> map = getMapForKeyType(keyType);
+
+ if (options != null) {
+ if (Pair.class.equals(keyType.getTypeClass()) && options.getGroupingComparatorClass() != null) {
+ PType<?> pairKey = keyType.getSubTypes().get(0);
+ return new SecondarySortShuffler(getMapForKeyType(pairKey));
+ } else if (options.getSortComparatorClass() != null) {
+ RawComparator<S> rc = ReflectionUtils.newInstance(options.getSortComparatorClass(),
+ pipeline.getConfiguration());
+ map = new TreeMap<S, Collection<T>>(rc);
+ }
+ }
+
+ return new MapShuffler<S, T>(map);
+ }
+
+ private static class HFunction<K, V> implements Function<Map.Entry<K, Collection<V>>, Pair<K, Iterable<V>>> {
+ @Override
+ public Pair<K, Iterable<V>> apply(Map.Entry<K, Collection<V>> input) {
+ return Pair.<K, Iterable<V>>of(input.getKey(), new SingleUseIterable<V>(input.getValue()));
+ }
+ }
+
+ private static class MapShuffler<K, V> extends Shuffler<K, V> {
+ private final Map<K, Collection<V>> map;
+
+ public MapShuffler(Map<K, Collection<V>> map) {
+ this.map = map;
+ }
+
+ @Override
+ public Iterator<Pair<K, Iterable<V>>> iterator() {
+ return Iterators.transform(map.entrySet().iterator(),
+ new HFunction<K, V>());
+ }
+
+ @Override
+ public void add(Pair<K, V> record) {
+ if (!map.containsKey(record.first())) {
+ Collection<V> values = Lists.newArrayList();
+ map.put(record.first(), values);
+ }
+ map.get(record.first()).add(record.second());
+ }
+ }
+
+ private static class SSFunction<K, SK, V> implements
+ Function<Map.Entry<K, List<Pair<SK, V>>>, Pair<Pair<K, SK>, Iterable<V>>> {
+ @Override
+ public Pair<Pair<K, SK>, Iterable<V>> apply(Entry<K, List<Pair<SK, V>>> input) {
+ List<Pair<SK, V>> values = input.getValue();
+ Collections.sort(values, new Comparator<Pair<SK, V>>() {
+ @Override
+ public int compare(Pair<SK, V> o1, Pair<SK, V> o2) {
+ return ((Comparable) o1.first()).compareTo(o2.first());
+ }
+ });
+ Pair<K, SK> key = Pair.of(input.getKey(), values.get(0).first());
+ return Pair.of(key, Iterables.transform(values, new Function<Pair<SK, V>, V>() {
+ @Override
+ public V apply(Pair<SK, V> input) {
+ return input.second();
+ }
+ }));
+ }
+ }
+
+ private static class SecondarySortShuffler<K, SK, V> extends Shuffler<Pair<K, SK>, V> {
+
+ private Map<K, List<Pair<SK, V>>> map;
+
+ public SecondarySortShuffler(Map<K, List<Pair<SK, V>>> map) {
+ this.map = map;
+ }
+
+ @Override
+ public Iterator<Pair<Pair<K, SK>, Iterable<V>>> iterator() {
+ return Iterators.transform(map.entrySet().iterator(), new SSFunction<K, SK, V>());
+ }
+
+ @Override
+ public void add(Pair<Pair<K, SK>, V> record) {
+ K primary = record.first().first();
+ if (!map.containsKey(primary)) {
+ map.put(primary, Lists.<Pair<SK, V>>newArrayList());
+ }
+ map.get(primary).add(Pair.of(record.first().second(), record.second()));
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mem/emit/InMemoryEmitter.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mem/emit/InMemoryEmitter.java b/crunch-core/src/main/java/org/apache/crunch/impl/mem/emit/InMemoryEmitter.java
new file mode 100644
index 0000000..6976615
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mem/emit/InMemoryEmitter.java
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mem.emit;
+
+import java.util.List;
+
+import org.apache.crunch.Emitter;
+
+import com.google.common.collect.Lists;
+
+/**
+ * An {@code Emitter} that appends every emitted record to a backing
+ * {@code List} instead of writing it anywhere else.
+ *
+ * @param <T> the type of the emitted records
+ */
+public class InMemoryEmitter<T> implements Emitter<T> {
+
+  private final List<T> output;
+
+  /**
+   * Creates an emitter backed by a newly created, empty list.
+   */
+  public InMemoryEmitter() {
+    this.output = Lists.newArrayList();
+  }
+
+  /**
+   * Creates an emitter backed by the given list.
+   *
+   * @param output the list that receives emitted records; it is used directly, not copied
+   */
+  public InMemoryEmitter(List<T> output) {
+    this.output = output;
+  }
+
+  /**
+   * Adds the record to the backing list.
+   *
+   * @param emitted the record to store
+   */
+  @Override
+  public void emit(T emitted) {
+    this.output.add(emitted);
+  }
+
+  /**
+   * Does nothing: records are already in the backing list once emitted.
+   */
+  @Override
+  public void flush() {
+    // Intentionally empty.
+  }
+
+  /**
+   * @return the backing list itself (not a copy), holding everything emitted so far
+   */
+  public List<T> getOutput() {
+    return this.output;
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mem/package-info.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mem/package-info.java b/crunch-core/src/main/java/org/apache/crunch/impl/mem/package-info.java
new file mode 100644
index 0000000..a55b673
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mem/package-info.java
@@ -0,0 +1,22 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * In-memory Pipeline implementation for rapid prototyping and testing.
+ */
+package org.apache.crunch.impl.mem;
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/MRPipeline.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/MRPipeline.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/MRPipeline.java
new file mode 100644
index 0000000..00cf486
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/MRPipeline.java
@@ -0,0 +1,396 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.crunch.CrunchRuntimeException;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.PipelineExecution;
+import org.apache.crunch.PipelineResult;
+import org.apache.crunch.Source;
+import org.apache.crunch.SourceTarget;
+import org.apache.crunch.TableSource;
+import org.apache.crunch.Target;
+import org.apache.crunch.Target.WriteMode;
+import org.apache.crunch.fn.IdentityFn;
+import org.apache.crunch.impl.mr.collect.InputCollection;
+import org.apache.crunch.impl.mr.collect.InputTable;
+import org.apache.crunch.impl.mr.collect.PCollectionImpl;
+import org.apache.crunch.impl.mr.collect.PGroupedTableImpl;
+import org.apache.crunch.impl.mr.collect.UnionCollection;
+import org.apache.crunch.impl.mr.collect.UnionTable;
+import org.apache.crunch.impl.mr.exec.MRExecutor;
+import org.apache.crunch.impl.mr.plan.MSCRPlanner;
+import org.apache.crunch.impl.mr.run.RuntimeParameters;
+import org.apache.crunch.io.From;
+import org.apache.crunch.io.ReadableSource;
+import org.apache.crunch.io.ReadableSourceTarget;
+import org.apache.crunch.io.To;
+import org.apache.crunch.materialize.MaterializableIterable;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.writable.Writables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+
+/**
+ * Pipeline implementation that is executed within Hadoop MapReduce.
+ * <p>
+ * The pipeline records which collections must be written to which targets and
+ * which collections were requested via {@link #materialize(PCollection)}. That
+ * bookkeeping is handed to the planner when the pipeline is run. Intermediate
+ * and materialized outputs are placed in a per-pipeline temporary directory
+ * that {@link #done()} attempts to delete.
+ */
+public class MRPipeline implements Pipeline {
+
+  private static final Log LOG = LogFactory.getLog(MRPipeline.class);
+
+  private static final Random RANDOM = new Random();
+
+  private final Class<?> jarClass;
+  private final String name;
+  // Targets each collection must be written to on the next run; cleared by runAsync().
+  private final Map<PCollectionImpl<?>, Set<Target>> outputTargets;
+  // Iterables handed out by materialize() that have not yet been passed to a plan.
+  private final Map<PCollectionImpl<?>, MaterializableIterable<?>> outputTargetsToMaterialize;
+  private Path tempDirectory;
+  private int tempFileIndex;
+  private int nextAnonymousStageId;
+
+  private Configuration conf;
+
+  /**
+   * Instantiate with a default Configuration and name.
+   *
+   * @param jarClass Class containing the main driver method for running the pipeline
+   */
+  public MRPipeline(Class<?> jarClass) {
+    this(jarClass, new Configuration());
+  }
+
+  /**
+   * Instantiate with a custom pipeline name. The name will be displayed in the Hadoop JobTracker.
+   *
+   * @param jarClass Class containing the main driver method for running the pipeline
+   * @param name Display name of the pipeline
+   */
+  public MRPipeline(Class<?> jarClass, String name) {
+    this(jarClass, name, new Configuration());
+  }
+
+  /**
+   * Instantiate with a custom configuration and default naming.
+   *
+   * @param jarClass Class containing the main driver method for running the pipeline
+   * @param conf Configuration to be used within all MapReduce jobs run in the pipeline
+   */
+  public MRPipeline(Class<?> jarClass, Configuration conf) {
+    this(jarClass, jarClass.getName(), conf);
+  }
+
+  /**
+   * Instantiate with a custom name and configuration. The name will be displayed in the Hadoop
+   * JobTracker.
+   *
+   * @param jarClass Class containing the main driver method for running the pipeline
+   * @param name Display name of the pipeline
+   * @param conf Configuration to be used within all MapReduce jobs run in the pipeline
+   */
+  public MRPipeline(Class<?> jarClass, String name, Configuration conf) {
+    this.jarClass = jarClass;
+    this.name = name;
+    this.outputTargets = Maps.newHashMap();
+    this.outputTargetsToMaterialize = Maps.newHashMap();
+    this.conf = conf;
+    this.tempDirectory = createTempDirectory(conf);
+    this.tempFileIndex = 0;
+    this.nextAnonymousStageId = 0;
+  }
+
+  @Override
+  public Configuration getConfiguration() {
+    return conf;
+  }
+
+  /**
+   * Replaces the configuration and creates a new temporary directory based on it. The previous
+   * temporary directory is not deleted by this method.
+   */
+  @Override
+  public void setConfiguration(Configuration conf) {
+    this.conf = conf;
+    this.tempDirectory = createTempDirectory(conf);
+  }
+
+  /**
+   * Builds an executor for the currently registered output targets.
+   * <p>
+   * Pending materializations whose collection has an output target are moved out of the pending
+   * map and passed to the planner together with the output targets.
+   *
+   * @return the executor produced by the planner
+   * @throws CrunchRuntimeException if planning fails with an {@link IOException}
+   */
+  public MRExecutor plan() {
+    Map<PCollectionImpl<?>, MaterializableIterable> toMaterialize = Maps.newHashMap();
+    for (PCollectionImpl<?> c : outputTargets.keySet()) {
+      if (outputTargetsToMaterialize.containsKey(c)) {
+        toMaterialize.put(c, outputTargetsToMaterialize.get(c));
+        outputTargetsToMaterialize.remove(c);
+      }
+    }
+    MSCRPlanner planner = new MSCRPlanner(this, outputTargets, toMaterialize);
+    try {
+      return planner.plan(jarClass, conf);
+    } catch (IOException e) {
+      throw new CrunchRuntimeException(e);
+    }
+  }
+
+  /**
+   * Runs the pipeline and blocks until it has finished.
+   *
+   * @return the execution's result, or {@code PipelineResult.EMPTY} if the calling thread was
+   *         interrupted while waiting (the thread's interrupt status is restored in that case)
+   */
+  @Override
+  public PipelineResult run() {
+    try {
+      PipelineExecution pipelineExecution = runAsync();
+      pipelineExecution.waitUntilDone();
+      return pipelineExecution.getResult();
+    } catch (InterruptedException e) {
+      // TODO: How to handle this without changing signature?
+      LOG.error("Exception running pipeline", e);
+      // Catching InterruptedException clears the interrupt flag; set it again so that callers
+      // further up the stack can still observe that the thread was interrupted.
+      Thread.currentThread().interrupt();
+      return PipelineResult.EMPTY;
+    }
+  }
+
+  /**
+   * Plans the pipeline, starts executing the plan, and then clears the registered output targets.
+   *
+   * @return a handle to the started execution
+   */
+  @Override
+  public PipelineExecution runAsync() {
+    PipelineExecution res = plan().execute();
+    outputTargets.clear();
+    return res;
+  }
+
+  /**
+   * Runs the pipeline if any output targets are still registered, then cleans up the temporary
+   * directory.
+   *
+   * @return the result of the final run, or {@code null} if there was nothing left to run
+   */
+  @Override
+  public PipelineResult done() {
+    PipelineResult res = null;
+    if (!outputTargets.isEmpty()) {
+      res = run();
+    }
+    cleanup();
+    return res;
+  }
+
+  public <S> PCollection<S> read(Source<S> source) {
+    return new InputCollection<S>(source, this);
+  }
+
+  public <K, V> PTable<K, V> read(TableSource<K, V> source) {
+    return new InputTable<K, V>(source, this);
+  }
+
+  public PCollection<String> readTextFile(String pathName) {
+    return read(From.textFile(pathName));
+  }
+
+  public void write(PCollection<?> pcollection, Target target) {
+    write(pcollection, target, Target.WriteMode.DEFAULT);
+  }
+
+  /**
+   * Registers {@code target} as an output of {@code pcollection} for the next run.
+   * <p>
+   * Grouped tables are ungrouped first, and unions are wrapped in an identity
+   * {@code parallelDo}, before being registered. The target is asked to handle any existing
+   * output according to {@code writeMode} before the duplicate-target check is made.
+   *
+   * @param pcollection the collection to write
+   * @param target where to write it
+   * @param writeMode how to treat existing output at the target
+   * @throws CrunchRuntimeException if the target is already registered for the current run and
+   *         {@code writeMode} is not {@code WriteMode.APPEND}
+   */
+  @SuppressWarnings("unchecked")
+  public void write(PCollection<?> pcollection, Target target,
+      Target.WriteMode writeMode) {
+    if (pcollection instanceof PGroupedTableImpl) {
+      pcollection = ((PGroupedTableImpl<?, ?>) pcollection).ungroup();
+    } else if (pcollection instanceof UnionCollection || pcollection instanceof UnionTable) {
+      pcollection = pcollection.parallelDo("UnionCollectionWrapper",
+          (MapFn) IdentityFn.<Object> getInstance(), pcollection.getPType());
+    }
+    target.handleExisting(writeMode, getConfiguration());
+    if (writeMode != WriteMode.APPEND && targetInCurrentRun(target)) {
+      throw new CrunchRuntimeException("Target " + target + " is already written in current run." +
+          " Use WriteMode.APPEND in order to write additional data to it.");
+    }
+    addOutput((PCollectionImpl<?>) pcollection, target);
+  }
+
+  /**
+   * @return {@code true} if any collection already has {@code target} registered as an output
+   */
+  private boolean targetInCurrentRun(Target target) {
+    for (Set<Target> targets : outputTargets.values()) {
+      if (targets.contains(target)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Adds {@code target} to the set of outputs registered for {@code impl}, creating the set on
+   * first use.
+   */
+  private void addOutput(PCollectionImpl<?> impl, Target target) {
+    if (!outputTargets.containsKey(impl)) {
+      outputTargets.put(impl, Sets.<Target> newHashSet());
+    }
+    outputTargets.get(impl).add(target);
+  }
+
+  /**
+   * Returns an {@code Iterable} over the contents of the collection, backed by the readable
+   * source chosen by {@link #getMaterializeSourceTarget(PCollection)}.
+   * <p>
+   * A new iterable is returned on every call, but only the first one created for a given
+   * collection is recorded as pending for the next plan.
+   */
+  @Override
+  public <T> Iterable<T> materialize(PCollection<T> pcollection) {
+
+    PCollectionImpl<T> pcollectionImpl = toPcollectionImpl(pcollection);
+    ReadableSource<T> readableSrc = getMaterializeSourceTarget(pcollectionImpl);
+
+    MaterializableIterable<T> c = new MaterializableIterable<T>(this, readableSrc);
+    if (!outputTargetsToMaterialize.containsKey(pcollectionImpl)) {
+      outputTargetsToMaterialize.put(pcollectionImpl, c);
+    }
+    return c;
+  }
+
+  /**
+   * Retrieve a ReadableSourceTarget that provides access to the contents of a {@link PCollection}.
+   * This is primarily intended as a helper method to {@link #materialize(PCollection)}. The
+   * underlying data of the ReadableSourceTarget may not be actually present until the pipeline is
+   * run.
+   *
+   * @param pcollection The collection for which the ReadableSourceTarget is to be retrieved
+   * @return The ReadableSourceTarget
+   * @throws IllegalArgumentException If no ReadableSourceTarget can be retrieved for the given
+   *           PCollection
+   */
+  public <T> ReadableSource<T> getMaterializeSourceTarget(PCollection<T> pcollection) {
+    PCollectionImpl<T> impl = toPcollectionImpl(pcollection);
+
+    // First, check to see if this is a readable input collection.
+    if (impl instanceof InputCollection) {
+      InputCollection<T> ic = (InputCollection<T>) impl;
+      if (ic.getSource() instanceof ReadableSource) {
+        return (ReadableSource) ic.getSource();
+      } else {
+        throw new IllegalArgumentException(
+            "Cannot materialize non-readable input collection: " + ic);
+      }
+    } else if (impl instanceof InputTable) {
+      InputTable it = (InputTable) impl;
+      if (it.getSource() instanceof ReadableSource) {
+        return (ReadableSource) it.getSource();
+      } else {
+        throw new IllegalArgumentException(
+            "Cannot materialize non-readable input table: " + it);
+      }
+    }
+
+    // Next, check to see if this pcollection has already been materialized.
+    SourceTarget<T> matTarget = impl.getMaterializedAt();
+    if (matTarget instanceof ReadableSourceTarget) {
+      return (ReadableSourceTarget<T>) matTarget;
+    }
+
+    // Check to see if we plan on materializing this collection on the
+    // next run. The map is keyed by the impl, so the lookup must use the
+    // same key for both the containment test and the read.
+    if (outputTargets.containsKey(impl)) {
+      for (Target target : outputTargets.get(impl)) {
+        if (target instanceof ReadableSourceTarget) {
+          return (ReadableSourceTarget<T>) target;
+        }
+      }
+    }
+
+    // If we're not planning on materializing it already, create a temporary
+    // output to hold the materialized records and return that.
+    SourceTarget<T> st = createIntermediateOutput(pcollection.getPType());
+    if (!(st instanceof ReadableSourceTarget)) {
+      throw new IllegalArgumentException("The PType for the given PCollection is not readable"
+          + " and cannot be materialized");
+    }
+    ReadableSourceTarget<T> srcTarget = (ReadableSourceTarget<T>) st;
+    addOutput(impl, srcTarget);
+    return srcTarget;
+  }
+
+  /**
+   * Safely cast a PCollection into a PCollectionImpl, including handling the case of
+   * UnionCollections.
+   *
+   * @param pcollection The PCollection to be cast/transformed
+   * @return The PCollectionImpl representation
+   */
+  private <T> PCollectionImpl<T> toPcollectionImpl(PCollection<T> pcollection) {
+    PCollectionImpl<T> pcollectionImpl = null;
+    if (pcollection instanceof UnionCollection || pcollection instanceof UnionTable) {
+      pcollectionImpl = (PCollectionImpl<T>) pcollection.parallelDo("UnionCollectionWrapper",
+          (MapFn) IdentityFn.<Object> getInstance(), pcollection.getPType());
+    } else {
+      pcollectionImpl = (PCollectionImpl<T>) pcollection;
+    }
+    return pcollectionImpl;
+  }
+
+  /**
+   * Creates a source/target of the PType's default file format at a new temporary path.
+   */
+  public <T> SourceTarget<T> createIntermediateOutput(PType<T> ptype) {
+    return ptype.getDefaultFileSource(createTempPath());
+  }
+
+  /**
+   * @return a new path named {@code p<index>} inside the temporary directory, where the index is
+   *         incremented on every call
+   */
+  public Path createTempPath() {
+    tempFileIndex++;
+    return new Path(tempDirectory, "p" + tempFileIndex);
+  }
+
+  /**
+   * Picks a temporary path for the given configuration and creates it as a directory.
+   *
+   * @throws RuntimeException if the directory cannot be created
+   */
+  private static Path createTempDirectory(Configuration conf) {
+    Path dir = createTemporaryPath(conf);
+    try {
+      dir.getFileSystem(conf).mkdirs(dir);
+    } catch (IOException e) {
+      throw new RuntimeException("Cannot create job output directory " + dir, e);
+    }
+    return dir;
+  }
+
+  /**
+   * @return {@code crunch-<n>} under the directory configured by
+   *         {@code RuntimeParameters.TMP_DIR} (default {@code /tmp}), where {@code n} is a
+   *         non-negative random int
+   */
+  private static Path createTemporaryPath(Configuration conf) {
+    String baseDir = conf.get(RuntimeParameters.TMP_DIR, "/tmp");
+    // Masking with Integer.MAX_VALUE clears the sign bit so the suffix is never negative.
+    return new Path(baseDir, "crunch-" + (RANDOM.nextInt() & Integer.MAX_VALUE));
+  }
+
+  /**
+   * Converts every element to its {@code toString()} form and writes the result as a text file
+   * at {@code pathName}.
+   */
+  @Override
+  public <T> void writeTextFile(PCollection<T> pcollection, String pathName) {
+    pcollection.parallelDo("asText", new StringifyFn<T>(), Writables.strings())
+        .write(To.textFile(pathName));
+  }
+
+  /** Maps each input to the result of its {@code toString()} method. */
+  private static class StringifyFn<T> extends MapFn<T, String> {
+    @Override
+    public String map(T input) {
+      return input.toString();
+    }
+  }
+
+  /**
+   * Recursively deletes the temporary directory if it exists. Skipped with a warning while
+   * output targets are still registered; I/O failures are logged and not propagated.
+   */
+  private void cleanup() {
+    if (!outputTargets.isEmpty()) {
+      LOG.warn("Not running cleanup while output targets remain");
+      return;
+    }
+    try {
+      FileSystem fs = tempDirectory.getFileSystem(conf);
+      if (fs.exists(tempDirectory)) {
+        fs.delete(tempDirectory, true);
+      }
+    } catch (IOException e) {
+      LOG.info("Exception during cleanup", e);
+    }
+  }
+
+  /**
+   * @return the current anonymous stage id; the counter is incremented after it is read
+   */
+  public int getNextAnonymousStageId() {
+    return nextAnonymousStageId++;
+  }
+
+  @Override
+  public void enableDebug() {
+    // Turn on Crunch runtime error catching.
+    getConfiguration().setBoolean(RuntimeParameters.DEBUG, true);
+  }
+
+  @Override
+  public String getName() {
+    return name;
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/DoCollectionImpl.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/DoCollectionImpl.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/DoCollectionImpl.java
new file mode 100644
index 0000000..7b8f2ea
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/DoCollectionImpl.java
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.collect;
+
+import java.util.List;
+import java.util.Set;
+
+import org.apache.crunch.DoFn;
+import org.apache.crunch.ParallelDoOptions;
+import org.apache.crunch.SourceTarget;
+import org.apache.crunch.impl.mr.plan.DoNode;
+import org.apache.crunch.types.PType;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableSet;
+
+public class DoCollectionImpl<S> extends PCollectionImpl<S> {
+
+ private final PCollectionImpl<Object> parent;
+ private final DoFn<Object, S> fn;
+ private final PType<S> ntype;
+
+ <T> DoCollectionImpl(String name, PCollectionImpl<T> parent, DoFn<T, S> fn, PType<S> ntype) {
+ this(name, parent, fn, ntype, ParallelDoOptions.builder().build());
+ }
+
+ <T> DoCollectionImpl(String name, PCollectionImpl<T> parent, DoFn<T, S> fn, PType<S> ntype,
+ ParallelDoOptions options) {
+ super(name, options);
+ this.parent = (PCollectionImpl<Object>) parent;
+ this.fn = (DoFn<Object, S>) fn;
+ this.ntype = ntype;
+ }
+
+ @Override
+ protected long getSizeInternal() {
+ return (long) (fn.scaleFactor() * parent.getSize());
+ }
+
+ @Override
+ public PType<S> getPType() {
+ return ntype;
+ }
+
+ @Override
+ protected void acceptInternal(PCollectionImpl.Visitor visitor) {
+ visitor.visitDoFnCollection(this);
+ }
+
+ @Override
+ public List<PCollectionImpl<?>> getParents() {
+ return ImmutableList.<PCollectionImpl<?>> of(parent);
+ }
+
+ @Override
+ public DoNode createDoNode() {
+ return DoNode.createFnNode(getName(), fn, ntype);
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/DoTableImpl.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/DoTableImpl.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/DoTableImpl.java
new file mode 100644
index 0000000..176643b
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/DoTableImpl.java
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.collect;
+
+import java.util.List;
+
+import org.apache.crunch.CombineFn;
+import org.apache.crunch.DoFn;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.ParallelDoOptions;
+import org.apache.crunch.impl.mr.plan.DoNode;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+
+import com.google.common.collect.ImmutableList;
+
+public class DoTableImpl<K, V> extends PTableBase<K, V> implements PTable<K, V> {
+
+ private final PCollectionImpl<?> parent;
+ private final DoFn<?, Pair<K, V>> fn;
+ private final PTableType<K, V> type;
+
+ <S> DoTableImpl(String name, PCollectionImpl<S> parent, DoFn<S, Pair<K, V>> fn, PTableType<K, V> ntype) {
+ this(name, parent, fn, ntype, ParallelDoOptions.builder().build());
+ }
+
+ <S> DoTableImpl(String name, PCollectionImpl<S> parent, DoFn<S, Pair<K, V>> fn, PTableType<K, V> ntype,
+ ParallelDoOptions options) {
+ super(name, options);
+ this.parent = parent;
+ this.fn = fn;
+ this.type = ntype;
+ }
+
+ @Override
+ protected long getSizeInternal() {
+ return (long) (fn.scaleFactor() * parent.getSize());
+ }
+
+ @Override
+ public PTableType<K, V> getPTableType() {
+ return type;
+ }
+
+ @Override
+ protected void acceptInternal(PCollectionImpl.Visitor visitor) {
+ visitor.visitDoTable(this);
+ }
+
+ @Override
+ public PType<Pair<K, V>> getPType() {
+ return type;
+ }
+
+ @Override
+ public List<PCollectionImpl<?>> getParents() {
+ return ImmutableList.<PCollectionImpl<?>> of(parent);
+ }
+
+ @Override
+ public DoNode createDoNode() {
+ return DoNode.createFnNode(getName(), fn, type);
+ }
+
+ public boolean hasCombineFn() {
+ return fn instanceof CombineFn;
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/InputCollection.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/InputCollection.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/InputCollection.java
new file mode 100644
index 0000000..ace5cc1
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/InputCollection.java
@@ -0,0 +1,85 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.collect;
+
+import java.util.List;
+
+import org.apache.commons.lang.builder.HashCodeBuilder;
+import org.apache.crunch.Source;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.impl.mr.plan.DoNode;
+import org.apache.crunch.types.PType;
+
+import com.google.common.collect.ImmutableList;
+
+public class InputCollection<S> extends PCollectionImpl<S> {
+
+ private final Source<S> source;
+
+ public InputCollection(Source<S> source, MRPipeline pipeline) {
+ super(source.toString());
+ this.source = source;
+ this.pipeline = pipeline;
+ }
+
+ @Override
+ public PType<S> getPType() {
+ return source.getType();
+ }
+
+ public Source<S> getSource() {
+ return source;
+ }
+
+ @Override
+ protected long getSizeInternal() {
+ long sz = source.getSize(pipeline.getConfiguration());
+ if (sz < 0) {
+ throw new IllegalStateException("Input source " + source + " does not exist!");
+ }
+ return sz;
+ }
+
+ @Override
+ protected void acceptInternal(PCollectionImpl.Visitor visitor) {
+ visitor.visitInputCollection(this);
+ }
+
+ @Override
+ public List<PCollectionImpl<?>> getParents() {
+ return ImmutableList.of();
+ }
+
+ @Override
+ public DoNode createDoNode() {
+ return DoNode.createInputNode(source);
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (obj == null || !(obj instanceof InputCollection)) {
+ return false;
+ }
+ return source.equals(((InputCollection) obj).source);
+ }
+
+ @Override
+ public int hashCode() {
+ return new HashCodeBuilder().append(source).toHashCode();
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/InputTable.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/InputTable.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/InputTable.java
new file mode 100644
index 0000000..71f11c5
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/collect/InputTable.java
@@ -0,0 +1,86 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.collect;
+
+import java.util.List;
+
+import org.apache.crunch.Pair;
+import org.apache.crunch.TableSource;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.impl.mr.plan.DoNode;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+
+import com.google.common.collect.ImmutableList;
+
+/**
+ * A table that is read directly from a {@code TableSource}. It has no parent
+ * collections.
+ * <p>
+ * Size estimation, visiting, hashing and equality are delegated to an internal
+ * {@code InputCollection} view over the same source.
+ *
+ * @param <K> the key type
+ * @param <V> the value type
+ */
+public class InputTable<K, V> extends PTableBase<K, V> {
+
+  private final TableSource<K, V> source;
+  // View of the same source as a plain collection of pairs; see the class comment.
+  private final InputCollection<Pair<K, V>> asCollection;
+
+  /**
+   * @param source the table source to read from; its {@code toString()} becomes the table name
+   * @param pipeline the pipeline this table belongs to
+   */
+  public InputTable(TableSource<K, V> source, MRPipeline pipeline) {
+    super(source.toString());
+    this.source = source;
+    this.pipeline = pipeline;
+    this.asCollection = new InputCollection<Pair<K, V>>(source, pipeline);
+  }
+
+  public TableSource<K, V> getSource() {
+    return source;
+  }
+
+  /**
+   * @return the size computed by the internal collection view
+   */
+  @Override
+  protected long getSizeInternal() {
+    return asCollection.getSizeInternal();
+  }
+
+  @Override
+  public PTableType<K, V> getPTableType() {
+    return source.getTableType();
+  }
+
+  @Override
+  public PType<Pair<K, V>> getPType() {
+    return source.getType();
+  }
+
+  /**
+   * @return an empty list: an input table has no parents
+   */
+  @Override
+  public List<PCollectionImpl<?>> getParents() {
+    return ImmutableList.of();
+  }
+
+  /**
+   * Presents the internal collection view, not this table, to the visitor.
+   */
+  @Override
+  protected void acceptInternal(PCollectionImpl.Visitor visitor) {
+    visitor.visitInputCollection(asCollection);
+  }
+
+  /**
+   * @return an input node built from the source
+   */
+  @Override
+  public DoNode createDoNode() {
+    return DoNode.createInputNode(source);
+  }
+
+  /**
+   * Hash code of the internal collection view, i.e. derived from the source.
+   */
+  @Override
+  public int hashCode() {
+    return asCollection.hashCode();
+  }
+
+  /**
+   * An input table is equal to itself and to any other input table whose
+   * collection view is equal to this one's.
+   * <p>
+   * Previously the comparison was delegated unconditionally to the collection
+   * view, which only accepts {@code InputCollection} arguments, so a table was
+   * not even equal to itself. Equality with an {@code InputCollection} over an
+   * equal source is kept for backward compatibility.
+   * NOTE(review): that last case is not symmetric, because
+   * {@code InputCollection.equals} rejects an {@code InputTable}; confirm
+   * whether any caller relies on it.
+   */
+  @Override
+  public boolean equals(Object other) {
+    if (this == other) {
+      return true;
+    }
+    if (other instanceof InputTable) {
+      return asCollection.equals(((InputTable<?, ?>) other).asCollection);
+    }
+    return asCollection.equals(other);
+  }
+}
[38/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/resources/org/apache/crunch/UnionITData/src1.txt
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/resources/org/apache/crunch/UnionITData/src1.txt b/crunch-core/src/it/resources/org/apache/crunch/UnionITData/src1.txt
new file mode 100644
index 0000000..a92974b
--- /dev/null
+++ b/crunch-core/src/it/resources/org/apache/crunch/UnionITData/src1.txt
@@ -0,0 +1,5 @@
+a1
+b2
+a1
+a1
+b2
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/resources/org/apache/crunch/UnionITData/src2.txt
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/resources/org/apache/crunch/UnionITData/src2.txt b/crunch-core/src/it/resources/org/apache/crunch/UnionITData/src2.txt
new file mode 100644
index 0000000..9363398
--- /dev/null
+++ b/crunch-core/src/it/resources/org/apache/crunch/UnionITData/src2.txt
@@ -0,0 +1,3 @@
+c3
+a1
+c3
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/resources/org/apache/crunch/fn/AggregatorsITData/ints.txt
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/resources/org/apache/crunch/fn/AggregatorsITData/ints.txt b/crunch-core/src/it/resources/org/apache/crunch/fn/AggregatorsITData/ints.txt
new file mode 100644
index 0000000..680cb09
--- /dev/null
+++ b/crunch-core/src/it/resources/org/apache/crunch/fn/AggregatorsITData/ints.txt
@@ -0,0 +1,5 @@
+a 1 2
+a 3 4
+b 2 3
+a 5 6
+b 9 10
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/resources/org/apache/crunch/lib/CogroupITData/src1.txt
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/resources/org/apache/crunch/lib/CogroupITData/src1.txt b/crunch-core/src/it/resources/org/apache/crunch/lib/CogroupITData/src1.txt
new file mode 100644
index 0000000..9f38eb9
--- /dev/null
+++ b/crunch-core/src/it/resources/org/apache/crunch/lib/CogroupITData/src1.txt
@@ -0,0 +1,4 @@
+a,1-1
+b,1-2
+c,1-3
+a,1-4
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/resources/org/apache/crunch/lib/CogroupITData/src2.txt
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/resources/org/apache/crunch/lib/CogroupITData/src2.txt b/crunch-core/src/it/resources/org/apache/crunch/lib/CogroupITData/src2.txt
new file mode 100644
index 0000000..ed9524e
--- /dev/null
+++ b/crunch-core/src/it/resources/org/apache/crunch/lib/CogroupITData/src2.txt
@@ -0,0 +1,4 @@
+b,2-1
+c,2-2
+c,2-3
+d,2-4
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/resources/secondary_sort_input.txt
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/resources/secondary_sort_input.txt b/crunch-core/src/it/resources/secondary_sort_input.txt
new file mode 100644
index 0000000..3c7be93
--- /dev/null
+++ b/crunch-core/src/it/resources/secondary_sort_input.txt
@@ -0,0 +1,7 @@
+one,1,1
+one,2,-3
+two,4,5
+two,2,6
+two,1,7,9
+three,0,-1
+one,-5,10
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/resources/set1.txt
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/resources/set1.txt b/crunch-core/src/it/resources/set1.txt
new file mode 100644
index 0000000..3b67f57
--- /dev/null
+++ b/crunch-core/src/it/resources/set1.txt
@@ -0,0 +1,4 @@
+b
+c
+a
+e
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/resources/set2.txt
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/resources/set2.txt b/crunch-core/src/it/resources/set2.txt
new file mode 100644
index 0000000..8169ab5
--- /dev/null
+++ b/crunch-core/src/it/resources/set2.txt
@@ -0,0 +1,3 @@
+c
+d
+a
\ No newline at end of file
[15/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/resources/urls.txt
----------------------------------------------------------------------
diff --git a/crunch/src/it/resources/urls.txt b/crunch/src/it/resources/urls.txt
deleted file mode 100644
index 827e711..0000000
--- a/crunch/src/it/resources/urls.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-www.A.com www.B.com
-www.A.com www.C.com
-www.A.com www.D.com
-www.A.com www.E.com
-www.B.com www.D.com
-www.B.com www.E.com
-www.C.com www.D.com
-www.D.com www.B.com
-www.E.com www.A.com
-www.F.com www.B.com
-www.F.com www.C.com
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/Aggregator.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/Aggregator.java b/crunch/src/main/java/org/apache/crunch/Aggregator.java
deleted file mode 100644
index 432452b..0000000
--- a/crunch/src/main/java/org/apache/crunch/Aggregator.java
+++ /dev/null
@@ -1,86 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import java.io.Serializable;
-
-import org.apache.hadoop.conf.Configuration;
-
-
-/**
- * Aggregate a sequence of values into a possibly smaller sequence of the same type.
- *
- * <p>In most cases, an Aggregator will turn multiple values into a single value,
- * like creating a sum, finding the minimum or maximum, etc. In some cases
- * (ie. finding the top K elements), an implementation may return more than
- * one value. The {@link org.apache.crunch.fn.Aggregators} utility class contains
- * factory methods for creating all kinds of pre-defined Aggregators that should
- * cover the most common cases.</p>
- *
- * <p>Aggregator implementations should usually be <em>associative</em> and
- * <em>commutative</em>, which makes their results deterministic. If your aggregation
- * function isn't commutative, you can still use secondary sort to that effect.</p>
- *
- * <p>The lifecycle of an {@link Aggregator} always begins with you instantiating
- * it and passing it to Crunch. When running your {@link Pipeline}, Crunch serializes
- * the instance and deserializes it wherever it is needed on the cluster. This is how
- * Crunch uses a deserialized instance:<p>
- *
- * <ol>
- * <li>call {@link #initialize(Configuration)} once</li>
- * <li>call {@link #reset()}
- * <li>call {@link #update(Object)} multiple times until all values of a sequence
- * have been aggregated</li>
- * <li>call {@link #results()} to retrieve the aggregated result</li>
- * <li>go back to step 2 until all sequences have been aggregated</li>
- * </ol>
- *
- * @param <T> The value types to aggregate
- */
-public interface Aggregator<T> extends Serializable {
-
- /**
- * Perform any setup of this instance that is required prior to processing
- * inputs.
- *
- * @param conf Hadoop configuration
- */
- void initialize(Configuration conf);
-
- /**
- * Clears the internal state of this Aggregator and prepares it for the
- * values associated with the next key.
- *
- * Depending on what you aggregate, this typically means setting a variable
- * to zero or clearing a list. Failing to do this will yield wrong results!
- */
- void reset();
-
- /**
- * Incorporate the given value into the aggregate state maintained by this
- * instance.
- *
- * @param value The value to add to the aggregated state
- */
- void update(T value);
-
- /**
- * Returns the current aggregated state of this instance.
- */
- Iterable<T> results();
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/CombineFn.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/CombineFn.java b/crunch/src/main/java/org/apache/crunch/CombineFn.java
deleted file mode 100644
index 71e8057..0000000
--- a/crunch/src/main/java/org/apache/crunch/CombineFn.java
+++ /dev/null
@@ -1,1211 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import java.io.Serializable;
-import java.math.BigInteger;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.SortedSet;
-
-import org.apache.crunch.fn.Aggregators;
-import org.apache.crunch.util.Tuples;
-import org.apache.hadoop.conf.Configuration;
-
-import com.google.common.base.Joiner;
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Sets;
-
-/**
- * A special {@link DoFn} implementation that converts an {@link Iterable} of
- * values into a single value. If a {@code CombineFn} instance is used on a
- * {@link PGroupedTable}, the function will be applied to the output of the map
- * stage before the data is passed to the reducer, which can improve the runtime
- * of certain classes of jobs.
- * <p>
- * Note that the incoming {@code Iterable} can only be used to create an
- * {@code Iterator} once. Calling {@link Iterable#iterator()} method a second
- * time will throw an {@link IllegalStateException}.
- */
-public abstract class CombineFn<S, T> extends DoFn<Pair<S, Iterable<T>>, Pair<S, T>> {
-
- /**
- * @deprecated Use {@link org.apache.crunch.Aggregator}
- */
- public static interface Aggregator<T> extends Serializable {
- /**
- * Perform any setup of this instance that is required prior to processing
- * inputs.
- */
- void initialize(Configuration configuration);
-
- /**
- * Clears the internal state of this Aggregator and prepares it for the
- * values associated with the next key.
- */
- void reset();
-
- /**
- * Incorporate the given value into the aggregate state maintained by this
- * instance.
- */
- void update(T value);
-
- /**
- * Returns the current aggregated state of this instance.
- */
- Iterable<T> results();
- }
-
- /**
- * Base class for aggregators that do not require any initialization.
- *
- * @deprecated Use {@link org.apache.crunch.fn.Aggregators.SimpleAggregator}
- */
- public static abstract class SimpleAggregator<T> implements Aggregator<T> {
- @Override
- public void initialize(Configuration conf) {
- // No-op
- }
- }
-
- /**
- * Interface for constructing new aggregator instances.
- *
- * @deprecated Use {@link PGroupedTable#combineValues(Aggregator)} which doesn't require a factory.
- */
- public static interface AggregatorFactory<T> {
- Aggregator<T> create();
- }
-
- /**
- * A {@code CombineFn} that delegates all of the actual work to an
- * {@code Aggregator} instance.
- *
- * @deprecated Use the {@link Aggregators#toCombineFn(org.apache.crunch.Aggregator)} adapter
- */
- public static class AggregatorCombineFn<K, V> extends CombineFn<K, V> {
-
- private final Aggregator<V> aggregator;
-
- public AggregatorCombineFn(Aggregator<V> aggregator) {
- this.aggregator = aggregator;
- }
-
- @Override
- public void initialize() {
- aggregator.initialize(getConfiguration());
- }
-
- @Override
- public void process(Pair<K, Iterable<V>> input, Emitter<Pair<K, V>> emitter) {
- aggregator.reset();
- for (V v : input.second()) {
- aggregator.update(v);
- }
- for (V v : aggregator.results()) {
- emitter.emit(Pair.of(input.first(), v));
- }
- }
- }
-
- private static abstract class TupleAggregator<T> implements Aggregator<T> {
- private final List<Aggregator<Object>> aggregators;
-
- public TupleAggregator(Aggregator<?>... aggregators) {
- this.aggregators = Lists.newArrayList();
- for (Aggregator<?> a : aggregators) {
- this.aggregators.add((Aggregator<Object>) a);
- }
- }
-
- @Override
- public void initialize(Configuration configuration) {
- for (Aggregator<?> a : aggregators) {
- a.initialize(configuration);
- }
- }
-
- @Override
- public void reset() {
- for (Aggregator<?> a : aggregators) {
- a.reset();
- }
- }
-
- protected void updateTuple(Tuple t) {
- for (int i = 0; i < aggregators.size(); i++) {
- aggregators.get(i).update(t.get(i));
- }
- }
-
- protected Iterable<Object> results(int index) {
- return aggregators.get(index).results();
- }
- }
-
- /**
- * @deprecated Use {@link Aggregators#pairAggregator(Aggregator, Aggregator)}
- */
- public static class PairAggregator<V1, V2> extends TupleAggregator<Pair<V1, V2>> {
-
- public PairAggregator(Aggregator<V1> a1, Aggregator<V2> a2) {
- super(a1, a2);
- }
-
- @Override
- public void update(Pair<V1, V2> value) {
- updateTuple(value);
- }
-
- @Override
- public Iterable<Pair<V1, V2>> results() {
- return new Tuples.PairIterable<V1, V2>((Iterable<V1>) results(0), (Iterable<V2>) results(1));
- }
- }
-
- /**
- * @deprecated Use {@link Aggregators#tripAggregator(Aggregator, Aggregator, Aggregator)}
- */
- public static class TripAggregator<A, B, C> extends TupleAggregator<Tuple3<A, B, C>> {
-
- public TripAggregator(Aggregator<A> a1, Aggregator<B> a2, Aggregator<C> a3) {
- super(a1, a2, a3);
- }
-
- @Override
- public void update(Tuple3<A, B, C> value) {
- updateTuple(value);
- }
-
- @Override
- public Iterable<Tuple3<A, B, C>> results() {
- return new Tuples.TripIterable<A, B, C>((Iterable<A>) results(0), (Iterable<B>) results(1),
- (Iterable<C>) results(2));
- }
- }
-
- /**
- * @deprecated Use {@link Aggregators#quadAggregator(Aggregator, Aggregator, Aggregator, Aggregator)}
- */
- public static class QuadAggregator<A, B, C, D> extends TupleAggregator<Tuple4<A, B, C, D>> {
-
- public QuadAggregator(Aggregator<A> a1, Aggregator<B> a2, Aggregator<C> a3, Aggregator<D> a4) {
- super(a1, a2, a3, a4);
- }
-
- @Override
- public void update(Tuple4<A, B, C, D> value) {
- updateTuple(value);
- }
-
- @Override
- public Iterable<Tuple4<A, B, C, D>> results() {
- return new Tuples.QuadIterable<A, B, C, D>((Iterable<A>) results(0), (Iterable<B>) results(1),
- (Iterable<C>) results(2), (Iterable<D>) results(3));
- }
- }
-
- /**
- * @deprecated Use {@link Aggregators#tupleAggregator(Aggregator...)}
- */
- public static class TupleNAggregator extends TupleAggregator<TupleN> {
-
- private final int size;
-
- public TupleNAggregator(Aggregator<?>... aggregators) {
- super(aggregators);
- size = aggregators.length;
- }
-
- @Override
- public void update(TupleN value) {
- updateTuple(value);
- }
-
- @Override
- public Iterable<TupleN> results() {
- Iterable<?>[] iterables = new Iterable[size];
- for (int i = 0; i < size; i++) {
- iterables[i] = results(i);
- }
- return new Tuples.TupleNIterable(iterables);
- }
-
- }
-
- /**
- * @deprecated Use {@link Aggregators#toCombineFn(Aggregator)}
- */
- public static final <K, V> CombineFn<K, V> aggregator(Aggregator<V> aggregator) {
- return new AggregatorCombineFn<K, V>(aggregator);
- }
-
- /**
- * @deprecated Use {@link PGroupedTable#combineValues(Aggregator)} which doesn't require a factory.
- */
- public static final <K, V> CombineFn<K, V> aggregatorFactory(AggregatorFactory<V> aggregator) {
- return new AggregatorCombineFn<K, V>(aggregator.create());
- }
-
- /**
- * @deprecated Use {@link Aggregators#pairAggregator(Aggregator, Aggregator)}
- */
- public static final <K, V1, V2> CombineFn<K, Pair<V1, V2>> pairAggregator(AggregatorFactory<V1> a1,
- AggregatorFactory<V2> a2) {
- return aggregator(new PairAggregator<V1, V2>(a1.create(), a2.create()));
- }
-
- /**
- * @deprecated Use {@link Aggregators#tripAggregator(Aggregator, Aggregator, Aggregator)}
- */
- public static final <K, A, B, C> CombineFn<K, Tuple3<A, B, C>> tripAggregator(AggregatorFactory<A> a1,
- AggregatorFactory<B> a2, AggregatorFactory<C> a3) {
- return aggregator(new TripAggregator<A, B, C>(a1.create(), a2.create(), a3.create()));
- }
-
- /**
- * @deprecated Use {@link Aggregators#quadAggregator(Aggregator, Aggregator, Aggregator, Aggregator)}
- */
- public static final <K, A, B, C, D> CombineFn<K, Tuple4<A, B, C, D>> quadAggregator(AggregatorFactory<A> a1,
- AggregatorFactory<B> a2, AggregatorFactory<C> a3, AggregatorFactory<D> a4) {
- return aggregator(new QuadAggregator<A, B, C, D>(a1.create(), a2.create(), a3.create(), a4.create()));
- }
-
- /**
- * @deprecated Use {@link Aggregators#tupleAggregator(Aggregator...)}
- */
- public static final <K> CombineFn<K, TupleN> tupleAggregator(AggregatorFactory<?>... factories) {
- Aggregator<?>[] aggs = new Aggregator[factories.length];
- for (int i = 0; i < aggs.length; i++) {
- aggs[i] = factories[i].create();
- }
- return aggregator(new TupleNAggregator(aggs));
- }
-
- /**
- * @deprecated Use {@link Aggregators#SUM_LONGS()}
- */
- public static final <K> CombineFn<K, Long> SUM_LONGS() {
- return aggregatorFactory(SUM_LONGS);
- }
-
- /**
- * @deprecated Use {@link Aggregators#SUM_INTS()}
- */
- public static final <K> CombineFn<K, Integer> SUM_INTS() {
- return aggregatorFactory(SUM_INTS);
- }
-
- /**
- * @deprecated Use {@link Aggregators#SUM_FLOATS()}
- */
- public static final <K> CombineFn<K, Float> SUM_FLOATS() {
- return aggregatorFactory(SUM_FLOATS);
- }
-
- /**
- * @deprecated Use {@link Aggregators#SUM_DOUBLES()}
- */
- public static final <K> CombineFn<K, Double> SUM_DOUBLES() {
- return aggregatorFactory(SUM_DOUBLES);
- }
-
- /**
- * @deprecated Use {@link Aggregators#SUM_BIGINTS()}
- */
- public static final <K> CombineFn<K, BigInteger> SUM_BIGINTS() {
- return aggregatorFactory(SUM_BIGINTS);
- }
-
- /**
- * @deprecated Use {@link Aggregators#MAX_LONGS()}
- */
- public static final <K> CombineFn<K, Long> MAX_LONGS() {
- return aggregatorFactory(MAX_LONGS);
- }
-
- /**
- * @deprecated Use {@link Aggregators#MAX_LONGS(int)}
- */
- public static final <K> CombineFn<K, Long> MAX_LONGS(int n) {
- return aggregator(new MaxNAggregator<Long>(n));
- }
-
- /**
- * @deprecated Use {@link Aggregators#MAX_INTS()}
- */
- public static final <K> CombineFn<K, Integer> MAX_INTS() {
- return aggregatorFactory(MAX_INTS);
- }
-
- /**
- * @deprecated Use {@link Aggregators#MAX_INTS(int)}
- */
- public static final <K> CombineFn<K, Integer> MAX_INTS(int n) {
- return aggregator(new MaxNAggregator<Integer>(n));
- }
-
- /**
- * @deprecated Use {@link Aggregators#MAX_FLOATS()}
- */
- public static final <K> CombineFn<K, Float> MAX_FLOATS() {
- return aggregatorFactory(MAX_FLOATS);
- }
-
- /**
- * @deprecated Use {@link Aggregators#MAX_FLOATS(int)}
- */
- public static final <K> CombineFn<K, Float> MAX_FLOATS(int n) {
- return aggregator(new MaxNAggregator<Float>(n));
- }
-
- /**
- * @deprecated Use {@link Aggregators#MAX_DOUBLES()}
- */
- public static final <K> CombineFn<K, Double> MAX_DOUBLES() {
- return aggregatorFactory(MAX_DOUBLES);
- }
-
- /**
- * @deprecated Use {@link Aggregators#MAX_DOUBLES(int)}
- */
- public static final <K> CombineFn<K, Double> MAX_DOUBLES(int n) {
- return aggregator(new MaxNAggregator<Double>(n));
- }
-
- /**
- * @deprecated Use {@link Aggregators#MAX_BIGINTS()}
- */
- public static final <K> CombineFn<K, BigInteger> MAX_BIGINTS() {
- return aggregatorFactory(MAX_BIGINTS);
- }
-
- /**
- * @deprecated Use {@link Aggregators#MAX_BIGINTS(int)}
- */
- public static final <K> CombineFn<K, BigInteger> MAX_BIGINTS(int n) {
- return aggregator(new MaxNAggregator<BigInteger>(n));
- }
-
- /**
- * @deprecated Use {@link Aggregators#MIN_LONGS()}
- */
- public static final <K> CombineFn<K, Long> MIN_LONGS() {
- return aggregatorFactory(MIN_LONGS);
- }
-
- /**
- * @deprecated Use {@link Aggregators#MIN_LONGS(int)}
- */
- public static final <K> CombineFn<K, Long> MIN_LONGS(int n) {
- return aggregator(new MinNAggregator<Long>(n));
- }
-
- /**
- * @deprecated Use {@link Aggregators#MIN_INTS()}
- */
- public static final <K> CombineFn<K, Integer> MIN_INTS() {
- return aggregatorFactory(MIN_INTS);
- }
-
- /**
- * @deprecated Use {@link Aggregators#MIN_INTS(int)}
- */
- public static final <K> CombineFn<K, Integer> MIN_INTS(int n) {
- return aggregator(new MinNAggregator<Integer>(n));
- }
-
- /**
- * @deprecated Use {@link Aggregators#MIN_FLOATS()}
- */
- public static final <K> CombineFn<K, Float> MIN_FLOATS() {
- return aggregatorFactory(MIN_FLOATS);
- }
-
- /**
- * @deprecated Use {@link Aggregators#MIN_FLOATS(int)}
- */
- public static final <K> CombineFn<K, Float> MIN_FLOATS(int n) {
- return aggregator(new MinNAggregator<Float>(n));
- }
-
- /**
- * @deprecated Use {@link Aggregators#MIN_DOUBLES()}
- */
- public static final <K> CombineFn<K, Double> MIN_DOUBLES() {
- return aggregatorFactory(MIN_DOUBLES);
- }
-
- /**
- * @deprecated Use {@link Aggregators#MIN_DOUBLES(int)}
- */
- public static final <K> CombineFn<K, Double> MIN_DOUBLES(int n) {
- return aggregator(new MinNAggregator<Double>(n));
- }
-
- /**
- * @deprecated Use {@link Aggregators#MIN_BIGINTS()}
- */
- public static final <K> CombineFn<K, BigInteger> MIN_BIGINTS() {
- return aggregatorFactory(MIN_BIGINTS);
- }
-
- /**
- * @deprecated Use {@link Aggregators#MIN_BIGINTS(int)}
- */
- public static final <K> CombineFn<K, BigInteger> MIN_BIGINTS(int n) {
- return aggregator(new MinNAggregator<BigInteger>(n));
- }
-
- /**
- * @deprecated Use {@link Aggregators#FIRST_N(int)}
- */
- public static final <K, V> CombineFn<K, V> FIRST_N(int n) {
- return aggregator(new FirstNAggregator<V>(n));
- }
-
- /**
- * @deprecated Use {@link Aggregators#LAST_N(int)}
- */
- public static final <K, V> CombineFn<K, V> LAST_N(int n) {
- return aggregator(new LastNAggregator<V>(n));
- }
-
- /**
- * Used to concatenate strings, with a separator between each strings. There
- * is no limits of length for the concatenated string.
- *
- * @param separator
- * the separator which will be appended between each string
- * @param skipNull
- * define if we should skip null values. Throw
- * NullPointerException if set to false and there is a null
- * value.
- * @return
- *
- * @deprecated Use {@link Aggregators#STRING_CONCAT(String, boolean)}
- */
- public static final <K> CombineFn<K, String> STRING_CONCAT(final String separator, final boolean skipNull) {
- return aggregator(new StringConcatAggregator(separator, skipNull));
- }
-
- /**
- * Used to concatenate strings, with a separator between each strings. You
- * can specify the maximum length of the output string and of the input
- * strings, if they are > 0. If a value is <= 0, there is no limits.
- *
- * Any too large string (or any string which would made the output too
- * large) will be silently discarded.
- *
- * @param separator
- * the separator which will be appended between each string
- * @param skipNull
- * define if we should skip null values. Throw
- * NullPointerException if set to false and there is a null
- * value.
- * @param maxOutputLength
- * the maximum length of the output string. If it's set <= 0,
- * there is no limits. The number of characters of the output
- * string will be < maxOutputLength.
- * @param maxInputLength
- * the maximum length of the input strings. If it's set <= 0,
- * there is no limits. The number of characters of the int string
- * will be < maxInputLength to be concatenated.
- * @return
- *
- * @deprecated Use {@link Aggregators#STRING_CONCAT(String, boolean, long, long)}
- */
- public static final <K> CombineFn<K, String> STRING_CONCAT(final String separator, final boolean skipNull, final long maxOutputLength, final long maxInputLength) {
- return aggregator(new StringConcatAggregator(separator, skipNull, maxOutputLength, maxInputLength));
- }
-
- /**
- * @deprecated Use {@link Aggregators#SUM_LONGS()}
- */
- public static class SumLongs extends SimpleAggregator<Long> {
- private long sum = 0;
-
- @Override
- public void reset() {
- sum = 0;
- }
-
- @Override
- public void update(Long next) {
- sum += next;
- }
-
- @Override
- public Iterable<Long> results() {
- return ImmutableList.of(sum);
- }
- }
-
- /**
- * @deprecated Use {@link Aggregators#SUM_LONGS()}
- */
- public static AggregatorFactory<Long> SUM_LONGS = new AggregatorFactory<Long>() {
- public Aggregator<Long> create() {
- return new SumLongs();
- }
- };
-
- /**
- * @deprecated Use {@link Aggregators#SUM_INTS()}
- */
- public static class SumInts extends SimpleAggregator<Integer> {
- private int sum = 0;
-
- @Override
- public void reset() {
- sum = 0;
- }
-
- @Override
- public void update(Integer next) {
- sum += next;
- }
-
- @Override
- public Iterable<Integer> results() {
- return ImmutableList.of(sum);
- }
- }
-
- /**
- * @deprecated Use {@link Aggregators#SUM_INTS()}
- */
- public static AggregatorFactory<Integer> SUM_INTS = new AggregatorFactory<Integer>() {
- public Aggregator<Integer> create() {
- return new SumInts();
- }
- };
-
- /**
- * @deprecated Use {@link Aggregators#SUM_FLOATS()}
- */
- public static class SumFloats extends SimpleAggregator<Float> {
- private float sum = 0;
-
- @Override
- public void reset() {
- sum = 0f;
- }
-
- @Override
- public void update(Float next) {
- sum += next;
- }
-
- @Override
- public Iterable<Float> results() {
- return ImmutableList.of(sum);
- }
- }
-
- /**
- * @deprecated Use {@link Aggregators#SUM_FLOATS()}
- */
- public static AggregatorFactory<Float> SUM_FLOATS = new AggregatorFactory<Float>() {
- public Aggregator<Float> create() {
- return new SumFloats();
- }
- };
-
- /**
- * @deprecated Use {@link Aggregators#SUM_DOUBLES()}
- */
- public static class SumDoubles extends SimpleAggregator<Double> {
- private double sum = 0;
-
- @Override
- public void reset() {
- sum = 0f;
- }
-
- @Override
- public void update(Double next) {
- sum += next;
- }
-
- @Override
- public Iterable<Double> results() {
- return ImmutableList.of(sum);
- }
- }
-
- /**
- * @deprecated Use {@link Aggregators#SUM_DOUBLES()}
- */
- public static AggregatorFactory<Double> SUM_DOUBLES = new AggregatorFactory<Double>() {
- public Aggregator<Double> create() {
- return new SumDoubles();
- }
- };
-
- /**
- * @deprecated Use {@link Aggregators#SUM_BIGINTS()}
- */
- public static class SumBigInts extends SimpleAggregator<BigInteger> {
- private BigInteger sum = BigInteger.ZERO;
-
- @Override
- public void reset() {
- sum = BigInteger.ZERO;
- }
-
- @Override
- public void update(BigInteger next) {
- sum = sum.add(next);
- }
-
- @Override
- public Iterable<BigInteger> results() {
- return ImmutableList.of(sum);
- }
- }
-
- /**
- * @deprecated Use {@link Aggregators#SUM_BIGINTS()}
- */
- public static AggregatorFactory<BigInteger> SUM_BIGINTS = new AggregatorFactory<BigInteger>() {
- public Aggregator<BigInteger> create() {
- return new SumBigInts();
- }
- };
-
- /**
- * @deprecated Use {@link Aggregators#MAX_LONGS()}
- */
- public static class MaxLongs extends SimpleAggregator<Long> {
- private Long max = null;
-
- @Override
- public void reset() {
- max = null;
- }
-
- @Override
- public void update(Long next) {
- if (max == null || max < next) {
- max = next;
- }
- }
-
- @Override
- public Iterable<Long> results() {
- return ImmutableList.of(max);
- }
- }
-
- /**
- * @deprecated Use {@link Aggregators#MAX_LONGS()}
- */
- public static AggregatorFactory<Long> MAX_LONGS = new AggregatorFactory<Long>() {
- public Aggregator<Long> create() {
- return new MaxLongs();
- }
- };
-
- /**
- * @deprecated Use {@link Aggregators#MAX_INTS()}
- */
- public static class MaxInts extends SimpleAggregator<Integer> {
- private Integer max = null;
-
- @Override
- public void reset() {
- max = null;
- }
-
- @Override
- public void update(Integer next) {
- if (max == null || max < next) {
- max = next;
- }
- }
-
- @Override
- public Iterable<Integer> results() {
- return ImmutableList.of(max);
- }
- }
-
- /**
- * @deprecated Use {@link Aggregators#MAX_INTS()}
- */
- public static AggregatorFactory<Integer> MAX_INTS = new AggregatorFactory<Integer>() {
- public Aggregator<Integer> create() {
- return new MaxInts();
- }
- };
-
- /**
- * @deprecated Use {@link Aggregators#MAX_FLOATS()}
- */
- public static class MaxFloats extends SimpleAggregator<Float> {
- private Float max = null;
-
- @Override
- public void reset() {
- max = null;
- }
-
- @Override
- public void update(Float next) {
- if (max == null || max < next) {
- max = next;
- }
- }
-
- @Override
- public Iterable<Float> results() {
- return ImmutableList.of(max);
- }
- }
-
- /**
- * @deprecated Use {@link Aggregators#MAX_FLOATS()}
- */
- public static AggregatorFactory<Float> MAX_FLOATS = new AggregatorFactory<Float>() {
- public Aggregator<Float> create() {
- return new MaxFloats();
- }
- };
-
- /**
- * @deprecated Use {@link Aggregators#MAX_DOUBLES()}
- */
- public static class MaxDoubles extends SimpleAggregator<Double> {
- private Double max = null;
-
- @Override
- public void reset() {
- max = null;
- }
-
- @Override
- public void update(Double next) {
- if (max == null || max < next) {
- max = next;
- }
- }
-
- @Override
- public Iterable<Double> results() {
- return ImmutableList.of(max);
- }
- }
-
- /**
- * @deprecated Use {@link Aggregators#MAX_DOUBLES()}
- */
- public static AggregatorFactory<Double> MAX_DOUBLES = new AggregatorFactory<Double>() {
- public Aggregator<Double> create() {
- return new MaxDoubles();
- }
- };
-
- /**
- * @deprecated Use {@link Aggregators#MAX_BIGINTS()}
- */
- public static class MaxBigInts extends SimpleAggregator<BigInteger> {
- private BigInteger max = null;
-
- @Override
- public void reset() {
- max = null;
- }
-
- @Override
- public void update(BigInteger next) {
- if (max == null || max.compareTo(next) < 0) {
- max = next;
- }
- }
-
- @Override
- public Iterable<BigInteger> results() {
- return ImmutableList.of(max);
- }
- }
-
- /**
- * @deprecated Use {@link Aggregators#MAX_BIGINTS()}
- */
- public static AggregatorFactory<BigInteger> MAX_BIGINTS = new AggregatorFactory<BigInteger>() {
- public Aggregator<BigInteger> create() {
- return new MaxBigInts();
- }
- };
-
- /**
- * @deprecated Use {@link Aggregators#MIN_LONGS()}
- */
- public static class MinLongs extends SimpleAggregator<Long> {
- private Long min = null;
-
- @Override
- public void reset() {
- min = null;
- }
-
- @Override
- public void update(Long next) {
- if (min == null || min > next) {
- min = next;
- }
- }
-
- @Override
- public Iterable<Long> results() {
- return ImmutableList.of(min);
- }
- }
-
- /**
- * @deprecated Use {@link Aggregators#MIN_LONGS()}
- */
- public static AggregatorFactory<Long> MIN_LONGS = new AggregatorFactory<Long>() {
- public Aggregator<Long> create() {
- return new MinLongs();
- }
- };
-
- /**
- * @deprecated Use {@link Aggregators#MIN_INTS()}
- */
- public static class MinInts extends SimpleAggregator<Integer> {
- private Integer min = null;
-
- @Override
- public void reset() {
- min = null;
- }
-
- @Override
- public void update(Integer next) {
- if (min == null || min > next) {
- min = next;
- }
- }
-
- @Override
- public Iterable<Integer> results() {
- return ImmutableList.of(min);
- }
- }
-
- /**
- * @deprecated Use {@link Aggregators#MIN_INTS()}
- */
- public static AggregatorFactory<Integer> MIN_INTS = new AggregatorFactory<Integer>() {
- public Aggregator<Integer> create() {
- return new MinInts();
- }
- };
-
- /**
- * @deprecated Use {@link Aggregators#MIN_FLOATS()}
- */
- public static class MinFloats extends SimpleAggregator<Float> {
- private Float min = null;
-
- @Override
- public void reset() {
- min = null;
- }
-
- @Override
- public void update(Float next) {
- if (min == null || min > next) {
- min = next;
- }
- }
-
- @Override
- public Iterable<Float> results() {
- return ImmutableList.of(min);
- }
- }
-
- /**
- * @deprecated Use {@link Aggregators#MIN_FLOATS()}
- */
- public static AggregatorFactory<Float> MIN_FLOATS = new AggregatorFactory<Float>() {
- public Aggregator<Float> create() {
- return new MinFloats();
- }
- };
-
- /**
- * @deprecated Use {@link Aggregators#MIN_DOUBLES()}
- */
- public static class MinDoubles extends SimpleAggregator<Double> {
- private Double min = null;
-
- @Override
- public void reset() {
- min = null;
- }
-
- @Override
- public void update(Double next) {
- if (min == null || min > next) {
- min = next;
- }
- }
-
- @Override
- public Iterable<Double> results() {
- return ImmutableList.of(min);
- }
- }
-
- /**
- * @deprecated Use {@link Aggregators#MIN_DOUBLES()}
- */
- public static AggregatorFactory<Double> MIN_DOUBLES = new AggregatorFactory<Double>() {
- public Aggregator<Double> create() {
- return new MinDoubles();
- }
- };
-
- /**
- * @deprecated Use {@link Aggregators#MIN_BIGINTS()}
- */
- public static class MinBigInts extends SimpleAggregator<BigInteger> {
- private BigInteger min = null;
-
- @Override
- public void reset() {
- min = null;
- }
-
- @Override
- public void update(BigInteger next) {
- if (min == null || min.compareTo(next) > 0) {
- min = next;
- }
- }
-
- @Override
- public Iterable<BigInteger> results() {
- return ImmutableList.of(min);
- }
- }
-
- /**
- * @deprecated Use {@link Aggregators#MIN_BIGINTS()}
- */
- public static AggregatorFactory<BigInteger> MIN_BIGINTS = new AggregatorFactory<BigInteger>() {
- public Aggregator<BigInteger> create() {
- return new MinBigInts();
- }
- };
-
- /**
- * @deprecated Use {@link Aggregators#MAX_N(int, Class)}
- */
- public static class MaxNAggregator<V extends Comparable<V>> extends SimpleAggregator<V> {
- private final int arity;
- private transient SortedSet<V> elements;
-
- public MaxNAggregator(int arity) {
- this.arity = arity;
- }
-
- @Override
- public void reset() {
- if (elements == null) {
- elements = Sets.newTreeSet();
- } else {
- elements.clear();
- }
- }
-
- @Override
- public void update(V value) {
- if (elements.size() < arity) {
- elements.add(value);
- } else if (value.compareTo(elements.first()) > 0) {
- elements.remove(elements.first());
- elements.add(value);
- }
- }
-
- @Override
- public Iterable<V> results() {
- return ImmutableList.copyOf(elements);
- }
- }
-
- /**
- * @deprecated Use {@link Aggregators#MIN_N(int, Class)}
- */
- public static class MinNAggregator<V extends Comparable<V>> extends SimpleAggregator<V> {
- private final int arity;
- private transient SortedSet<V> elements;
-
- public MinNAggregator(int arity) {
- this.arity = arity;
- }
-
- @Override
- public void reset() {
- if (elements == null) {
- elements = Sets.newTreeSet();
- } else {
- elements.clear();
- }
- }
-
- @Override
- public void update(V value) {
- if (elements.size() < arity) {
- elements.add(value);
- } else if (value.compareTo(elements.last()) < 0) {
- elements.remove(elements.last());
- elements.add(value);
- }
- }
-
- @Override
- public Iterable<V> results() {
- return ImmutableList.copyOf(elements);
- }
- }
-
- /**
- * @deprecated Use {@link Aggregators#FIRST_N(int)}
- */
- public static class FirstNAggregator<V> extends SimpleAggregator<V> {
- private final int arity;
- private final List<V> elements;
-
- public FirstNAggregator(int arity) {
- this.arity = arity;
- this.elements = Lists.newArrayList();
- }
-
- @Override
- public void reset() {
- elements.clear();
- }
-
- @Override
- public void update(V value) {
- if (elements.size() < arity) {
- elements.add(value);
- }
- }
-
- @Override
- public Iterable<V> results() {
- return ImmutableList.copyOf(elements);
- }
- }
-
- /**
- * @deprecated Use {@link Aggregators#LAST_N(int)}
- */
- public static class LastNAggregator<V> extends SimpleAggregator<V> {
- private final int arity;
- private final LinkedList<V> elements;
-
- public LastNAggregator(int arity) {
- this.arity = arity;
- this.elements = Lists.newLinkedList();
- }
-
- @Override
- public void reset() {
- elements.clear();
- }
-
- @Override
- public void update(V value) {
- elements.add(value);
- if (elements.size() == arity + 1) {
- elements.removeFirst();
- }
- }
-
- @Override
- public Iterable<V> results() {
- return ImmutableList.copyOf(elements);
- }
- }
-
- /**
- * @deprecated Use {@link Aggregators#STRING_CONCAT(String, boolean, long, long)}
- */
- public static class StringConcatAggregator extends SimpleAggregator<String> {
- private final String separator;
- private final boolean skipNulls;
- private final long maxOutputLength;
- private final long maxInputLength;
- private long currentLength;
- private final LinkedList<String> list = new LinkedList<String>();
-
- private transient Joiner joiner;
-
- public StringConcatAggregator(final String separator, final boolean skipNulls) {
- this.separator = separator;
- this.skipNulls = skipNulls;
- this.maxInputLength = 0;
- this.maxOutputLength = 0;
- }
-
- public StringConcatAggregator(final String separator, final boolean skipNull, final long maxOutputLength, final long maxInputLength) {
- this.separator = separator;
- this.skipNulls = skipNull;
- this.maxOutputLength = maxOutputLength;
- this.maxInputLength = maxInputLength;
- this.currentLength = -separator.length();
- }
-
- @Override
- public void reset() {
- if (joiner == null) {
- joiner = skipNulls ? Joiner.on(separator).skipNulls() : Joiner.on(separator);
- }
- currentLength = -separator.length();
- list.clear();
- }
-
- @Override
- public void update(final String next) {
- long length = (next == null) ? 0 : next.length() + separator.length();
- if (maxOutputLength > 0 && currentLength + length > maxOutputLength || maxInputLength > 0 && next.length() > maxInputLength) {
- return;
- }
- if (maxOutputLength > 0) {
- currentLength += length;
- }
- list.add(next);
- }
-
- @Override
- public Iterable<String> results() {
- return ImmutableList.of(joiner.join(list));
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/CrunchRuntimeException.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/CrunchRuntimeException.java b/crunch/src/main/java/org/apache/crunch/CrunchRuntimeException.java
deleted file mode 100644
index 044f600..0000000
--- a/crunch/src/main/java/org/apache/crunch/CrunchRuntimeException.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-/**
- * An unchecked exception that additionally remembers whether it has already
- * been written to the debug logs, so the Crunch execution engine can track its
- * reporting status. Client code may throw it from its own {@code DoFn}
- * implementations.
- */
-public class CrunchRuntimeException extends RuntimeException {
-
-  /** Becomes true once {@link #markLogged()} has been called. */
-  private boolean logged;
-
-  /** Creates an exception with the given detail message. */
-  public CrunchRuntimeException(String msg) {
-    super(msg);
-  }
-
-  /** Creates an exception that wraps the given cause. */
-  public CrunchRuntimeException(Exception e) {
-    super(e);
-  }
-
-  /** Creates an exception with the given detail message and cause. */
-  public CrunchRuntimeException(String msg, Exception e) {
-    super(msg, e);
-  }
-
-  /**
-   * Marks this exception as having been written to the debug logs.
-   */
-  public void markLogged() {
-    logged = true;
-  }
-
-  /**
-   * Tells whether this exception has been written to the debug logs.
-   *
-   * @return true once {@link #markLogged()} has been called
-   */
-  public boolean wasLogged() {
-    return this.logged;
-  }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/DoFn.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/DoFn.java b/crunch/src/main/java/org/apache/crunch/DoFn.java
deleted file mode 100644
index 2c6389a..0000000
--- a/crunch/src/main/java/org/apache/crunch/DoFn.java
+++ /dev/null
@@ -1,162 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import java.io.Serializable;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.mapreduce.Counter;
-import org.apache.hadoop.mapreduce.TaskAttemptID;
-import org.apache.hadoop.mapreduce.TaskInputOutputContext;
-
-/**
- * Root type for every data processing function in Crunch.
- *
- * <p>
- * A {@code DoFn} is {@link Serializable}, so every non-transient field of a
- * subclass must be serializable as well. Helpers that cannot be serialized may
- * be declared {@code transient} and created in {@link #initialize()}.
- *
- * @param <S> type of the records passed to {@link #process(Object, Emitter)}
- * @param <T> type of the values written to the {@link Emitter}
- */
-public abstract class DoFn<S, T> implements Serializable {
-
-  /** Task context received through {@code setContext}; not serialized. */
-  private transient TaskInputOutputContext<?, ?, ?, ?> context;
-
-  /**
-   * Hook for adjusting the configuration of the Job this function belongs to.
-   * The default implementation does nothing.
-   *
-   * <p>
-   * Called during the job planning phase by the crunch-client.
-   * </p>
-   *
-   * @param conf
-   *          the Configuration instance for the Job
-   */
-  public void configure(Configuration conf) {
-  }
-
-  /**
-   * Hook for one-time setup, run before {@link #process(Object, Emitter)} is
-   * triggered. The default implementation does nothing.
-   *
-   * <p>
-   * Called during the setup of the job instance this {@code DoFn} is
-   * associated with.
-   * </p>
-   */
-  public void initialize() {
-  }
-
-  /**
-   * Handles one record of a {@link PCollection}.
-   *
-   * <br/>
-   * <br/>
-   * <b>Note:</b> Crunch may pass the same input object on every call and only
-   * change its content. This follows from Hadoop's <a href=
-   * "http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/mapred/Reducer.html"
-   * >Reducer</a> implementation: <i>The framework will reuse the key and value
-   * objects that are passed into the reduce, therefore the application should
-   * clone the objects they want to keep a copy of.</i>
-   *
-   * @param input
-   *          the record to process
-   * @param emitter
-   *          the destination for any output
-   */
-  public abstract void process(S input, Emitter<T> emitter);
-
-  /**
-   * Hook run during the cleanup of the MapReduce job this {@code DoFn} is
-   * associated with. The default implementation does nothing.
-   *
-   * @param emitter
-   *          the emitter that was used for output
-   */
-  public void cleanup(Emitter<T> emitter) {
-  }
-
-  /**
-   * Stores the {@link TaskInputOutputContext} for this instance. Called during
-   * setup.
-   */
-  public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
-    this.context = context;
-  }
-
-  /**
-   * Estimates by which factor applying this function changes the size of a
-   * {@link PCollection}. The optimizer uses these estimates to decide where to
-   * break up dependent MR jobs into separate Map and Reduce phases in order to
-   * minimize I/O.
-   *
-   * <p>
-   * Subclasses that substantially alter the size of the resulting
-   * {@code PCollection} should override this method.
-   *
-   * @return the size estimate; 1.2 unless overridden
-   */
-  public float scaleFactor() {
-    return 1.2f;
-  }
-
-  /** Returns the context passed to {@code setContext}, or null if none was set. */
-  protected TaskInputOutputContext<?, ?, ?, ?> getContext() {
-    return this.context;
-  }
-
-  // The helpers below all dereference the stored context, so they need a
-  // context to have been set first.
-
-  /** Returns the configuration of the stored context. */
-  protected Configuration getConfiguration() {
-    return this.context.getConfiguration();
-  }
-
-  /** Looks up the counter for the given enum constant in the stored context. */
-  protected Counter getCounter(Enum<?> counterName) {
-    return this.context.getCounter(counterName);
-  }
-
-  /** Looks up the counter with the given group and name in the stored context. */
-  protected Counter getCounter(String groupName, String counterName) {
-    return this.context.getCounter(groupName, counterName);
-  }
-
-  /** Adds one to the given counter. */
-  protected void increment(Enum<?> counterName) {
-    increment(counterName, 1L);
-  }
-
-  /** Adds {@code value} to the given counter. */
-  protected void increment(Enum<?> counterName, long value) {
-    final Counter counter = getCounter(counterName);
-    counter.increment(value);
-  }
-
-  /** Reports progress to the stored context. */
-  protected void progress() {
-    this.context.progress();
-  }
-
-  /** Returns the task attempt id of the stored context. */
-  protected TaskAttemptID getTaskAttemptID() {
-    return this.context.getTaskAttemptID();
-  }
-
-  /** Sets the status of the stored context. */
-  protected void setStatus(String status) {
-    this.context.setStatus(status);
-  }
-
-  /** Returns the status of the stored context. */
-  protected String getStatus() {
-    return this.context.getStatus();
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/Emitter.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/Emitter.java b/crunch/src/main/java/org/apache/crunch/Emitter.java
deleted file mode 100644
index d104a09..0000000
--- a/crunch/src/main/java/org/apache/crunch/Emitter.java
+++ /dev/null
@@ -1,37 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-/**
- * The sink a {@link DoFn} writes its outputs to.
- *
- * @param <T> the type of the emitted values
- */
-public interface Emitter<T> {
-  /**
-   * Passes a value on to the next stage of the pipeline.
-   *
-   * @param emitted
-   *          the value to write
-   */
-  void emit(T emitted);
-
-  /**
-   * Flushes any values this emitter has cached. Called during the cleanup
-   * stage.
-   */
-  void flush();
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/FilterFn.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/FilterFn.java b/crunch/src/main/java/org/apache/crunch/FilterFn.java
deleted file mode 100644
index 440f122..0000000
--- a/crunch/src/main/java/org/apache/crunch/FilterFn.java
+++ /dev/null
@@ -1,244 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import java.util.List;
-
-import org.apache.crunch.fn.FilterFns;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.mapreduce.TaskInputOutputContext;
-
-import com.google.common.collect.ImmutableList;
-
-/**
- * A {@link DoFn} that keeps or drops each member of a {@link PCollection}
- * according to a boolean condition.
- */
-public abstract class FilterFn<T> extends DoFn<T, T> {
-
-  /**
-   * Decides whether a record is kept.
-   *
-   * @return true if {@code input} should be emitted
-   */
-  public abstract boolean accept(T input);
-
-  /** Emits {@code input} unchanged when {@link #accept(Object)} returns true. */
-  @Override
-  public void process(T input, Emitter<T> emitter) {
-    if (!accept(input)) {
-      return;
-    }
-    emitter.emit(input);
-  }
-
-  /** Delegates to the no-argument {@link #cleanup()}; the emitter is not used. */
-  @Override
-  public final void cleanup(Emitter<T> emitter) {
-    cleanup();
-  }
-
-  /**
-   * Hook run during the cleanup of the MapReduce job this {@code FilterFn} is
-   * associated with. The default implementation does nothing.
-   */
-  public void cleanup() {
-  }
-
-  /** Returns 0.5 as the size estimate of a filter. */
-  @Override
-  public float scaleFactor() {
-    return 0.5f;
-  }
-
-  /**
-   * @deprecated Use {@link FilterFns#and(FilterFn...)}
-   */
-  public static <S> FilterFn<S> and(FilterFn<S>... fns) {
-    return new AndFn<S>(fns);
-  }
-
-  /**
-   * Accepts a record only if every wrapped filter accepts it.
-   *
-   * @deprecated Use {@link FilterFns#and(FilterFn...)}
-   */
-  public static class AndFn<S> extends FilterFn<S> {
-
-    private final List<FilterFn<S>> fns;
-
-    public AndFn(FilterFn<S>... fns) {
-      this.fns = ImmutableList.<FilterFn<S>> copyOf(fns);
-    }
-
-    // The lifecycle calls below are forwarded to every wrapped filter, in order.
-
-    @Override
-    public void configure(Configuration conf) {
-      for (FilterFn<S> delegate : fns) {
-        delegate.configure(conf);
-      }
-    }
-
-    @Override
-    public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
-      for (FilterFn<S> delegate : fns) {
-        delegate.setContext(context);
-      }
-    }
-
-    @Override
-    public void initialize() {
-      for (FilterFn<S> delegate : fns) {
-        delegate.initialize();
-      }
-    }
-
-    @Override
-    public void cleanup() {
-      for (FilterFn<S> delegate : fns) {
-        delegate.cleanup();
-      }
-    }
-
-    /** Stops at the first wrapped filter that rejects {@code input}. */
-    @Override
-    public boolean accept(S input) {
-      boolean accepted = true;
-      for (FilterFn<S> delegate : fns) {
-        if (!delegate.accept(input)) {
-          accepted = false;
-          break;
-        }
-      }
-      return accepted;
-    }
-
-    /** Returns the product of the wrapped filters' scale factors. */
-    @Override
-    public float scaleFactor() {
-      float product = 1.0f;
-      for (FilterFn<S> delegate : fns) {
-        product = product * delegate.scaleFactor();
-      }
-      return product;
-    }
-  }
-
-  /**
-   * @deprecated Use {@link FilterFns#or(FilterFn...)}
-   */
-  public static <S> FilterFn<S> or(FilterFn<S>... fns) {
-    return new OrFn<S>(fns);
-  }
-
-  /**
-   * Accepts a record if at least one wrapped filter accepts it.
-   *
-   * @deprecated Use {@link FilterFns#or(FilterFn...)}
-   */
-  public static class OrFn<S> extends FilterFn<S> {
-
-    private final List<FilterFn<S>> fns;
-
-    public OrFn(FilterFn<S>... fns) {
-      this.fns = ImmutableList.<FilterFn<S>> copyOf(fns);
-    }
-
-    // The lifecycle calls below are forwarded to every wrapped filter, in order.
-
-    @Override
-    public void configure(Configuration conf) {
-      for (FilterFn<S> delegate : fns) {
-        delegate.configure(conf);
-      }
-    }
-
-    @Override
-    public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
-      for (FilterFn<S> delegate : fns) {
-        delegate.setContext(context);
-      }
-    }
-
-    @Override
-    public void initialize() {
-      for (FilterFn<S> delegate : fns) {
-        delegate.initialize();
-      }
-    }
-
-    @Override
-    public void cleanup() {
-      for (FilterFn<S> delegate : fns) {
-        delegate.cleanup();
-      }
-    }
-
-    /** Stops at the first wrapped filter that accepts {@code input}. */
-    @Override
-    public boolean accept(S input) {
-      boolean accepted = false;
-      for (FilterFn<S> delegate : fns) {
-        if (delegate.accept(input)) {
-          accepted = true;
-          break;
-        }
-      }
-      return accepted;
-    }
-
-    /** Returns the sum of the wrapped filters' scale factors, capped at 1.0. */
-    @Override
-    public float scaleFactor() {
-      float sum = 0.0f;
-      for (FilterFn<S> delegate : fns) {
-        sum = sum + delegate.scaleFactor();
-      }
-      return Math.min(1.0f, sum);
-    }
-  }
-
-  /**
-   * @deprecated Use {@link FilterFns#not(FilterFn)}
-   */
-  public static <S> FilterFn<S> not(FilterFn<S> fn) {
-    return new NotFn<S>(fn);
-  }
-
-  /**
-   * Accepts exactly the records that the wrapped filter rejects.
-   *
-   * @deprecated Use {@link FilterFns#not(FilterFn)}
-   */
-  public static class NotFn<S> extends FilterFn<S> {
-
-    private final FilterFn<S> base;
-
-    public NotFn(FilterFn<S> base) {
-      this.base = base;
-    }
-
-    // The lifecycle calls below are forwarded to the wrapped filter.
-
-    @Override
-    public void configure(Configuration conf) {
-      this.base.configure(conf);
-    }
-
-    @Override
-    public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
-      this.base.setContext(context);
-    }
-
-    @Override
-    public void initialize() {
-      this.base.initialize();
-    }
-
-    @Override
-    public void cleanup() {
-      this.base.cleanup();
-    }
-
-    @Override
-    public boolean accept(S input) {
-      final boolean baseAccepts = this.base.accept(input);
-      return !baseAccepts;
-    }
-
-    /** Returns one minus the wrapped filter's scale factor. */
-    @Override
-    public float scaleFactor() {
-      return 1.0f - this.base.scaleFactor();
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/GroupingOptions.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/GroupingOptions.java b/crunch/src/main/java/org/apache/crunch/GroupingOptions.java
deleted file mode 100644
index 4aa1343..0000000
--- a/crunch/src/main/java/org/apache/crunch/GroupingOptions.java
+++ /dev/null
@@ -1,167 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.hadoop.io.RawComparator;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.Partitioner;
-
-import com.google.common.collect.Maps;
-import com.google.common.collect.Sets;
-
-/**
- * Options that can be passed to a {@code groupByKey} operation in order to
- * exercise finer control over how the partitioning, grouping, and sorting of
- * keys is performed. Instances are created through {@link #builder()}.
- */
-public class GroupingOptions {
-
-  private final Class<? extends Partitioner> partitionerClass;
-  private final Class<? extends RawComparator> groupingComparatorClass;
-  private final Class<? extends RawComparator> sortComparatorClass;
-  private final int numReducers;
-  private final Map<String, String> extraConf;
-  private final Set<SourceTarget<?>> sourceTargets;
-
-  private GroupingOptions(Class<? extends Partitioner> partitionerClass,
-      Class<? extends RawComparator> groupingComparatorClass, Class<? extends RawComparator> sortComparatorClass,
-      int numReducers, Map<String, String> extraConf, Set<SourceTarget<?>> sourceTargets) {
-    this.partitionerClass = partitionerClass;
-    this.groupingComparatorClass = groupingComparatorClass;
-    this.sortComparatorClass = sortComparatorClass;
-    this.numReducers = numReducers;
-    this.extraConf = extraConf;
-    this.sourceTargets = sourceTargets;
-  }
-
-  /** Returns the requested number of reducers, or 0 if none was requested. */
-  public int getNumReducers() {
-    return numReducers;
-  }
-
-  /** Returns the sort comparator class, or null if none was set. */
-  public Class<? extends RawComparator> getSortComparatorClass() {
-    return sortComparatorClass;
-  }
-
-  /** Returns the grouping comparator class, or null if none was set. */
-  public Class<? extends RawComparator> getGroupingComparatorClass() {
-    return groupingComparatorClass;
-  }
-
-  /** Returns the partitioner class, or null if none was set. */
-  public Class<? extends Partitioner> getPartitionerClass() {
-    return partitionerClass;
-  }
-
-  /** Returns the source targets registered through the builder. */
-  public Set<SourceTarget<?>> getSourceTargets() {
-    return sourceTargets;
-  }
-
-  /**
-   * Applies these options to the given job: every class that was set, the
-   * number of reducers if it is positive, and every extra configuration entry.
-   *
-   * @param job the job to configure
-   */
-  public void configure(Job job) {
-    if (partitionerClass != null) {
-      job.setPartitionerClass(partitionerClass);
-    }
-    if (groupingComparatorClass != null) {
-      job.setGroupingComparatorClass(groupingComparatorClass);
-    }
-    if (sortComparatorClass != null) {
-      job.setSortComparatorClass(sortComparatorClass);
-    }
-    if (numReducers > 0) {
-      job.setNumReduceTasks(numReducers);
-    }
-    for (Map.Entry<String, String> e : extraConf.entrySet()) {
-      job.getConfiguration().set(e.getKey(), e.getValue());
-    }
-  }
-
-  /**
-   * Compares the partitioner, grouping comparator, sort comparator and extra
-   * configuration of the two instances. The number of reducers and the source
-   * targets are not part of the comparison.
-   *
-   * @param other the options to compare against
-   * @return true if all compared settings are equal
-   */
-  public boolean isCompatibleWith(GroupingOptions other) {
-    if (partitionerClass != other.partitionerClass) {
-      return false;
-    }
-    if (groupingComparatorClass != other.groupingComparatorClass) {
-      return false;
-    }
-    if (sortComparatorClass != other.sortComparatorClass) {
-      return false;
-    }
-    if (!extraConf.equals(other.extraConf)) {
-      return false;
-    }
-    return true;
-  }
-
-  /** Returns a new, empty {@link Builder}. */
-  public static Builder builder() {
-    return new Builder();
-  }
-
-  /**
-   * Builder class for creating {@code GroupingOptions} instances.
-   */
-  public static class Builder {
-    private Class<? extends Partitioner> partitionerClass;
-    private Class<? extends RawComparator> groupingComparatorClass;
-    private Class<? extends RawComparator> sortComparatorClass;
-    private int numReducers;
-    private Map<String, String> extraConf = Maps.newHashMap();
-    private Set<SourceTarget<?>> sourceTargets = Sets.newHashSet();
-
-    public Builder() {
-    }
-
-    public Builder partitionerClass(Class<? extends Partitioner> partitionerClass) {
-      this.partitionerClass = partitionerClass;
-      return this;
-    }
-
-    public Builder groupingComparatorClass(Class<? extends RawComparator> groupingComparatorClass) {
-      this.groupingComparatorClass = groupingComparatorClass;
-      return this;
-    }
-
-    public Builder sortComparatorClass(Class<? extends RawComparator> sortComparatorClass) {
-      this.sortComparatorClass = sortComparatorClass;
-      return this;
-    }
-
-    /**
-     * Sets the number of reducers.
-     *
-     * @throws IllegalArgumentException if {@code numReducers} is not positive
-     */
-    public Builder numReducers(int numReducers) {
-      if (numReducers <= 0) {
-        throw new IllegalArgumentException("Invalid number of reducers: " + numReducers);
-      }
-      this.numReducers = numReducers;
-      return this;
-    }
-
-    /** Adds an extra configuration entry that {@code configure} sets on the job. */
-    public Builder conf(String confKey, String confValue) {
-      this.extraConf.put(confKey, confValue);
-      return this;
-    }
-
-    /** Registers an additional source target. */
-    public Builder sourceTarget(SourceTarget<?> st) {
-      this.sourceTargets.add(st);
-      return this;
-    }
-
-    /**
-     * Creates the options from the current state of this builder.
-     *
-     * @return a new instance holding its own copies of the configuration
-     *         entries and source targets
-     */
-    public GroupingOptions build() {
-      // Copy the collections so that later conf()/sourceTarget() calls on this
-      // builder cannot change options that have already been built.
-      Map<String, String> confCopy = Maps.newHashMap(extraConf);
-      Set<SourceTarget<?>> sourceTargetsCopy = Sets.newHashSet(sourceTargets);
-      return new GroupingOptions(partitionerClass, groupingComparatorClass, sortComparatorClass,
-          numReducers, confCopy, sourceTargetsCopy);
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/MapFn.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/MapFn.java b/crunch/src/main/java/org/apache/crunch/MapFn.java
deleted file mode 100644
index dbf172e..0000000
--- a/crunch/src/main/java/org/apache/crunch/MapFn.java
+++ /dev/null
@@ -1,41 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-/**
- * A {@link DoFn} that produces exactly one output value for every input
- * record.
- *
- * @param <S> the type of the input records
- * @param <T> the type of the mapped values
- */
-public abstract class MapFn<S, T> extends DoFn<S, T> {
-
-  /**
-   * Converts the given input into an instance of the output type.
-   */
-  public abstract T map(S input);
-
-  /** Emits the result of {@link #map(Object)} for {@code input}. */
-  @Override
-  public void process(S input, Emitter<T> emitter) {
-    final T mapped = map(input);
-    emitter.emit(mapped);
-  }
-
-  /** Returns 1.0, since one value is emitted per input record. */
-  @Override
-  public float scaleFactor() {
-    return 1.0f;
-  }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/PCollection.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/PCollection.java b/crunch/src/main/java/org/apache/crunch/PCollection.java
deleted file mode 100644
index 6f5abf6..0000000
--- a/crunch/src/main/java/org/apache/crunch/PCollection.java
+++ /dev/null
@@ -1,245 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import java.util.Collection;
-
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.PTypeFamily;
-
-/**
- * An immutable, distributed collection of elements. It is the fundamental
- * target of computations in Crunch.
- *
- * @param <S> the type of the elements
- */
-public interface PCollection<S> {
-  /**
-   * Returns the {@code Pipeline} this PCollection is associated with.
-   */
-  Pipeline getPipeline();
-
-  /**
-   * Returns a {@code PCollection} that acts as the union of this
-   * {@code PCollection} and {@code other}.
-   */
-  PCollection<S> union(PCollection<S> other);
-
-  /**
-   * Returns a {@code PCollection} that acts as the union of this
-   * {@code PCollection} and all of the given {@code PCollection}s.
-   */
-  PCollection<S> union(PCollection<S>... collections);
-
-  /**
-   * Runs {@code doFn} over the elements of this {@code PCollection} and
-   * returns its output as a new {@code PCollection}.
-   *
-   * @param doFn
-   *          the {@code DoFn} to apply
-   * @param type
-   *          the {@link PType} of the resulting {@code PCollection}
-   * @return a new {@code PCollection}
-   */
-  <T> PCollection<T> parallelDo(DoFn<S, T> doFn, PType<T> type);
-
-  /**
-   * Runs {@code doFn} over the elements of this {@code PCollection} and
-   * returns its output as a new {@code PCollection}.
-   *
-   * @param name
-   *          an identifier for this processing step, useful for debugging
-   * @param doFn
-   *          the {@code DoFn} to apply
-   * @param type
-   *          the {@link PType} of the resulting {@code PCollection}
-   * @return a new {@code PCollection}
-   */
-  <T> PCollection<T> parallelDo(String name, DoFn<S, T> doFn, PType<T> type);
-
-  /**
-   * Runs {@code doFn} over the elements of this {@code PCollection} and
-   * returns its output as a new {@code PCollection}.
-   *
-   * @param name
-   *          an identifier for this processing step, useful for debugging
-   * @param doFn
-   *          the {@code DoFn} to apply
-   * @param type
-   *          the {@link PType} of the resulting {@code PCollection}
-   * @param options
-   *          optional information that is needed for certain pipeline
-   *          operations
-   * @return a new {@code PCollection}
-   */
-  <T> PCollection<T> parallelDo(String name, DoFn<S, T> doFn, PType<T> type,
-      ParallelDoOptions options);
-
-  /**
-   * Like the {@code parallelDo} variants that return a {@code PCollection},
-   * but the result is a {@code PTable}.
-   *
-   * @param doFn
-   *          the {@code DoFn} to apply
-   * @param type
-   *          the {@link PTableType} of the resulting {@code PTable}
-   * @return a new {@code PTable}
-   */
-  <K, V> PTable<K, V> parallelDo(DoFn<S, Pair<K, V>> doFn, PTableType<K, V> type);
-
-  /**
-   * Like the {@code parallelDo} variants that return a {@code PCollection},
-   * but the result is a {@code PTable}.
-   *
-   * @param name
-   *          an identifier for this processing step
-   * @param doFn
-   *          the {@code DoFn} to apply
-   * @param type
-   *          the {@link PTableType} of the resulting {@code PTable}
-   * @return a new {@code PTable}
-   */
-  <K, V> PTable<K, V> parallelDo(String name, DoFn<S, Pair<K, V>> doFn, PTableType<K, V> type);
-
-  /**
-   * Like the {@code parallelDo} variants that return a {@code PCollection},
-   * but the result is a {@code PTable}.
-   *
-   * @param name
-   *          an identifier for this processing step
-   * @param doFn
-   *          the {@code DoFn} to apply
-   * @param type
-   *          the {@link PTableType} of the resulting {@code PTable}
-   * @param options
-   *          optional information that is needed for certain pipeline
-   *          operations
-   * @return a new {@code PTable}
-   */
-  <K, V> PTable<K, V> parallelDo(String name, DoFn<S, Pair<K, V>> doFn, PTableType<K, V> type,
-      ParallelDoOptions options);
-
-  /**
-   * Writes the contents of this {@code PCollection} to {@code target}, in the
-   * storage format specified by the target.
-   *
-   * @param target
-   *          the target to write to
-   */
-  PCollection<S> write(Target target);
-
-  /**
-   * Writes the contents of this {@code PCollection} to {@code target}, with
-   * {@code writeMode} deciding how existing targets are handled.
-   *
-   * @param target
-   *          the target to write to
-   * @param writeMode
-   *          the rule for handling existing outputs at the target location
-   */
-  PCollection<S> write(Target target, Target.WriteMode writeMode);
-
-  /**
-   * Returns a reference to the data set represented by this PCollection that
-   * the client may use to read the data locally.
-   */
-  Iterable<S> materialize();
-
-  /**
-   * @return a {@code PObject} encapsulating an in-memory {@link Collection}
-   *         containing the values of this {@code PCollection}
-   */
-  PObject<Collection<S>> asCollection();
-
-  /**
-   * Returns the {@code PType} of this {@code PCollection}.
-   */
-  PType<S> getPType();
-
-  /**
-   * Returns the {@code PTypeFamily} of this {@code PCollection}.
-   */
-  PTypeFamily getTypeFamily();
-
-  /**
-   * Returns the size, in bytes, of the data represented by this
-   * {@code PCollection}.
-   */
-  long getSize();
-
-  /**
-   * Returns the number of elements represented by this {@code PCollection}.
-   *
-   * @return a {@code PObject} containing the number of elements in this
-   *         {@code PCollection}
-   */
-  PObject<Long> length();
-
-  /**
-   * Returns a shorthand name for this PCollection.
-   */
-  String getName();
-
-  /**
-   * Applies the given filter function to this instance and returns the
-   * resulting {@code PCollection}.
-   */
-  PCollection<S> filter(FilterFn<S> filterFn);
-
-  /**
-   * Applies the given filter function to this instance and returns the
-   * resulting {@code PCollection}.
-   *
-   * @param name
-   *          an identifier for this processing step
-   * @param filterFn
-   *          the {@code FilterFn} to apply
-   */
-  PCollection<S> filter(String name, FilterFn<S> filterFn);
-
-  /**
-   * Applies the given map function to each element of this instance in order
-   * to create a {@code PTable}.
-   */
-  <K> PTable<K, S> by(MapFn<S, K> extractKeyFn, PType<K> keyType);
-
-  /**
-   * Applies the given map function to each element of this instance in order
-   * to create a {@code PTable}.
-   *
-   * @param name
-   *          an identifier for this processing step
-   * @param extractKeyFn
-   *          the {@code MapFn} to apply
-   */
-  <K> PTable<K, S> by(String name, MapFn<S, K> extractKeyFn, PType<K> keyType);
-
-  /**
-   * Returns a {@code PTable} holding the count of each unique element of this
-   * PCollection.
-   */
-  PTable<S, Long> count();
-
-  /**
-   * Returns a {@code PObject} of the maximum element of this instance.
-   */
-  PObject<S> max();
-
-  /**
-   * Returns a {@code PObject} of the minimum element of this instance.
-   */
-  PObject<S> min();
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/PGroupedTable.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/PGroupedTable.java b/crunch/src/main/java/org/apache/crunch/PGroupedTable.java
deleted file mode 100644
index d77ffdb..0000000
--- a/crunch/src/main/java/org/apache/crunch/PGroupedTable.java
+++ /dev/null
@@ -1,53 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import org.apache.crunch.Aggregator;
-
-/**
- * A {@link PTable} after grouping, as represented in Crunch: each element
- * pairs a key with the values of its group.
- *
- * @param <K> the type of the keys
- * @param <V> the type of the grouped values
- */
-public interface PGroupedTable<K, V> extends PCollection<Pair<K, Iterable<V>>> {
-
-  /**
-   * Combines the values of this grouping with the given {@code CombineFn}.
-   *
-   * @param combineFn
-   *          the combiner function
-   * @return a {@code PTable} where each key has a single value
-   */
-  PTable<K, V> combineValues(CombineFn<K, V> combineFn);
-
-  /**
-   * Combines the values in each group with the given {@link Aggregator}.
-   *
-   * @param aggregator
-   *          the function to use
-   * @return a {@link PTable} where each group key maps to an aggregated
-   *         value; group keys may be repeated if an aggregator returns more
-   *         than one value
-   */
-  PTable<K, V> combineValues(Aggregator<V> aggregator);
-
-  /**
-   * Converts this grouping back into a multimap.
-   *
-   * @return an ungrouped version of the data in this {@code PGroupedTable}
-   */
-  PTable<K, V> ungroup();
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/PObject.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/PObject.java b/crunch/src/main/java/org/apache/crunch/PObject.java
deleted file mode 100644
index 897a01f..0000000
--- a/crunch/src/main/java/org/apache/crunch/PObject.java
+++ /dev/null
@@ -1,36 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-/**
- * A {@code PObject} represents a singleton object value that results from a distributed
- * computation. Computation producing the value is deferred until
- * {@link org.apache.crunch.PObject#getValue()} is called.
- *
- * @param <T> The type of value encapsulated by this {@code PObject}.
- */
-public interface PObject<T> {
- /**
- * Gets the value associated with this {@code PObject}. Calling this method will trigger
- * whatever computation is necessary to obtain the value and block until that computation
- * succeeds.
- *
- * @return The value associated with this {@code PObject}.
- */
- T getValue();
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/PTable.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/PTable.java b/crunch/src/main/java/org/apache/crunch/PTable.java
deleted file mode 100644
index 8df9853..0000000
--- a/crunch/src/main/java/org/apache/crunch/PTable.java
+++ /dev/null
@@ -1,181 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import java.util.Collection;
-import java.util.Map;
-
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-
-/**
- * A sub-interface of {@code PCollection} that represents an immutable,
- * distributed multi-map of keys and values.
- *
- */
-public interface PTable<K, V> extends PCollection<Pair<K, V>> {
-
- /**
- * Returns a {@code PTable} instance that acts as the union of this
- * {@code PTable} and the other {@code PTable}.
- */
- PTable<K, V> union(PTable<K, V> other);
-
- /**
- * Returns a {@code PTable} instance that acts as the union of this
- * {@code PTable} and the input {@code PTable}s.
- */
- PTable<K, V> union(PTable<K, V>... others);
-
- /**
- * Performs a grouping operation on the keys of this table.
- *
- * @return a {@code PGroupedTable} instance that represents the grouping
- */
- PGroupedTable<K, V> groupByKey();
-
- /**
- * Performs a grouping operation on the keys of this table, using the given
- * number of partitions.
- *
- * @param numPartitions
- * The number of partitions for the data.
- * @return a {@code PGroupedTable} instance that represents this grouping
- */
- PGroupedTable<K, V> groupByKey(int numPartitions);
-
- /**
- * Performs a grouping operation on the keys of this table, using the
- * additional {@code GroupingOptions} to control how the grouping is executed.
- *
- * @param options
- * The grouping options to use
- * @return a {@code PGroupedTable} instance that represents the grouping
- */
- PGroupedTable<K, V> groupByKey(GroupingOptions options);
-
- /**
- * Writes this {@code PTable} to the given {@code Target}.
- */
- PTable<K, V> write(Target target);
-
- /**
- * Writes this {@code PTable} to the given {@code Target}, using the
- * given {@code Target.WriteMode} to handle existing targets.
- */
- PTable<K, V> write(Target target, Target.WriteMode writeMode);
-
- /**
- * Returns the {@code PTableType} of this {@code PTable}.
- */
- PTableType<K, V> getPTableType();
-
- /**
- * Returns the {@code PType} of the key.
- */
- PType<K> getKeyType();
-
- /**
- * Returns the {@code PType} of the value.
- */
- PType<V> getValueType();
-
- /**
- * Aggregate all of the values with the same key into a single key-value pair
- * in the returned PTable.
- */
- PTable<K, Collection<V>> collectValues();
-
- /**
- * Apply the given filter function to this instance and return the resulting
- * {@code PTable}.
- */
- PTable<K, V> filter(FilterFn<Pair<K, V>> filterFn);
-
- /**
- * Apply the given filter function to this instance and return the resulting
- * {@code PTable}.
- *
- * @param name
- * An identifier for this processing step
- * @param filterFn
- * The {@code FilterFn} to apply
- */
- PTable<K, V> filter(String name, FilterFn<Pair<K, V>> filterFn);
-
- /**
- * Returns a PTable made up of the pairs in this PTable with the largest value
- * field.
- *
- * @param count
- * The number of pairs to return
- */
- PTable<K, V> top(int count);
-
- /**
- * Returns a PTable made up of the pairs in this PTable with the smallest
- * value field.
- *
- * @param count
- * The number of pairs to return
- */
- PTable<K, V> bottom(int count);
-
- /**
- * Perform an inner join on this table and the one passed in as an argument on
- * their common keys.
- */
- <U> PTable<K, Pair<V, U>> join(PTable<K, U> other);
-
- /**
- * Co-group operation with the given table on common keys.
- */
- <U> PTable<K, Pair<Collection<V>, Collection<U>>> cogroup(PTable<K, U> other);
-
- /**
- * Returns a {@link PCollection} made up of the keys in this PTable.
- */
- PCollection<K> keys();
-
- /**
- * Returns a {@link PCollection} made up of the values in this PTable.
- */
- PCollection<V> values();
-
- /**
- * Returns a {@code Map<K, V>} made up of the keys and values in this PTable.
- * <p>
- * <b>Note:</b> The contents of the returned map may not be exactly the same
- * as this PTable, as a PTable is a multi-map (i.e. can contain multiple
- * values for a single key).
- */
- Map<K, V> materializeToMap();
-
- /**
- * Returns a {@link PObject} encapsulating a {@link Map} made up of the keys and values in this
- * {@code PTable}.
- * <p><b>Note:</b> The contents of the returned map may not be exactly the same as this PTable,
- * as a PTable is a multi-map (i.e. can contain multiple values for a single key).
- * </p>
- *
- * @return The {@code PObject} encapsulating a {@code Map} made up of the keys and values in
- * this {@code PTable}.
- */
- PObject<Map<K, V>> asMap();
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/Pair.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/Pair.java b/crunch/src/main/java/org/apache/crunch/Pair.java
deleted file mode 100644
index fd058b6..0000000
--- a/crunch/src/main/java/org/apache/crunch/Pair.java
+++ /dev/null
@@ -1,105 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import org.apache.commons.lang.builder.HashCodeBuilder;
-
-/**
- * A convenience class for two-element {@link Tuple}s.
- */
-public class Pair<K, V> implements Tuple, Comparable<Pair<K, V>> {
-
- private final K first;
- private final V second;
-
- public static <T, U> Pair<T, U> of(T first, U second) {
- return new Pair<T, U>(first, second);
- }
-
- public Pair(K first, V second) {
- this.first = first;
- this.second = second;
- }
-
- public K first() {
- return first;
- }
-
- public V second() {
- return second;
- }
-
- public Object get(int index) {
- switch (index) {
- case 0:
- return first;
- case 1:
- return second;
- default:
- throw new ArrayIndexOutOfBoundsException();
- }
- }
-
- public int size() {
- return 2;
- }
-
- @Override
- public int hashCode() {
- HashCodeBuilder hcb = new HashCodeBuilder();
- return hcb.append(first).append(second).toHashCode();
- }
-
- @Override
- public boolean equals(Object obj) {
- if (this == obj)
- return true;
- if (obj == null)
- return false;
- if (getClass() != obj.getClass())
- return false;
- Pair<?, ?> other = (Pair<?, ?>) obj;
- return (first == other.first || (first != null && first.equals(other.first)))
- && (second == other.second || (second != null && second.equals(other.second)));
- }
-
- @Override
- public String toString() {
- StringBuilder sb = new StringBuilder("[");
- sb.append(first).append(",").append(second).append("]");
- return sb.toString();
- }
-
- private int cmp(Object lhs, Object rhs) {
- if (lhs == rhs) {
- return 0;
- } else if (lhs != null && Comparable.class.isAssignableFrom(lhs.getClass())) {
- return ((Comparable) lhs).compareTo(rhs);
- }
- return (lhs == null ? 0 : lhs.hashCode()) - (rhs == null ? 0 : rhs.hashCode());
- }
-
- @Override
- public int compareTo(Pair<K, V> o) {
- int diff = cmp(first, o.first);
- if (diff == 0) {
- diff = cmp(second, o.second);
- }
- return diff;
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/ParallelDoOptions.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/ParallelDoOptions.java b/crunch/src/main/java/org/apache/crunch/ParallelDoOptions.java
deleted file mode 100644
index 2407b3a..0000000
--- a/crunch/src/main/java/org/apache/crunch/ParallelDoOptions.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import java.util.Collections;
-import java.util.Set;
-
-import com.google.common.collect.Sets;
-
-/**
- * Container class that includes optional information about a {@code parallelDo} operation
- * applied to a {@code PCollection}. Primarily used within the Crunch framework
- * itself for certain types of advanced processing operations, such as in-memory joins
- * that require reading a file from the filesystem into a {@code DoFn}.
- */
-public class ParallelDoOptions {
- private final Set<SourceTarget<?>> sourceTargets;
-
- private ParallelDoOptions(Set<SourceTarget<?>> sourceTargets) {
- this.sourceTargets = sourceTargets;
- }
-
- public Set<SourceTarget<?>> getSourceTargets() {
- return sourceTargets;
- }
-
- public static Builder builder() {
- return new Builder();
- }
-
- public static class Builder {
- private Set<SourceTarget<?>> sourceTargets;
-
- public Builder() {
- this.sourceTargets = Sets.newHashSet();
- }
-
- public Builder sourceTargets(SourceTarget<?>... sourceTargets) {
- Collections.addAll(this.sourceTargets, sourceTargets);
- return this;
- }
-
- public ParallelDoOptions build() {
- return new ParallelDoOptions(sourceTargets);
- }
- }
-}
[22/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/types/writable/WritablesTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/types/writable/WritablesTest.java b/crunch-core/src/test/java/org/apache/crunch/types/writable/WritablesTest.java
new file mode 100644
index 0000000..5396fba
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/types/writable/WritablesTest.java
@@ -0,0 +1,256 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.writable;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNotSame;
+import static org.junit.Assert.assertSame;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Collection;
+
+import org.apache.crunch.Pair;
+import org.apache.crunch.Tuple3;
+import org.apache.crunch.Tuple4;
+import org.apache.crunch.TupleN;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+import org.apache.hadoop.io.BooleanWritable;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+
+public class WritablesTest {
+
+ @Test
+ public void testNulls() throws Exception {
+ Void n = null;
+ NullWritable nw = NullWritable.get();
+ testInputOutputFn(Writables.nulls(), n, nw);
+ }
+
+ @Test
+ public void testStrings() throws Exception {
+ String s = "abc";
+ Text text = new Text(s);
+ testInputOutputFn(Writables.strings(), s, text);
+ }
+
+ @Test
+ public void testInts() throws Exception {
+ int j = 55;
+ IntWritable w = new IntWritable(j);
+ testInputOutputFn(Writables.ints(), j, w);
+ }
+
+ @Test
+ public void testLongs() throws Exception {
+ long j = 55;
+ LongWritable w = new LongWritable(j);
+ testInputOutputFn(Writables.longs(), j, w);
+ }
+
+ @Test
+ public void testFloats() throws Exception {
+ float j = 55.5f;
+ FloatWritable w = new FloatWritable(j);
+ testInputOutputFn(Writables.floats(), j, w);
+ }
+
+ @Test
+ public void testDoubles() throws Exception {
+ double j = 55.5d;
+ DoubleWritable w = new DoubleWritable(j);
+ testInputOutputFn(Writables.doubles(), j, w);
+ }
+
+ @Test
+ public void testBoolean() throws Exception {
+ boolean j = false;
+ BooleanWritable w = new BooleanWritable(j);
+ testInputOutputFn(Writables.booleans(), j, w);
+ }
+
+ @Test
+ public void testBytes() throws Exception {
+ byte[] bytes = new byte[] { 17, 26, -98 };
+ BytesWritable bw = new BytesWritable(bytes);
+ ByteBuffer bb = ByteBuffer.wrap(bytes);
+ testInputOutputFn(Writables.bytes(), bb, bw);
+ }
+
+ @Test
+ public void testCollections() throws Exception {
+ String s = "abc";
+ Collection<String> j = Lists.newArrayList();
+ j.add(s);
+ GenericArrayWritable<Text> w = new GenericArrayWritable<Text>(Text.class);
+ w.set(new Text[] { new Text(s) });
+ testInputOutputFn(Writables.collections(Writables.strings()), j, w);
+ }
+
+ @Test
+ public void testPairs() throws Exception {
+ Pair<String, String> j = Pair.of("a", "b");
+ TupleWritable w = new TupleWritable(new Text[] { new Text("a"), new Text("b"), });
+ w.setWritten(0);
+ w.setWritten(1);
+ testInputOutputFn(Writables.pairs(Writables.strings(), Writables.strings()), j, w);
+ }
+
+ @Test
+ public void testNestedTables() throws Exception {
+ PTableType<Long, Long> pll = Writables.tableOf(Writables.longs(), Writables.longs());
+ PTableType<Pair<Long, Long>, String> nest = Writables.tableOf(pll, Writables.strings());
+ assertNotNull(nest);
+ }
+
+ @Test
+ public void testPairEquals() throws Exception {
+ PType<Pair<Long, ByteBuffer>> t1 = Writables.pairs(Writables.longs(), Writables.bytes());
+ PType<Pair<Long, ByteBuffer>> t2 = Writables.pairs(Writables.longs(), Writables.bytes());
+ assertEquals(t1, t2);
+ assertEquals(t1.hashCode(), t2.hashCode());
+ }
+
+ @Test
+ @SuppressWarnings("rawtypes")
+ public void testTriples() throws Exception {
+ Tuple3 j = Tuple3.of("a", "b", "c");
+ TupleWritable w = new TupleWritable(new Text[] { new Text("a"), new Text("b"), new Text("c"), });
+ w.setWritten(0);
+ w.setWritten(1);
+ w.setWritten(2);
+ WritableType<?, ?> wt = Writables.triples(Writables.strings(), Writables.strings(), Writables.strings());
+ testInputOutputFn(wt, j, w);
+ }
+
+ @Test
+ @SuppressWarnings("rawtypes")
+ public void testQuads() throws Exception {
+ Tuple4 j = Tuple4.of("a", "b", "c", "d");
+ TupleWritable w = new TupleWritable(new Text[] { new Text("a"), new Text("b"), new Text("c"), new Text("d"), });
+ w.setWritten(0);
+ w.setWritten(1);
+ w.setWritten(2);
+ w.setWritten(3);
+ WritableType<?, ?> wt = Writables.quads(Writables.strings(), Writables.strings(), Writables.strings(),
+ Writables.strings());
+ testInputOutputFn(wt, j, w);
+ }
+
+ @Test
+ public void testTupleN() throws Exception {
+ TupleN j = new TupleN("a", "b", "c", "d", "e");
+ TupleWritable w = new TupleWritable(new Text[] { new Text("a"), new Text("b"), new Text("c"), new Text("d"),
+ new Text("e"), });
+ w.setWritten(0);
+ w.setWritten(1);
+ w.setWritten(2);
+ w.setWritten(3);
+ w.setWritten(4);
+ WritableType<?, ?> wt = Writables.tuples(Writables.strings(), Writables.strings(), Writables.strings(),
+ Writables.strings(), Writables.strings());
+ testInputOutputFn(wt, j, w);
+ }
+
+ protected static class TestWritable implements Writable {
+ String left;
+ int right;
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.writeUTF(left);
+ out.writeInt(right);
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ left = in.readUTF();
+ right = in.readInt();
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj)
+ return true;
+ if (obj == null)
+ return false;
+ if (getClass() != obj.getClass())
+ return false;
+ TestWritable other = (TestWritable) obj;
+ if (left == null) {
+ if (other.left != null)
+ return false;
+ } else if (!left.equals(other.left))
+ return false;
+ if (right != other.right)
+ return false;
+ return true;
+ }
+
+ }
+
+ @Test
+ public void testRecords() throws Exception {
+ TestWritable j = new TestWritable();
+ j.left = "a";
+ j.right = 1;
+ TestWritable w = new TestWritable();
+ w.left = "a";
+ w.right = 1;
+ WritableType<?, ?> wt = Writables.records(TestWritable.class);
+ testInputOutputFn(wt, j, w);
+ }
+
+ @Test
+ public void testTableOf() throws Exception {
+ Pair<String, String> j = Pair.of("a", "b");
+ Pair<Text, Text> w = Pair.of(new Text("a"), new Text("b"));
+ WritableTableType<String, String> wtt = Writables.tableOf(Writables.strings(), Writables.strings());
+ testInputOutputFn(wtt, j, w);
+ }
+
+ @Test
+ public void testRegister() throws Exception {
+ WritableType<TestWritable, TestWritable> wt = Writables.writables(TestWritable.class);
+ Writables.register(TestWritable.class, wt);
+ assertSame(Writables.records(TestWritable.class), wt);
+ }
+
+ @SuppressWarnings({ "unchecked", "rawtypes" })
+ protected static void testInputOutputFn(PType ptype, Object java, Object writable) {
+ ptype.getInputMapFn().initialize();
+ ptype.getOutputMapFn().initialize();
+ assertEquals(java, ptype.getInputMapFn().map(writable));
+ assertEquals(writable, ptype.getOutputMapFn().map(java));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/util/DistCacheTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/util/DistCacheTest.java b/crunch-core/src/test/java/org/apache/crunch/util/DistCacheTest.java
new file mode 100644
index 0000000..6784f14
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/util/DistCacheTest.java
@@ -0,0 +1,156 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.util;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+public class DistCacheTest {
+
+ // A temporary folder used to hold files created for the test.
+ @Rule
+ public TemporaryFolder testFolder = new TemporaryFolder();
+
+ // A configuration and lists of paths to use in tests.
+ private Configuration testConf;
+ private String[] testFilePaths;
+ private String[] testFileQualifiedPaths;
+
+ /**
+ * Set up resources for tests. These include:
+ * <ol>
+ * <li>A Hadoop configuration.
+ * <li>A directory of temporary files that includes 3 .jar files and 1 other
+ * file.
+ * <li>Arrays containing the canonical paths and qualified paths to the test
+ * files.
+ * </ol>
+ */
+ @Before
+ public void setup() throws IOException {
+ // Create a configuration for tests.
+ testConf = new Configuration();
+
+ // Create the test files and add their paths to the list of test file paths.
+ testFilePaths = new String[3];
+ testFilePaths[0] = testFolder.newFile("jar1.jar").getCanonicalPath();
+ testFilePaths[1] = testFolder.newFile("jar2.jar").getCanonicalPath();
+ testFilePaths[2] = testFolder.newFile("jar3.jar").getCanonicalPath();
+ testFolder.newFile("notJar.other");
+
+ // Populate a list of qualified paths from the test file paths.
+ testFileQualifiedPaths = new String[3];
+ for (int i = 0; i < testFilePaths.length; i++) {
+ testFileQualifiedPaths[i] = "file:" + testFilePaths[i];
+ }
+ }
+
+ /**
+ * Tests adding jars one-by-one to a job's configuration.
+ *
+ * @throws IOException
+ * If there is a problem adding the jars.
+ */
+ @Test
+ public void testAddJar() throws IOException {
+ // Add each valid jar path to the distributed cache configuration one
+ // at a time, and after each addition verify that tmpjars lists exactly
+ // the jars added so far.
+ for (int i = 0; i < testFilePaths.length; i++) {
+ DistCache.addJarToDistributedCache(testConf, testFilePaths[i]);
+ assertEquals("tmpjars configuration var does not contain expected value.",
+ StringUtils.join(testFileQualifiedPaths, ",", 0, i + 1), testConf.get("tmpjars"));
+ }
+ }
+
+ /**
+ * Tests that attempting to add the path to a jar that does not exist to the
+ * configuration throws an exception.
+ *
+ * @throws IOException
+ * If the added jar path does not exist. This exception is expected.
+ */
+ @Test(expected = IOException.class)
+ public void testAddJarThatDoesntExist() throws IOException {
+ DistCache.addJarToDistributedCache(testConf, "/garbage/doesntexist.jar");
+ }
+
+ /**
+ * Tests that adding a directory of jars to the configuration works as
+ * expected. .jar files under the added directory should be added to the
+ * configuration, and all other files should be skipped.
+ *
+ * @throws IOException
+ * If there is a problem adding the jar directory to the
+ * configuration.
+ */
+ @Test
+ public void testAddJarDirectory() throws IOException {
+ DistCache.addJarDirToDistributedCache(testConf, testFolder.getRoot().getCanonicalPath());
+ // Throw the added jar paths in a set to detect duplicates.
+ String[] splitJarPaths = StringUtils.split(testConf.get("tmpjars"), ",");
+ Set<String> addedJarPaths = new HashSet<String>();
+ for (String path : splitJarPaths) {
+ addedJarPaths.add(path);
+ }
+ assertEquals("Incorrect number of jar paths added.", testFilePaths.length, addedJarPaths.size());
+
+ // Ensure all expected paths were added.
+ for (int i = 0; i < testFileQualifiedPaths.length; i++) {
+ assertTrue("Expected jar path missing from jar paths added to tmpjars: " + testFileQualifiedPaths[i],
+ addedJarPaths.contains(testFileQualifiedPaths[i]));
+ }
+ }
+
+ /**
+ * Tests that adding a jar directory that does not exist to the configuration
+ * throws an exception.
+ *
+ * @throws IOException
+ * If the added jar directory does not exist. This exception is
+ * expected.
+ */
+ @Test(expected = IOException.class)
+ public void testAddJarDirectoryThatDoesntExist() throws IOException {
+ DistCache.addJarDirToDistributedCache(testConf, "/garbage/doesntexist");
+ }
+
+ /**
+ * Tests that adding a jar directory that is not a directory to the
+ * configuration throws an exception.
+ *
+ * @throws IOException
+ * If the added jar directory is not a directory. This exception is
+ * expected.
+ */
+ @Test(expected = IOException.class)
+ public void testAddJarDirectoryNotDirectory() throws IOException {
+ DistCache.addJarDirToDistributedCache(testConf, testFilePaths[0]);
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-dist/pom.xml
----------------------------------------------------------------------
diff --git a/crunch-dist/pom.xml b/crunch-dist/pom.xml
index 749a767..cdd4256 100644
--- a/crunch-dist/pom.xml
+++ b/crunch-dist/pom.xml
@@ -35,7 +35,7 @@ under the License.
<dependencies>
<dependency>
<groupId>org.apache.crunch</groupId>
- <artifactId>crunch</artifactId>
+ <artifactId>crunch-core</artifactId>
</dependency>
<dependency>
<groupId>org.apache.crunch</groupId>
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-examples/pom.xml
----------------------------------------------------------------------
diff --git a/crunch-examples/pom.xml b/crunch-examples/pom.xml
index fd790c3..fcbe30c 100644
--- a/crunch-examples/pom.xml
+++ b/crunch-examples/pom.xml
@@ -36,7 +36,7 @@ under the License.
<dependency>
<groupId>org.apache.crunch</groupId>
- <artifactId>crunch</artifactId>
+ <artifactId>crunch-core</artifactId>
</dependency>
<dependency>
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-hbase/pom.xml
----------------------------------------------------------------------
diff --git a/crunch-hbase/pom.xml b/crunch-hbase/pom.xml
index 656c6cc..df21ef8 100644
--- a/crunch-hbase/pom.xml
+++ b/crunch-hbase/pom.xml
@@ -31,7 +31,7 @@ under the License.
<dependencies>
<dependency>
<groupId>org.apache.crunch</groupId>
- <artifactId>crunch</artifactId>
+ <artifactId>crunch-core</artifactId>
</dependency>
<dependency>
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-scrunch/pom.xml
----------------------------------------------------------------------
diff --git a/crunch-scrunch/pom.xml b/crunch-scrunch/pom.xml
index 7db5ac7..b97766a 100644
--- a/crunch-scrunch/pom.xml
+++ b/crunch-scrunch/pom.xml
@@ -43,7 +43,7 @@ under the License.
</dependency>
<dependency>
<groupId>org.apache.crunch</groupId>
- <artifactId>crunch</artifactId>
+ <artifactId>crunch-core</artifactId>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/pom.xml
----------------------------------------------------------------------
diff --git a/crunch/pom.xml b/crunch/pom.xml
deleted file mode 100644
index 2a38913..0000000
--- a/crunch/pom.xml
+++ /dev/null
@@ -1,182 +0,0 @@
-<!--
-Licensed to the Apache Software Foundation (ASF) under one
-or more contributor license agreements. See the NOTICE file
-distributed with this work for additional information
-regarding copyright ownership. The ASF licenses this file
-to you under the Apache License, Version 2.0 (the
-"License"); you may not use this file except in compliance
-with the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing,
-software distributed under the License is distributed on an
-"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-KIND, either express or implied. See the License for the
-specific language governing permissions and limitations
-under the License.
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
-
- <modelVersion>4.0.0</modelVersion>
- <parent>
- <groupId>org.apache.crunch</groupId>
- <artifactId>crunch-parent</artifactId>
- <version>0.6.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>crunch</artifactId>
- <name>Apache Crunch Core</name>
-
- <dependencies>
- <dependency>
- <groupId>com.google.guava</groupId>
- <artifactId>guava</artifactId>
- </dependency>
-
- <dependency>
- <groupId>org.apache.avro</groupId>
- <artifactId>avro</artifactId>
- </dependency>
-
- <dependency>
- <groupId>org.apache.avro</groupId>
- <artifactId>avro-mapred</artifactId>
- </dependency>
-
- <dependency>
- <groupId>org.javassist</groupId>
- <artifactId>javassist</artifactId>
- </dependency>
-
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-client</artifactId>
- <scope>provided</scope>
- </dependency>
-
- <!-- Override the slf4j dependency from Avro, which is incompatible with
- Hadoop's. -->
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-api</artifactId>
- <scope>provided</scope>
- </dependency>
-
- <dependency>
- <groupId>commons-codec</groupId>
- <artifactId>commons-codec</artifactId>
- <scope>provided</scope>
- </dependency>
-
- <dependency>
- <groupId>org.codehaus.jackson</groupId>
- <artifactId>jackson-core-asl</artifactId>
- <scope>provided</scope>
- </dependency>
-
- <dependency>
- <groupId>org.codehaus.jackson</groupId>
- <artifactId>jackson-mapper-asl</artifactId>
- <scope>provided</scope>
- </dependency>
-
- <!-- Both Protobufs and Thrift are supported as
- derived serialization types, and you can use
- (almost) any version of them you like, Crunch
- only relies on the stable public APIs, not the
- structure of the files themselves.
-
- Both dependencies are scoped as provided, in
- order to not expand the size of the assembly jars
- unnecessarily.
- -->
-
- <dependency>
- <groupId>com.google.protobuf</groupId>
- <artifactId>protobuf-java</artifactId>
- <scope>provided</scope>
- </dependency>
-
- <dependency>
- <groupId>org.apache.thrift</groupId>
- <artifactId>libthrift</artifactId>
- <scope>provided</scope>
- </dependency>
-
- <dependency>
- <groupId>commons-logging</groupId>
- <artifactId>commons-logging</artifactId>
- <scope>provided</scope>
- </dependency>
-
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-log4j12</artifactId>
- <scope>provided</scope>
- </dependency>
-
- <!-- Used by LocalJobRunner in integration tests -->
- <dependency>
- <groupId>commons-httpclient</groupId>
- <artifactId>commons-httpclient</artifactId>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>org.apache.crunch</groupId>
- <artifactId>crunch-test</artifactId>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>org.mockito</groupId>
- <artifactId>mockito-all</artifactId>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>org.hamcrest</groupId>
- <artifactId>hamcrest-all</artifactId>
- <scope>test</scope>
- </dependency>
-
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.codehaus.mojo</groupId>
- <artifactId>build-helper-maven-plugin</artifactId>
- </plugin>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-failsafe-plugin</artifactId>
- </plugin>
- <plugin>
- <groupId>org.apache.avro</groupId>
- <artifactId>avro-maven-plugin</artifactId>
- <executions>
- <execution>
- <id>schemas</id>
- <phase>generate-sources</phase>
- <goals>
- <goal>schema</goal>
- </goals>
- <configuration>
- <testSourceDirectory>${project.basedir}/src/test/avro/</testSourceDirectory>
- <testOutputDirectory>target/generated-test-sources/</testOutputDirectory>
- </configuration>
- </execution>
- </executions>
- </plugin>
- </plugins>
- </build>
-
-</project>
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/CancelJobsIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/CancelJobsIT.java b/crunch/src/it/java/org/apache/crunch/CancelJobsIT.java
deleted file mode 100644
index ff01a2f..0000000
--- a/crunch/src/it/java/org/apache/crunch/CancelJobsIT.java
+++ /dev/null
@@ -1,84 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import java.io.IOException;
-
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.io.To;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.junit.Rule;
-import org.junit.Test;
-
-/**
- *
- */
-public class CancelJobsIT {
-
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
-
- @Test
- public void testRun() throws Exception {
- PipelineExecution pe = run();
- pe.waitUntilDone();
- PipelineResult pr = pe.getResult();
- assertEquals(PipelineExecution.Status.SUCCEEDED, pe.getStatus());
- assertEquals(2, pr.getStageResults().size());
- }
-
- @Test
- public void testKill() throws Exception {
- PipelineExecution pe = run();
- pe.kill();
- pe.waitUntilDone();
- assertEquals(PipelineExecution.Status.KILLED, pe.getStatus());
- }
-
- @Test
- public void testKillMultipleTimes() throws Exception {
- PipelineExecution pe = run();
- for (int i = 0; i < 10; i++) {
- pe.kill();
- }
- pe.waitUntilDone();
- assertEquals(PipelineExecution.Status.KILLED, pe.getStatus());
- }
-
- @Test
- public void testKillAfterDone() throws Exception {
- PipelineExecution pe = run();
- pe.waitUntilDone();
- assertEquals(PipelineExecution.Status.SUCCEEDED, pe.getStatus());
- pe.kill(); // expect no-op
- assertEquals(PipelineExecution.Status.SUCCEEDED, pe.getStatus());
- }
-
- public PipelineExecution run() throws IOException {
- String shakes = tmpDir.copyResourceFileName("shakes.txt");
- String out = tmpDir.getFileName("cancel");
- Pipeline p = new MRPipeline(CancelJobsIT.class, tmpDir.getDefaultConfiguration());
- PCollection<String> words = p.readTextFile(shakes);
- p.write(words.count().top(20), To.textFile(out));
- return p.runAsync(); // need to hack to slow down job start up if this test becomes flaky.
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/CleanTextIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/CleanTextIT.java b/crunch/src/it/java/org/apache/crunch/CleanTextIT.java
deleted file mode 100644
index 2f4004e..0000000
--- a/crunch/src/it/java/org/apache/crunch/CleanTextIT.java
+++ /dev/null
@@ -1,82 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.File;
-import java.nio.charset.Charset;
-import java.util.List;
-
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.io.To;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.avro.Avros;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.io.Files;
-
-/**
- *
- */
-public class CleanTextIT {
-
- private static final int LINES_IN_SHAKES = 3667;
-
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
-
- static DoFn<String, String> CLEANER = new DoFn<String, String>() {
- @Override
- public void process(String input, Emitter<String> emitter) {
- emitter.emit(input.toLowerCase());
- }
- };
-
- static DoFn<String, String> SPLIT = new DoFn<String, String>() {
- @Override
- public void process(String input, Emitter<String> emitter) {
- for (String word : input.split("\\S+")) {
- if (!word.isEmpty()) {
- emitter.emit(word);
- }
- }
- }
- };
-
- @Test
- public void testMapSideOutputs() throws Exception {
- Pipeline pipeline = new MRPipeline(CleanTextIT.class, tmpDir.getDefaultConfiguration());
- String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
- PCollection<String> shakespeare = pipeline.readTextFile(shakesInputPath);
-
- PCollection<String> cleanShakes = shakespeare.parallelDo(CLEANER, Avros.strings());
- File cso = tmpDir.getFile("cleanShakes");
- cleanShakes.write(To.textFile(cso.getAbsolutePath()));
-
- File wc = tmpDir.getFile("wordCounts");
- cleanShakes.parallelDo(SPLIT, Avros.strings()).count().write(To.textFile(wc.getAbsolutePath()));
- pipeline.done();
-
- File cleanFile = new File(cso, "part-m-00000");
- List<String> lines = Files.readLines(cleanFile, Charset.defaultCharset());
- assertEquals(LINES_IN_SHAKES, lines.size());
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/CollectionPObjectIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/CollectionPObjectIT.java b/crunch/src/it/java/org/apache/crunch/CollectionPObjectIT.java
deleted file mode 100644
index 7e0c75c..0000000
--- a/crunch/src/it/java/org/apache/crunch/CollectionPObjectIT.java
+++ /dev/null
@@ -1,98 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.IOException;
-import java.lang.String;
-import java.util.Collection;
-
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PObject;
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.impl.mem.MemPipeline;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.materialize.pobject.CollectionPObject;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.junit.Rule;
-import org.junit.Test;
-
-@SuppressWarnings("serial")
-public class CollectionPObjectIT {
-
- private static final int LINES_IN_SHAKES = 3667;
-
- private static final String FIRST_SHAKESPEARE_LINE =
- "***The Project Gutenberg's Etext of Shakespeare's First Folio***";
-
- private static final String LAST_SHAKESPEARE_LINE =
- "FINIS. THE TRAGEDIE OF MACBETH.";
-
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
-
- @Test
- public void testPObjectMRPipeline() throws IOException {
- runPObject(new MRPipeline(CollectionPObjectIT.class, tmpDir.getDefaultConfiguration()));
- }
-
- @Test
- public void testAsCollectionMRPipeline() throws IOException {
- runAsCollection(new MRPipeline(CollectionPObjectIT.class, tmpDir.getDefaultConfiguration()));
- }
-
- @Test
- public void testPObjectMemPipeline() throws IOException {
- runPObject(MemPipeline.getInstance());
- }
-
- @Test
- public void testAsCollectionMemPipeline() throws IOException {
- runAsCollection(MemPipeline.getInstance());
- }
-
- private PCollection<String> getPCollection(Pipeline pipeline) throws IOException {
- String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
- PCollection<String> shakespeare = pipeline.readTextFile(shakesInputPath);
- return shakespeare;
- }
-
- private void verifyLines(String[] lines) {
- assertEquals("Not enough lines in Shakespeare.", LINES_IN_SHAKES, lines.length);
- assertEquals("First line in Shakespeare is wrong.", FIRST_SHAKESPEARE_LINE, lines[0]);
- assertEquals("Last line in Shakespeare is wrong.", LAST_SHAKESPEARE_LINE,
- lines[lines.length - 1]);
- }
-
- public void runPObject(Pipeline pipeline) throws IOException {
- PCollection<String> shakespeare = getPCollection(pipeline);
- PObject<Collection<String>> linesP = new CollectionPObject<String>(shakespeare);
- String[] lines = new String[LINES_IN_SHAKES];
- lines = linesP.getValue().toArray(lines);
- verifyLines(lines);
- }
-
- public void runAsCollection(Pipeline pipeline) throws IOException {
- PCollection<String> shakespeare = getPCollection(pipeline);
- String[] lines = new String[LINES_IN_SHAKES];
- lines = shakespeare.asCollection().getValue().toArray(lines);
- verifyLines(lines);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/CollectionsIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/CollectionsIT.java b/crunch/src/it/java/org/apache/crunch/CollectionsIT.java
deleted file mode 100644
index 17d0cae..0000000
--- a/crunch/src/it/java/org/apache/crunch/CollectionsIT.java
+++ /dev/null
@@ -1,117 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.junit.Assert.assertTrue;
-
-import java.io.IOException;
-import java.util.Collection;
-
-import org.apache.crunch.fn.Aggregators.SimpleAggregator;
-import org.apache.crunch.impl.mem.MemPipeline;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.crunch.types.avro.AvroTypeFamily;
-import org.apache.crunch.types.writable.WritableTypeFamily;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Lists;
-
-@SuppressWarnings("serial")
-public class CollectionsIT {
-
- private static class AggregateStringListFn extends SimpleAggregator<Collection<String>> {
- private final Collection<String> rtn = Lists.newArrayList();
-
- @Override
- public void reset() {
- rtn.clear();
- }
-
- @Override
- public void update(Collection<String> values) {
- rtn.addAll(values);
- }
-
- @Override
- public Iterable<Collection<String>> results() {
- return ImmutableList.of(rtn);
- }
- }
-
- private static PTable<String, Collection<String>> listOfCharcters(PCollection<String> lines, PTypeFamily typeFamily) {
-
- return lines.parallelDo(new DoFn<String, Pair<String, Collection<String>>>() {
- @Override
- public void process(String line, Emitter<Pair<String, Collection<String>>> emitter) {
- for (String word : line.split("\\s+")) {
- Collection<String> characters = Lists.newArrayList();
- for (char c : word.toCharArray()) {
- characters.add(String.valueOf(c));
- }
- emitter.emit(Pair.of(word, characters));
- }
- }
- }, typeFamily.tableOf(typeFamily.strings(), typeFamily.collections(typeFamily.strings())))
- .groupByKey().combineValues(new AggregateStringListFn());
- }
-
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
-
- @Test
- public void testWritables() throws IOException {
- run(new MRPipeline(CollectionsIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance());
- }
-
- @Test
- public void testAvro() throws IOException {
- run(new MRPipeline(CollectionsIT.class, tmpDir.getDefaultConfiguration()), AvroTypeFamily.getInstance());
- }
-
- @Test
- public void testInMemoryWritables() throws IOException {
- run(MemPipeline.getInstance(), WritableTypeFamily.getInstance());
- }
-
- @Test
- public void testInMemoryAvro() throws IOException {
- run(MemPipeline.getInstance(), AvroTypeFamily.getInstance());
- }
-
- public void run(Pipeline pipeline, PTypeFamily typeFamily) throws IOException {
- String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
-
- PCollection<String> shakespeare = pipeline.readTextFile(shakesInputPath);
- Iterable<Pair<String, Collection<String>>> lines = listOfCharcters(shakespeare, typeFamily).materialize();
-
- boolean passed = false;
- for (Pair<String, Collection<String>> line : lines) {
- if (line.first().startsWith("yellow")) {
- passed = true;
- break;
- }
- }
- pipeline.done();
- assertTrue(passed);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/CollectionsLengthIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/CollectionsLengthIT.java b/crunch/src/it/java/org/apache/crunch/CollectionsLengthIT.java
deleted file mode 100644
index 3a38b92..0000000
--- a/crunch/src/it/java/org/apache/crunch/CollectionsLengthIT.java
+++ /dev/null
@@ -1,70 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.IOException;
-import java.lang.Long;
-
-import org.apache.crunch.impl.mem.MemPipeline;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.crunch.types.avro.AvroTypeFamily;
-import org.apache.crunch.types.writable.WritableTypeFamily;
-import org.junit.Rule;
-import org.junit.Test;
-
-@SuppressWarnings("serial")
-public class CollectionsLengthIT {
-
- public static final Long LINES_IN_SHAKESPEARE = 3667L;
-
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
-
- @Test
- public void testWritables() throws IOException {
- run(new MRPipeline(CollectionsIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance());
- }
-
- @Test
- public void testAvro() throws IOException {
- run(new MRPipeline(CollectionsIT.class, tmpDir.getDefaultConfiguration()), AvroTypeFamily.getInstance());
- }
-
- @Test
- public void testInMemoryWritables() throws IOException {
- run(MemPipeline.getInstance(), WritableTypeFamily.getInstance());
- }
-
- @Test
- public void testInMemoryAvro() throws IOException {
- run(MemPipeline.getInstance(), AvroTypeFamily.getInstance());
- }
-
- public void run(Pipeline pipeline, PTypeFamily typeFamily) throws IOException {
- String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
-
- PCollection<String> shakespeare = pipeline.readTextFile(shakesInputPath);
- Long length = shakespeare.length().getValue();
- assertEquals("Incorrect length for shakespear PCollection.", LINES_IN_SHAKESPEARE, length);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/DeepCopyCustomTuplesIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/DeepCopyCustomTuplesIT.java b/crunch/src/it/java/org/apache/crunch/DeepCopyCustomTuplesIT.java
deleted file mode 100644
index f1323ca..0000000
--- a/crunch/src/it/java/org/apache/crunch/DeepCopyCustomTuplesIT.java
+++ /dev/null
@@ -1,79 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.apache.crunch.types.avro.Avros.*;
-import static org.junit.Assert.assertEquals;
-
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.PType;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.Iterables;
-
-/**
- *
- */
-public class DeepCopyCustomTuplesIT {
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
-
- public static class PID extends Pair<Integer, String> {
- public PID(Integer first, String second) {
- super(first, second);
- }
- }
-
- private static PType<PID> pids = tuples(PID.class, ints(), strings());
-
- @Test
- public void testDeepCopyCustomTuple() throws Exception {
- Pipeline p = new MRPipeline(DeepCopyCustomTuplesIT.class, tmpDir.getDefaultConfiguration());
- String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
- PCollection<String> shakes = p.readTextFile(shakesInputPath);
- Iterable<String> out = shakes
- .parallelDo(new PreProcFn(), tableOf(ints(), pairs(ints(), pids)))
- .groupByKey()
- .parallelDo(new PostProcFn(), strings())
- .materialize();
- assertEquals(65, Iterables.size(out));
- p.done();
- }
-
- private static class PreProcFn extends MapFn<String, Pair<Integer, Pair<Integer, PID>>> {
- private int counter = 0;
- @Override
- public Pair<Integer, Pair<Integer, PID>> map(String input) {
- return Pair.of(counter++, Pair.of(counter++, new PID(input.length(), input)));
- }
- };
-
- private static class PostProcFn extends DoFn<Pair<Integer, Iterable<Pair<Integer, PID>>>, String> {
- @Override
- public void process(Pair<Integer, Iterable<Pair<Integer, PID>>> input, Emitter<String> emitter) {
- for (Pair<Integer, PID> p : input.second()) {
- if (p.second().first() > 0 && p.second().first() < 10) {
- emitter.emit(p.second().second());
- }
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/EnumPairIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/EnumPairIT.java b/crunch/src/it/java/org/apache/crunch/EnumPairIT.java
deleted file mode 100644
index 1d0974e..0000000
--- a/crunch/src/it/java/org/apache/crunch/EnumPairIT.java
+++ /dev/null
@@ -1,59 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.IOException;
-import java.io.Serializable;
-
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.PTypes;
-import org.apache.crunch.types.writable.Writables;
-import org.junit.Rule;
-import org.junit.Test;
-
-public class EnumPairIT implements Serializable {
- @Rule
- public transient TemporaryPath tmpDir = TemporaryPaths.create();
-
- static enum etypes {
- type1,
- }
-
- @Test
- public void testEnumPTypes() throws IOException {
- String inputFile1 = tmpDir.copyResourceFileName("set1.txt");
- Pipeline pipeline = new MRPipeline(EnumPairIT.class);
- PCollection<String> set1 = pipeline.readTextFile(inputFile1);
- PTable<String, etypes> data = set1.parallelDo(new DoFn<String, Pair<String, etypes>>() {
- @Override
- public void process(String input, Emitter<Pair<String, etypes>> emitter) {
- emitter.emit(new Pair<String, etypes>(input, etypes.type1));
- }
- }, Writables.tableOf(Writables.strings(), PTypes.enums(etypes.class, set1.getTypeFamily())));
-
- Iterable<Pair<String, etypes>> materialized = data.materialize();
- pipeline.run();
- for (Pair<String, etypes> pair : materialized) {
- assertEquals(etypes.type1, pair.second());
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/FirstElementPObjectIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/FirstElementPObjectIT.java b/crunch/src/it/java/org/apache/crunch/FirstElementPObjectIT.java
deleted file mode 100644
index d985e10..0000000
--- a/crunch/src/it/java/org/apache/crunch/FirstElementPObjectIT.java
+++ /dev/null
@@ -1,61 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.IOException;
-import java.lang.String;
-
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PObject;
-import org.apache.crunch.impl.mem.MemPipeline;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.materialize.pobject.FirstElementPObject;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.junit.Rule;
-import org.junit.Test;
-
-@SuppressWarnings("serial")
-public class FirstElementPObjectIT {
-
- private static final String FIRST_SHAKESPEARE_LINE =
- "***The Project Gutenberg's Etext of Shakespeare's First Folio***";
-
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
-
- @Test
- public void testMRPipeline() throws IOException {
- run(new MRPipeline(FirstElementPObjectIT.class, tmpDir.getDefaultConfiguration()));
- }
-
- @Test
- public void testInMemoryPipeline() throws IOException {
- run(MemPipeline.getInstance());
- }
-
- public void run(Pipeline pipeline) throws IOException {
- String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
- PCollection<String> shakespeare = pipeline.readTextFile(shakesInputPath);
- PObject<String> firstLine = new FirstElementPObject<String>(shakespeare);
- String first = firstLine.getValue();
- assertEquals("First line in Shakespeare is wrong.", FIRST_SHAKESPEARE_LINE, first);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/IterableReuseProtectionIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/IterableReuseProtectionIT.java b/crunch/src/it/java/org/apache/crunch/IterableReuseProtectionIT.java
deleted file mode 100644
index da487eb..0000000
--- a/crunch/src/it/java/org/apache/crunch/IterableReuseProtectionIT.java
+++ /dev/null
@@ -1,89 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.IOException;
-import java.util.Collections;
-import java.util.List;
-
-import org.apache.crunch.fn.IdentityFn;
-import org.apache.crunch.impl.mem.MemPipeline;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.writable.Writables;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-
-/**
- * Verify that calling the iterator method on a Reducer-based Iterable
- * is forcefully disallowed.
- */
-public class IterableReuseProtectionIT {
-
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
-
-
- public void checkIteratorReuse(Pipeline pipeline) throws IOException {
- Iterable<String> values = pipeline.readTextFile(tmpDir.copyResourceFileName("set1.txt"))
- .by(IdentityFn.<String>getInstance(), Writables.strings())
- .groupByKey()
- .combineValues(new TestIterableReuseFn())
- .values().materialize();
-
- List<String> valueList = Lists.newArrayList(values);
- Collections.sort(valueList);
- assertEquals(Lists.newArrayList("a", "b", "c", "e"), valueList);
- }
-
- @Test
- public void testIteratorReuse_MRPipeline() throws IOException {
- checkIteratorReuse(new MRPipeline(IterableReuseProtectionIT.class, tmpDir.getDefaultConfiguration()));
- }
-
- @Test
- public void testIteratorReuse_InMemoryPipeline() throws IOException {
- checkIteratorReuse(MemPipeline.getInstance());
- }
-
- static class TestIterableReuseFn extends CombineFn<String, String> {
-
- @Override
- public void process(Pair<String, Iterable<String>> input, Emitter<Pair<String, String>> emitter) {
- StringBuilder combinedBuilder = new StringBuilder();
- for (String v : input.second()) {
- combinedBuilder.append(v);
- }
-
- try {
- input.second().iterator();
- throw new RuntimeException("Second call to iterator should throw an exception");
- } catch (IllegalStateException e) {
- // Expected situation
- }
- emitter.emit(Pair.of(input.first(), combinedBuilder.toString()));
- }
-
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/MRPipelineIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/MRPipelineIT.java b/crunch/src/it/java/org/apache/crunch/MRPipelineIT.java
deleted file mode 100644
index 7670e88..0000000
--- a/crunch/src/it/java/org/apache/crunch/MRPipelineIT.java
+++ /dev/null
@@ -1,78 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.junit.Assert.assertTrue;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.Serializable;
-
-import org.apache.crunch.fn.FilterFns;
-import org.apache.crunch.fn.IdentityFn;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.io.To;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.writable.Writables;
-import org.junit.Rule;
-import org.junit.Test;
-
-public class MRPipelineIT implements Serializable {
- @Rule
- public transient TemporaryPath tmpDir = TemporaryPaths.create();
-
- @Test
- public void materializedColShouldBeWritten() throws Exception {
- File textFile = tmpDir.copyResourceFile("shakes.txt");
- Pipeline pipeline = new MRPipeline(MRPipelineIT.class, tmpDir.getDefaultConfiguration());
- PCollection<String> genericCollection = pipeline.readTextFile(textFile.getAbsolutePath());
- pipeline.run();
- PCollection<String> filter = genericCollection.filter("Filtering data", FilterFns.<String>ACCEPT_ALL());
- filter.materialize();
- pipeline.run();
- File file = tmpDir.getFile("output.txt");
- Target outFile = To.textFile(file.getAbsolutePath());
- PCollection<String> write = filter.write(outFile);
- write.materialize();
- pipeline.run();
- }
-
-
-
- @Test
- public void testPGroupedTableToMultipleOutputs() throws IOException{
- Pipeline pipeline = new MRPipeline(MRPipelineIT.class, tmpDir.getDefaultConfiguration());
- PGroupedTable<String, String> groupedLineTable = pipeline.readTextFile(tmpDir.copyResourceFileName("set1.txt")).by(IdentityFn.<String>getInstance(), Writables.strings()).groupByKey();
-
- PTable<String, String> ungroupedTableA = groupedLineTable.ungroup();
- PTable<String, String> ungroupedTableB = groupedLineTable.ungroup();
-
- File outputDirA = tmpDir.getFile("output_a");
- File outputDirB = tmpDir.getFile("output_b");
-
- pipeline.writeTextFile(ungroupedTableA, outputDirA.getAbsolutePath());
- pipeline.writeTextFile(ungroupedTableB, outputDirB.getAbsolutePath());
- pipeline.done();
-
- // Verify that output from a single PGroupedTable can be sent to multiple collections
- assertTrue(new File(outputDirA, "part-r-00000").exists());
- assertTrue(new File(outputDirB, "part-r-00000").exists());
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/MapPObjectIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/MapPObjectIT.java b/crunch/src/it/java/org/apache/crunch/MapPObjectIT.java
deleted file mode 100644
index c48284f..0000000
--- a/crunch/src/it/java/org/apache/crunch/MapPObjectIT.java
+++ /dev/null
@@ -1,101 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static junit.framework.Assert.assertEquals;
-
-import java.io.IOException;
-import java.util.Map;
-
-import org.apache.crunch.impl.mem.MemPipeline;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.materialize.pobject.MapPObject;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.PTypeFamily;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.ImmutableList;
-
-public class MapPObjectIT {
-
- static final ImmutableList<Pair<Integer, String>> kvPairs = ImmutableList.of(Pair.of(0, "a"), Pair.of(1, "b"),
- Pair.of(2, "c"), Pair.of(3, "e"));
-
- public void assertMatches(Map<Integer, String> m) {
- for (Integer k : m.keySet()) {
- assertEquals(kvPairs.get(k).second(), m.get(k));
- }
- }
-
- private static class Set1Mapper extends MapFn<String, Pair<Integer, String>> {
- @Override
- public Pair<Integer, String> map(String input) {
-
- int k = -1;
- if (input.equals("a"))
- k = 0;
- else if (input.equals("b"))
- k = 1;
- else if (input.equals("c"))
- k = 2;
- else if (input.equals("e"))
- k = 3;
- return Pair.of(k, input);
- }
- }
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
-
- @Test
- public void testMemMapPObject() {
- PTable<Integer, String> table = MemPipeline.tableOf(kvPairs);
- PObject<Map<Integer, String>> map = new MapPObject<Integer, String>(table);
- assertMatches(map.getValue());
- }
-
- @Test
- public void testMemAsMap() {
- PTable<Integer, String> table = MemPipeline.tableOf(kvPairs);
- assertMatches(table.asMap().getValue());
- }
-
- private PTable<Integer, String> getMRPTable() throws IOException {
- Pipeline p = new MRPipeline(MaterializeToMapIT.class, tmpDir.getDefaultConfiguration());
- String inputFile = tmpDir.copyResourceFileName("set1.txt");
- PCollection<String> c = p.readTextFile(inputFile);
- PTypeFamily tf = c.getTypeFamily();
- PTable<Integer, String> table = c.parallelDo(new Set1Mapper(), tf.tableOf(tf.ints(),
- tf.strings()));
- return table;
- }
-
- @Test
- public void testMRMapPObject() throws IOException {
- PTable<Integer, String> table = getMRPTable();
- PObject<Map<Integer, String>> map = new MapPObject<Integer, String>(table);
- assertMatches(map.getValue());
- }
-
- @Test
- public void testMRAsMap() throws IOException {
- PTable<Integer, String> table = getMRPTable();
- assertMatches(table.asMap().getValue());
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/MapsIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/MapsIT.java b/crunch/src/it/java/org/apache/crunch/MapsIT.java
deleted file mode 100644
index 5b3187b..0000000
--- a/crunch/src/it/java/org/apache/crunch/MapsIT.java
+++ /dev/null
@@ -1,101 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.hamcrest.Matchers.is;
-import static org.junit.Assert.assertThat;
-
-import java.util.Map;
-
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.crunch.types.avro.AvroTypeFamily;
-import org.apache.crunch.types.writable.WritableTypeFamily;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.ImmutableMap;
-import com.google.common.collect.Maps;
-
-public class MapsIT {
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
-
- @Test
- public void testWritables() throws Exception {
- run(WritableTypeFamily.getInstance(), tmpDir);
- }
-
- @Test
- public void testAvros() throws Exception {
- run(AvroTypeFamily.getInstance(), tmpDir);
- }
-
- public static void run(PTypeFamily typeFamily, TemporaryPath tmpDir) throws Exception {
- Pipeline pipeline = new MRPipeline(MapsIT.class, tmpDir.getDefaultConfiguration());
- String shakesInputPath = tmpDir.copyResourceFileName("shakes.txt");
- PCollection<String> shakespeare = pipeline.readTextFile(shakesInputPath);
- Iterable<Pair<String, Map<String, Long>>> output = shakespeare
- .parallelDo(new DoFn<String, Pair<String, Map<String, Long>>>() {
- @Override
- public void process(String input, Emitter<Pair<String, Map<String, Long>>> emitter) {
- String last = null;
- for (String word : input.toLowerCase().split("\\W+")) {
- if (!word.isEmpty()) {
- String firstChar = word.substring(0, 1);
- if (last != null) {
- Map<String, Long> cc = ImmutableMap.of(firstChar, 1L);
- emitter.emit(Pair.of(last, cc));
- }
- last = firstChar;
- }
- }
- }
- }, typeFamily.tableOf(typeFamily.strings(), typeFamily.maps(typeFamily.longs()))).groupByKey()
- .combineValues(new CombineFn<String, Map<String, Long>>() {
- @Override
- public void process(Pair<String, Iterable<Map<String, Long>>> input,
- Emitter<Pair<String, Map<String, Long>>> emitter) {
- Map<String, Long> agg = Maps.newHashMap();
- for (Map<String, Long> in : input.second()) {
- for (Map.Entry<String, Long> e : in.entrySet()) {
- if (!agg.containsKey(e.getKey())) {
- agg.put(e.getKey(), e.getValue());
- } else {
- agg.put(e.getKey(), e.getValue() + agg.get(e.getKey()));
- }
- }
- }
- emitter.emit(Pair.of(input.first(), agg));
- }
- }).materialize();
-
- boolean passed = false;
- for (Pair<String, Map<String, Long>> v : output) {
- if (v.first().equals("k") && v.second().get("n") == 8L) {
- passed = true;
- break;
- }
- }
- pipeline.done();
-
- assertThat(passed, is(true));
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/MaterializeIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/MaterializeIT.java b/crunch/src/it/java/org/apache/crunch/MaterializeIT.java
deleted file mode 100644
index d064993..0000000
--- a/crunch/src/it/java/org/apache/crunch/MaterializeIT.java
+++ /dev/null
@@ -1,139 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static junit.framework.Assert.assertEquals;
-import static junit.framework.Assert.assertTrue;
-
-import java.io.IOException;
-import java.util.List;
-
-import org.apache.crunch.fn.FilterFns;
-import org.apache.crunch.impl.mem.MemPipeline;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.test.Person;
-import org.apache.crunch.test.StringWrapper;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.crunch.types.avro.AvroTypeFamily;
-import org.apache.crunch.types.avro.Avros;
-import org.apache.crunch.types.writable.WritableTypeFamily;
-import org.junit.Assume;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-
-public class MaterializeIT {
-
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
-
- @Test
- public void testMaterializeInput_Writables() throws IOException {
- runMaterializeInput(new MRPipeline(MaterializeIT.class, tmpDir.getDefaultConfiguration()),
- WritableTypeFamily.getInstance());
- }
-
- @Test
- public void testMaterializeInput_Avro() throws IOException {
- runMaterializeInput(new MRPipeline(MaterializeIT.class, tmpDir.getDefaultConfiguration()),
- AvroTypeFamily.getInstance());
- }
-
- @Test
- public void testMaterializeInput_InMemoryWritables() throws IOException {
- runMaterializeInput(MemPipeline.getInstance(), WritableTypeFamily.getInstance());
- }
-
- @Test
- public void testMaterializeInput_InMemoryAvro() throws IOException {
- runMaterializeInput(MemPipeline.getInstance(), AvroTypeFamily.getInstance());
- }
-
- @Test
- public void testMaterializeEmptyIntermediate_Writables() throws IOException {
- runMaterializeEmptyIntermediate(
- new MRPipeline(MaterializeIT.class, tmpDir.getDefaultConfiguration()),
- WritableTypeFamily.getInstance());
- }
-
- @Test
- public void testMaterializeEmptyIntermediate_Avro() throws IOException {
- runMaterializeEmptyIntermediate(
- new MRPipeline(MaterializeIT.class, tmpDir.getDefaultConfiguration()),
- AvroTypeFamily.getInstance());
- }
-
- @Test
- public void testMaterializeEmptyIntermediate_InMemoryWritables() throws IOException {
- runMaterializeEmptyIntermediate(MemPipeline.getInstance(), WritableTypeFamily.getInstance());
- }
-
- @Test
- public void testMaterializeEmptyIntermediate_InMemoryAvro() throws IOException {
- runMaterializeEmptyIntermediate(MemPipeline.getInstance(), AvroTypeFamily.getInstance());
- }
-
- public void runMaterializeInput(Pipeline pipeline, PTypeFamily typeFamily) throws IOException {
- List<String> expectedContent = Lists.newArrayList("b", "c", "a", "e");
- String inputPath = tmpDir.copyResourceFileName("set1.txt");
-
- PCollection<String> lines = pipeline.readTextFile(inputPath);
- assertEquals(expectedContent, Lists.newArrayList(lines.materialize()));
- pipeline.done();
- }
-
- public void runMaterializeEmptyIntermediate(Pipeline pipeline, PTypeFamily typeFamily)
- throws IOException {
- String inputPath = tmpDir.copyResourceFileName("set1.txt");
- PCollection<String> empty = pipeline.readTextFile(inputPath).filter(FilterFns.<String>REJECT_ALL());
-
- assertTrue(Lists.newArrayList(empty.materialize()).isEmpty());
- pipeline.done();
- }
-
- static class StringToStringWrapperPersonPairMapFn extends MapFn<String, Pair<StringWrapper, Person>> {
-
- @Override
- public Pair<StringWrapper, Person> map(String input) {
- Person person = new Person();
- person.name = input;
- person.age = 42;
- person.siblingnames = Lists.<CharSequence> newArrayList();
- return Pair.of(new StringWrapper(input), person);
- }
-
- }
-
- @Test
- public void testMaterializeAvroPersonAndReflectsPair_GroupedTable() throws IOException {
- Assume.assumeTrue(Avros.CAN_COMBINE_SPECIFIC_AND_REFLECT_SCHEMAS);
- Pipeline pipeline = new MRPipeline(MaterializeIT.class);
- List<Pair<StringWrapper, Person>> pairList = Lists.newArrayList(pipeline
- .readTextFile(tmpDir.copyResourceFileName("set1.txt"))
- .parallelDo(new StringToStringWrapperPersonPairMapFn(),
- Avros.pairs(Avros.reflects(StringWrapper.class), Avros.records(Person.class)))
- .materialize());
-
- // We just need to make sure this doesn't crash
- assertEquals(4, pairList.size());
-
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/MaterializeToMapIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/MaterializeToMapIT.java b/crunch/src/it/java/org/apache/crunch/MaterializeToMapIT.java
deleted file mode 100644
index 7fef30e..0000000
--- a/crunch/src/it/java/org/apache/crunch/MaterializeToMapIT.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static junit.framework.Assert.assertEquals;
-
-import java.io.IOException;
-import java.util.Map;
-
-import org.apache.crunch.impl.mem.MemPipeline;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.PTypeFamily;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.ImmutableList;
-
-public class MaterializeToMapIT {
-
- static final ImmutableList<Pair<Integer, String>> kvPairs = ImmutableList.of(Pair.of(0, "a"), Pair.of(1, "b"),
- Pair.of(2, "c"), Pair.of(3, "e"));
-
- public void assertMatches(Map<Integer, String> m) {
- for (Integer k : m.keySet()) {
- assertEquals(kvPairs.get(k).second(), m.get(k));
- }
- }
-
- @Test
- public void testMemMaterializeToMap() {
- assertMatches(MemPipeline.tableOf(kvPairs).materializeToMap());
- }
-
- private static class Set1Mapper extends MapFn<String, Pair<Integer, String>> {
- @Override
- public Pair<Integer, String> map(String input) {
-
- int k = -1;
- if (input.equals("a"))
- k = 0;
- else if (input.equals("b"))
- k = 1;
- else if (input.equals("c"))
- k = 2;
- else if (input.equals("e"))
- k = 3;
- return Pair.of(k, input);
- }
- }
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
-
- @Test
- public void testMRMaterializeToMap() throws IOException {
- Pipeline p = new MRPipeline(MaterializeToMapIT.class, tmpDir.getDefaultConfiguration());
- String inputFile = tmpDir.copyResourceFileName("set1.txt");
- PCollection<String> c = p.readTextFile(inputFile);
- PTypeFamily tf = c.getTypeFamily();
- PTable<Integer, String> t = c.parallelDo(new Set1Mapper(), tf.tableOf(tf.ints(), tf.strings()));
- Map<Integer, String> m = t.materializeToMap();
- assertMatches(m);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/MultipleOutputIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/MultipleOutputIT.java b/crunch/src/it/java/org/apache/crunch/MultipleOutputIT.java
deleted file mode 100644
index 1a85b6a..0000000
--- a/crunch/src/it/java/org/apache/crunch/MultipleOutputIT.java
+++ /dev/null
@@ -1,175 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.File;
-import java.io.IOException;
-import java.nio.charset.Charset;
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.io.At;
-import org.apache.crunch.test.StringWrapper;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.crunch.types.avro.AvroTypeFamily;
-import org.apache.crunch.types.avro.Avros;
-import org.apache.crunch.types.writable.WritableTypeFamily;
-import org.apache.crunch.types.writable.Writables;
-import org.junit.Rule;
-import org.junit.Test;
-
-import com.google.common.collect.Lists;
-import com.google.common.io.Files;
-
-public class MultipleOutputIT {
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
-
- public static PCollection<String> evenCountLetters(PCollection<String> words, PTypeFamily typeFamily) {
- return words.parallelDo("even", new FilterFn<String>() {
-
- @Override
- public boolean accept(String input) {
- return input.length() % 2 == 0;
- }
- }, typeFamily.strings());
- }
-
- public static PCollection<String> oddCountLetters(PCollection<String> words, PTypeFamily typeFamily) {
- return words.parallelDo("odd", new FilterFn<String>() {
-
- @Override
- public boolean accept(String input) {
- return input.length() % 2 != 0;
- }
- }, typeFamily.strings());
-
- }
-
- public static PTable<String, Long> substr(PTable<String, Long> ptable) {
- return ptable.parallelDo(new DoFn<Pair<String, Long>, Pair<String, Long>>() {
- public void process(Pair<String, Long> input, Emitter<Pair<String, Long>> emitter) {
- if (input.first().length() > 0) {
- emitter.emit(Pair.of(input.first().substring(0, 1), input.second()));
- }
- }
- }, ptable.getPTableType());
- }
-
- @Test
- public void testWritables() throws IOException {
- run(new MRPipeline(MultipleOutputIT.class, tmpDir.getDefaultConfiguration()), WritableTypeFamily.getInstance());
- }
-
- @Test
- public void testAvro() throws IOException {
- run(new MRPipeline(MultipleOutputIT.class, tmpDir.getDefaultConfiguration()), AvroTypeFamily.getInstance());
- }
-
- @Test
- public void testParallelDosFused() throws IOException {
-
- PipelineResult result = run(new MRPipeline(MultipleOutputIT.class, tmpDir.getDefaultConfiguration()),
- WritableTypeFamily.getInstance());
-
- // Ensure our multiple outputs were fused into a single job.
- assertEquals("parallel Dos not fused into a single job", 1, result.getStageResults().size());
- }
-
- public PipelineResult run(Pipeline pipeline, PTypeFamily typeFamily) throws IOException {
- String inputPath = tmpDir.copyResourceFileName("letters.txt");
- String outputPathEven = tmpDir.getFileName("even");
- String outputPathOdd = tmpDir.getFileName("odd");
-
- PCollection<String> words = pipeline.read(At.textFile(inputPath, typeFamily.strings()));
-
- PCollection<String> evenCountWords = evenCountLetters(words, typeFamily);
- PCollection<String> oddCountWords = oddCountLetters(words, typeFamily);
- pipeline.writeTextFile(evenCountWords, outputPathEven);
- pipeline.writeTextFile(oddCountWords, outputPathOdd);
-
- PipelineResult result = pipeline.done();
-
- checkFileContents(outputPathEven, Arrays.asList("bb"));
- checkFileContents(outputPathOdd, Arrays.asList("a"));
-
- return result;
- }
-
- /**
- * Mutates the state of an input and then emits the mutated object.
- */
- static class AppendFn extends DoFn<StringWrapper, StringWrapper> {
-
- private String value;
-
- public AppendFn(String value) {
- this.value = value;
- }
-
- @Override
- public void process(StringWrapper input, Emitter<StringWrapper> emitter) {
- input.setValue(input.getValue() + value);
- emitter.emit(input);
- }
-
- }
-
- /**
- * Fusing multiple pipelines has a risk of running into object reuse bugs.
- * This test verifies that mutating the state of an object that is passed
- * through multiple streams of a pipeline doesn't allow one stream to affect
- * another.
- */
- @Test
- public void testFusedMappersObjectReuseBug() throws IOException {
- Pipeline pipeline = new MRPipeline(MultipleOutputIT.class, tmpDir.getDefaultConfiguration());
- PCollection<StringWrapper> stringWrappers = pipeline.readTextFile(tmpDir.copyResourceFileName("set2.txt"))
- .parallelDo(new StringWrapper.StringToStringWrapperMapFn(), Avros.reflects(StringWrapper.class));
-
- PCollection<String> stringsA = stringWrappers.parallelDo(new AppendFn("A"), stringWrappers.getPType())
- .parallelDo(new StringWrapper.StringWrapperToStringMapFn(), Writables.strings());
- PCollection<String> stringsB = stringWrappers.parallelDo(new AppendFn("B"), stringWrappers.getPType())
- .parallelDo(new StringWrapper.StringWrapperToStringMapFn(), Writables.strings());
-
- String outputA = tmpDir.getFileName("stringsA");
- String outputB = tmpDir.getFileName("stringsB");
-
- pipeline.writeTextFile(stringsA, outputA);
- pipeline.writeTextFile(stringsB, outputB);
- PipelineResult pipelineResult = pipeline.done();
-
- // Make sure fusing did actually occur
- assertEquals(1, pipelineResult.getStageResults().size());
-
- checkFileContents(outputA, Lists.newArrayList("cA", "dA", "aA"));
- checkFileContents(outputB, Lists.newArrayList("cB", "dB", "aB"));
-
- }
-
- private void checkFileContents(String filePath, List<String> expected) throws IOException {
- File outputFile = new File(filePath, "part-m-00000");
- List<String> lines = Files.readLines(outputFile, Charset.defaultCharset());
- assertEquals(expected, lines);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/java/org/apache/crunch/PCollectionGetSizeIT.java
----------------------------------------------------------------------
diff --git a/crunch/src/it/java/org/apache/crunch/PCollectionGetSizeIT.java b/crunch/src/it/java/org/apache/crunch/PCollectionGetSizeIT.java
deleted file mode 100644
index 44eb897..0000000
--- a/crunch/src/it/java/org/apache/crunch/PCollectionGetSizeIT.java
+++ /dev/null
@@ -1,151 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import static com.google.common.collect.Lists.newArrayList;
-import static org.apache.crunch.io.At.sequenceFile;
-import static org.apache.crunch.io.At.textFile;
-import static org.apache.crunch.types.writable.Writables.strings;
-import static org.hamcrest.Matchers.is;
-import static org.junit.Assert.assertThat;
-
-import java.io.IOException;
-
-import org.apache.crunch.fn.FilterFns;
-import org.apache.crunch.impl.mem.MemPipeline;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.test.TemporaryPath;
-import org.apache.crunch.test.TemporaryPaths;
-import org.junit.Before;
-import org.junit.Ignore;
-import org.junit.Rule;
-import org.junit.Test;
-
-public class PCollectionGetSizeIT {
- @Rule
- public TemporaryPath tmpDir = TemporaryPaths.create();
-
- private String emptyInputPath;
- private String nonEmptyInputPath;
- private String outputPath;
-
- @Before
- public void setUp() throws IOException {
- emptyInputPath = tmpDir.copyResourceFileName("emptyTextFile.txt");
- nonEmptyInputPath = tmpDir.copyResourceFileName("set1.txt");
- outputPath = tmpDir.getFileName("output");
- }
-
- @Test
- public void testGetSizeOfEmptyInput_MRPipeline() throws IOException {
- testCollectionGetSizeOfEmptyInput(new MRPipeline(this.getClass(), tmpDir.getDefaultConfiguration()));
- }
-
- @Test
- public void testGetSizeOfEmptyInput_MemPipeline() throws IOException {
- testCollectionGetSizeOfEmptyInput(MemPipeline.getInstance());
- }
-
- private void testCollectionGetSizeOfEmptyInput(Pipeline pipeline) throws IOException {
-
- assertThat(pipeline.read(textFile(emptyInputPath)).getSize(), is(0L));
- }
-
- @Test
- public void testMaterializeEmptyInput_MRPipeline() throws IOException {
- testMaterializeEmptyInput(new MRPipeline(this.getClass(), tmpDir.getDefaultConfiguration()));
- }
-
- @Test
- public void testMaterializeEmptyImput_MemPipeline() throws IOException {
- testMaterializeEmptyInput(MemPipeline.getInstance());
- }
-
- private void testMaterializeEmptyInput(Pipeline pipeline) throws IOException {
- assertThat(newArrayList(pipeline.readTextFile(emptyInputPath).materialize().iterator()).size(), is(0));
- }
-
- @Test
- public void testGetSizeOfEmptyIntermediatePCollection_MRPipeline() throws IOException {
-
- PCollection<String> emptyIntermediate = createPesistentEmptyIntermediate(
- new MRPipeline(this.getClass(), tmpDir.getDefaultConfiguration()));
-
- assertThat(emptyIntermediate.getSize(), is(0L));
- }
-
- @Test
- @Ignore("GetSize of a DoCollection is only an estimate based on scale factor, so we can't count on it being reported as 0")
- public void testGetSizeOfEmptyIntermediatePCollection_NoSave_MRPipeline() throws IOException {
-
- PCollection<String> data = new MRPipeline(this.getClass(), tmpDir.getDefaultConfiguration())
- .readTextFile(nonEmptyInputPath);
-
- PCollection<String> emptyPCollection = data.filter(FilterFns.<String>REJECT_ALL());
-
- assertThat(emptyPCollection.getSize(), is(0L));
- }
-
- @Test
- public void testGetSizeOfEmptyIntermediatePCollection_MemPipeline() {
-
- PCollection<String> emptyIntermediate = createPesistentEmptyIntermediate(MemPipeline.getInstance());
-
- assertThat(emptyIntermediate.getSize(), is(0L));
- }
-
- @Test
- public void testMaterializeOfEmptyIntermediatePCollection_MRPipeline() throws IOException {
-
- PCollection<String> emptyIntermediate = createPesistentEmptyIntermediate(
- new MRPipeline(this.getClass(), tmpDir.getDefaultConfiguration()));
-
- assertThat(newArrayList(emptyIntermediate.materialize()).size(), is(0));
- }
-
- @Test
- public void testMaterializeOfEmptyIntermediatePCollection_MemPipeline() {
-
- PCollection<String> emptyIntermediate = createPesistentEmptyIntermediate(MemPipeline.getInstance());
-
- assertThat(newArrayList(emptyIntermediate.materialize()).size(), is(0));
- }
-
- private PCollection<String> createPesistentEmptyIntermediate(Pipeline pipeline) {
-
- PCollection<String> data = pipeline.readTextFile(nonEmptyInputPath);
-
- PCollection<String> emptyPCollection = data.filter(FilterFns.<String>REJECT_ALL());
-
- emptyPCollection.write(sequenceFile(outputPath, strings()));
-
- pipeline.run();
-
- return pipeline.read(sequenceFile(outputPath, strings()));
- }
-
- @Test(expected = IllegalStateException.class)
- public void testExpectExceptionForGettingSizeOfNonExistingFile_MRPipeline() throws IOException {
- new MRPipeline(this.getClass(), tmpDir.getDefaultConfiguration()).readTextFile("non_existing.file").getSize();
- }
-
- @Test(expected = IllegalStateException.class)
- public void testExpectExceptionForGettingSizeOfNonExistingFile_MemPipeline() {
- MemPipeline.getInstance().readTextFile("non_existing.file").getSize();
- }
-}
[36/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/it/resources/urls.txt
----------------------------------------------------------------------
diff --git a/crunch-core/src/it/resources/urls.txt b/crunch-core/src/it/resources/urls.txt
new file mode 100644
index 0000000..827e711
--- /dev/null
+++ b/crunch-core/src/it/resources/urls.txt
@@ -0,0 +1,11 @@
+www.A.com www.B.com
+www.A.com www.C.com
+www.A.com www.D.com
+www.A.com www.E.com
+www.B.com www.D.com
+www.B.com www.E.com
+www.C.com www.D.com
+www.D.com www.B.com
+www.E.com www.A.com
+www.F.com www.B.com
+www.F.com www.C.com
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/Aggregator.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/Aggregator.java b/crunch-core/src/main/java/org/apache/crunch/Aggregator.java
new file mode 100644
index 0000000..432452b
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/Aggregator.java
@@ -0,0 +1,86 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import java.io.Serializable;
+
+import org.apache.hadoop.conf.Configuration;
+
+
+/**
+ * Aggregate a sequence of values into a possibly smaller sequence of the same type.
+ *
+ * <p>In most cases, an Aggregator will turn multiple values into a single value,
+ * like creating a sum, finding the minimum or maximum, etc. In some cases
+ * (i.e. finding the top K elements), an implementation may return more than
+ * one value. The {@link org.apache.crunch.fn.Aggregators} utility class contains
+ * factory methods for creating all kinds of pre-defined Aggregators that should
+ * cover the most common cases.</p>
+ *
+ * <p>Aggregator implementations should usually be <em>associative</em> and
+ * <em>commutative</em>, which makes their results deterministic. If your aggregation
+ * function isn't commutative, you can still use secondary sort to that effect.</p>
+ *
+ * <p>The lifecycle of an {@link Aggregator} always begins with you instantiating
+ * it and passing it to Crunch. When running your {@link Pipeline}, Crunch serializes
+ * the instance and deserializes it wherever it is needed on the cluster. This is how
+ * Crunch uses a deserialized instance:</p>
+ *
+ * <ol>
+ * <li>call {@link #initialize(Configuration)} once</li>
+ * <li>call {@link #reset()}</li>
+ * <li>call {@link #update(Object)} multiple times until all values of a sequence
+ * have been aggregated</li>
+ * <li>call {@link #results()} to retrieve the aggregated result</li>
+ * <li>go back to step 2 until all sequences have been aggregated</li>
+ * </ol>
+ *
+ * @param <T> The value types to aggregate
+ */
+public interface Aggregator<T> extends Serializable {
+
+ /**
+ * Perform any setup of this instance that is required prior to processing
+ * inputs.
+ *
+ * @param conf Hadoop configuration
+ */
+ void initialize(Configuration conf);
+
+ /**
+ * Clears the internal state of this Aggregator and prepares it for the
+ * values associated with the next key.
+ *
+ * Depending on what you aggregate, this typically means setting a variable
+ * to zero or clearing a list. Failing to do this will yield wrong results!
+ */
+ void reset();
+
+ /**
+ * Incorporate the given value into the aggregate state maintained by this
+ * instance.
+ *
+ * @param value The value to add to the aggregated state
+ */
+ void update(T value);
+
+ /**
+ * Returns the current aggregated state of this instance.
+ */
+ Iterable<T> results();
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/CombineFn.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/CombineFn.java b/crunch-core/src/main/java/org/apache/crunch/CombineFn.java
new file mode 100644
index 0000000..71e8057
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/CombineFn.java
@@ -0,0 +1,1211 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import java.io.Serializable;
+import java.math.BigInteger;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.SortedSet;
+
+import org.apache.crunch.fn.Aggregators;
+import org.apache.crunch.util.Tuples;
+import org.apache.hadoop.conf.Configuration;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+
+/**
+ * A special {@link DoFn} implementation that converts an {@link Iterable} of
+ * values into a single value. If a {@code CombineFn} instance is used on a
+ * {@link PGroupedTable}, the function will be applied to the output of the map
+ * stage before the data is passed to the reducer, which can improve the runtime
+ * of certain classes of jobs.
+ * <p>
+ * Note that the incoming {@code Iterable} can only be used to create an
+ * {@code Iterator} once. Calling {@link Iterable#iterator()} method a second
+ * time will throw an {@link IllegalStateException}.
+ */
+public abstract class CombineFn<S, T> extends DoFn<Pair<S, Iterable<T>>, Pair<S, T>> {
+
+ /**
+ * @deprecated Use {@link org.apache.crunch.Aggregator}
+ */
+ public static interface Aggregator<T> extends Serializable {
+ /**
+ * Perform any setup of this instance that is required prior to processing
+ * inputs.
+ */
+ void initialize(Configuration configuration);
+
+ /**
+ * Clears the internal state of this Aggregator and prepares it for the
+ * values associated with the next key.
+ */
+ void reset();
+
+ /**
+ * Incorporate the given value into the aggregate state maintained by this
+ * instance.
+ */
+ void update(T value);
+
+ /**
+ * Returns the current aggregated state of this instance.
+ */
+ Iterable<T> results();
+ }
+
+ /**
+ * Base class for aggregators that do not require any initialization.
+ *
+ * @deprecated Use {@link org.apache.crunch.fn.Aggregators.SimpleAggregator}
+ */
+ public static abstract class SimpleAggregator<T> implements Aggregator<T> {
+ @Override
+ public void initialize(Configuration conf) {
+ // No-op
+ }
+ }
+
+ /**
+ * Interface for constructing new aggregator instances.
+ *
+ * @deprecated Use {@link PGroupedTable#combineValues(Aggregator)} which doesn't require a factory.
+ */
+ public static interface AggregatorFactory<T> {
+ Aggregator<T> create();
+ }
+
+ /**
+ * A {@code CombineFn} that delegates all of the actual work to an
+ * {@code Aggregator} instance.
+ *
+ * @deprecated Use the {@link Aggregators#toCombineFn(org.apache.crunch.Aggregator)} adapter
+ */
+ public static class AggregatorCombineFn<K, V> extends CombineFn<K, V> {
+
+ private final Aggregator<V> aggregator;
+
+ public AggregatorCombineFn(Aggregator<V> aggregator) {
+ this.aggregator = aggregator;
+ }
+
+ @Override
+ public void initialize() {
+ aggregator.initialize(getConfiguration());
+ }
+
+ @Override
+ public void process(Pair<K, Iterable<V>> input, Emitter<Pair<K, V>> emitter) {
+ aggregator.reset();
+ for (V v : input.second()) {
+ aggregator.update(v);
+ }
+ for (V v : aggregator.results()) {
+ emitter.emit(Pair.of(input.first(), v));
+ }
+ }
+ }
+
+ private static abstract class TupleAggregator<T> implements Aggregator<T> {
+ private final List<Aggregator<Object>> aggregators;
+
+ public TupleAggregator(Aggregator<?>... aggregators) {
+ this.aggregators = Lists.newArrayList();
+ for (Aggregator<?> a : aggregators) {
+ this.aggregators.add((Aggregator<Object>) a);
+ }
+ }
+
+ @Override
+ public void initialize(Configuration configuration) {
+ for (Aggregator<?> a : aggregators) {
+ a.initialize(configuration);
+ }
+ }
+
+ @Override
+ public void reset() {
+ for (Aggregator<?> a : aggregators) {
+ a.reset();
+ }
+ }
+
+ protected void updateTuple(Tuple t) {
+ for (int i = 0; i < aggregators.size(); i++) {
+ aggregators.get(i).update(t.get(i));
+ }
+ }
+
+ protected Iterable<Object> results(int index) {
+ return aggregators.get(index).results();
+ }
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#pairAggregator(Aggregator, Aggregator)}
+ */
+ public static class PairAggregator<V1, V2> extends TupleAggregator<Pair<V1, V2>> {
+
+ public PairAggregator(Aggregator<V1> a1, Aggregator<V2> a2) {
+ super(a1, a2);
+ }
+
+ @Override
+ public void update(Pair<V1, V2> value) {
+ updateTuple(value);
+ }
+
+ @Override
+ public Iterable<Pair<V1, V2>> results() {
+ return new Tuples.PairIterable<V1, V2>((Iterable<V1>) results(0), (Iterable<V2>) results(1));
+ }
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#tripAggregator(Aggregator, Aggregator, Aggregator)}
+ */
+ public static class TripAggregator<A, B, C> extends TupleAggregator<Tuple3<A, B, C>> {
+
+ public TripAggregator(Aggregator<A> a1, Aggregator<B> a2, Aggregator<C> a3) {
+ super(a1, a2, a3);
+ }
+
+ @Override
+ public void update(Tuple3<A, B, C> value) {
+ updateTuple(value);
+ }
+
+ @Override
+ public Iterable<Tuple3<A, B, C>> results() {
+ return new Tuples.TripIterable<A, B, C>((Iterable<A>) results(0), (Iterable<B>) results(1),
+ (Iterable<C>) results(2));
+ }
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#quadAggregator(Aggregator, Aggregator, Aggregator, Aggregator)}
+ */
+ public static class QuadAggregator<A, B, C, D> extends TupleAggregator<Tuple4<A, B, C, D>> {
+
+ public QuadAggregator(Aggregator<A> a1, Aggregator<B> a2, Aggregator<C> a3, Aggregator<D> a4) {
+ super(a1, a2, a3, a4);
+ }
+
+ @Override
+ public void update(Tuple4<A, B, C, D> value) {
+ updateTuple(value);
+ }
+
+ @Override
+ public Iterable<Tuple4<A, B, C, D>> results() {
+ return new Tuples.QuadIterable<A, B, C, D>((Iterable<A>) results(0), (Iterable<B>) results(1),
+ (Iterable<C>) results(2), (Iterable<D>) results(3));
+ }
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#tupleAggregator(Aggregator...)}
+ */
+ public static class TupleNAggregator extends TupleAggregator<TupleN> {
+
+ private final int size;
+
+ public TupleNAggregator(Aggregator<?>... aggregators) {
+ super(aggregators);
+ size = aggregators.length;
+ }
+
+ @Override
+ public void update(TupleN value) {
+ updateTuple(value);
+ }
+
+ @Override
+ public Iterable<TupleN> results() {
+ Iterable<?>[] iterables = new Iterable[size];
+ for (int i = 0; i < size; i++) {
+ iterables[i] = results(i);
+ }
+ return new Tuples.TupleNIterable(iterables);
+ }
+
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#toCombineFn(Aggregator)}
+ */
+ public static final <K, V> CombineFn<K, V> aggregator(Aggregator<V> aggregator) {
+ return new AggregatorCombineFn<K, V>(aggregator);
+ }
+
+ /**
+ * @deprecated Use {@link PGroupedTable#combineValues(Aggregator)} which doesn't require a factory.
+ */
+ public static final <K, V> CombineFn<K, V> aggregatorFactory(AggregatorFactory<V> aggregator) {
+ return new AggregatorCombineFn<K, V>(aggregator.create());
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#pairAggregator(Aggregator, Aggregator)}
+ */
+ public static final <K, V1, V2> CombineFn<K, Pair<V1, V2>> pairAggregator(AggregatorFactory<V1> a1,
+ AggregatorFactory<V2> a2) {
+ return aggregator(new PairAggregator<V1, V2>(a1.create(), a2.create()));
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#tripAggregator(Aggregator, Aggregator, Aggregator)}
+ */
+ public static final <K, A, B, C> CombineFn<K, Tuple3<A, B, C>> tripAggregator(AggregatorFactory<A> a1,
+ AggregatorFactory<B> a2, AggregatorFactory<C> a3) {
+ return aggregator(new TripAggregator<A, B, C>(a1.create(), a2.create(), a3.create()));
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#quadAggregator(Aggregator, Aggregator, Aggregator, Aggregator)}
+ */
+ public static final <K, A, B, C, D> CombineFn<K, Tuple4<A, B, C, D>> quadAggregator(AggregatorFactory<A> a1,
+ AggregatorFactory<B> a2, AggregatorFactory<C> a3, AggregatorFactory<D> a4) {
+ return aggregator(new QuadAggregator<A, B, C, D>(a1.create(), a2.create(), a3.create(), a4.create()));
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#tupleAggregator(Aggregator...)}
+ */
+ public static final <K> CombineFn<K, TupleN> tupleAggregator(AggregatorFactory<?>... factories) {
+ Aggregator<?>[] aggs = new Aggregator[factories.length];
+ for (int i = 0; i < aggs.length; i++) {
+ aggs[i] = factories[i].create();
+ }
+ return aggregator(new TupleNAggregator(aggs));
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#SUM_LONGS()}
+ */
+ public static final <K> CombineFn<K, Long> SUM_LONGS() {
+ return aggregatorFactory(SUM_LONGS);
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#SUM_INTS()}
+ */
+ public static final <K> CombineFn<K, Integer> SUM_INTS() {
+ return aggregatorFactory(SUM_INTS);
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#SUM_FLOATS()}
+ */
+ public static final <K> CombineFn<K, Float> SUM_FLOATS() {
+ return aggregatorFactory(SUM_FLOATS);
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#SUM_DOUBLES()}
+ */
+ public static final <K> CombineFn<K, Double> SUM_DOUBLES() {
+ return aggregatorFactory(SUM_DOUBLES);
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#SUM_BIGINTS()}
+ */
+ public static final <K> CombineFn<K, BigInteger> SUM_BIGINTS() {
+ return aggregatorFactory(SUM_BIGINTS);
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#MAX_LONGS()}
+ */
+ public static final <K> CombineFn<K, Long> MAX_LONGS() {
+ return aggregatorFactory(MAX_LONGS);
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#MAX_LONGS(int)}
+ */
+ public static final <K> CombineFn<K, Long> MAX_LONGS(int n) {
+ return aggregator(new MaxNAggregator<Long>(n));
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#MAX_INTS()}
+ */
+ public static final <K> CombineFn<K, Integer> MAX_INTS() {
+ return aggregatorFactory(MAX_INTS);
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#MAX_INTS(int)}
+ */
+ public static final <K> CombineFn<K, Integer> MAX_INTS(int n) {
+ return aggregator(new MaxNAggregator<Integer>(n));
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#MAX_FLOATS()}
+ */
+ public static final <K> CombineFn<K, Float> MAX_FLOATS() {
+ return aggregatorFactory(MAX_FLOATS);
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#MAX_FLOATS(int)}
+ */
+ public static final <K> CombineFn<K, Float> MAX_FLOATS(int n) {
+ return aggregator(new MaxNAggregator<Float>(n));
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#MAX_DOUBLES()}
+ */
+ public static final <K> CombineFn<K, Double> MAX_DOUBLES() {
+ return aggregatorFactory(MAX_DOUBLES);
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#MAX_DOUBLES(int)}
+ */
+ public static final <K> CombineFn<K, Double> MAX_DOUBLES(int n) {
+ return aggregator(new MaxNAggregator<Double>(n));
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#MAX_BIGINTS()}
+ */
+ public static final <K> CombineFn<K, BigInteger> MAX_BIGINTS() {
+ return aggregatorFactory(MAX_BIGINTS);
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#MAX_BIGINTS(int)}
+ */
+ public static final <K> CombineFn<K, BigInteger> MAX_BIGINTS(int n) {
+ return aggregator(new MaxNAggregator<BigInteger>(n));
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#MIN_LONGS()}
+ */
+ public static final <K> CombineFn<K, Long> MIN_LONGS() {
+ return aggregatorFactory(MIN_LONGS);
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#MIN_LONGS(int)}
+ */
+ public static final <K> CombineFn<K, Long> MIN_LONGS(int n) {
+ return aggregator(new MinNAggregator<Long>(n));
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#MIN_INTS()}
+ */
+ public static final <K> CombineFn<K, Integer> MIN_INTS() {
+ return aggregatorFactory(MIN_INTS);
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#MIN_INTS(int)}
+ */
+ public static final <K> CombineFn<K, Integer> MIN_INTS(int n) {
+ return aggregator(new MinNAggregator<Integer>(n));
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#MIN_FLOATS()}
+ */
+ public static final <K> CombineFn<K, Float> MIN_FLOATS() {
+ return aggregatorFactory(MIN_FLOATS);
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#MIN_FLOATS(int)}
+ */
+ public static final <K> CombineFn<K, Float> MIN_FLOATS(int n) {
+ return aggregator(new MinNAggregator<Float>(n));
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#MIN_DOUBLES()}
+ */
+ public static final <K> CombineFn<K, Double> MIN_DOUBLES() {
+ return aggregatorFactory(MIN_DOUBLES);
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#MIN_DOUBLES(int)}
+ */
+ public static final <K> CombineFn<K, Double> MIN_DOUBLES(int n) {
+ return aggregator(new MinNAggregator<Double>(n));
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#MIN_BIGINTS()}
+ */
+ public static final <K> CombineFn<K, BigInteger> MIN_BIGINTS() {
+ return aggregatorFactory(MIN_BIGINTS);
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#MIN_BIGINTS(int)}
+ */
+ public static final <K> CombineFn<K, BigInteger> MIN_BIGINTS(int n) {
+ return aggregator(new MinNAggregator<BigInteger>(n));
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#FIRST_N(int)}
+ */
+ public static final <K, V> CombineFn<K, V> FIRST_N(int n) {
+ return aggregator(new FirstNAggregator<V>(n));
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#LAST_N(int)}
+ */
+ public static final <K, V> CombineFn<K, V> LAST_N(int n) {
+ return aggregator(new LastNAggregator<V>(n));
+ }
+
+ /**
+ * Used to concatenate strings, with a separator between each string. There
+ * is no limit on the length of the concatenated string.
+ *
+ * @param separator
+ * the separator which will be appended between each string
+ * @param skipNull
+ * define if we should skip null values. Throw
+ * NullPointerException if set to false and there is a null
+ * value.
+ * @return
+ *
+ * @deprecated Use {@link Aggregators#STRING_CONCAT(String, boolean)}
+ */
+ public static final <K> CombineFn<K, String> STRING_CONCAT(final String separator, final boolean skipNull) {
+ return aggregator(new StringConcatAggregator(separator, skipNull));
+ }
+
+ /**
+ * Used to concatenate strings, with a separator between each string. You
+ * can specify the maximum length of the output string and of the input
+ * strings, if they are > 0. If a value is <= 0, there is no limit.
+ *
+ * Any string that is too large (or that would make the output too
+ * large) will be silently discarded.
+ *
+ * @param separator
+ * the separator which will be appended between each string
+ * @param skipNull
+ * define if we should skip null values. Throw
+ * NullPointerException if set to false and there is a null
+ * value.
+ * @param maxOutputLength
+ * the maximum length of the output string. If it's set <= 0,
+ * there is no limit. The number of characters of the output
+ * string will be < maxOutputLength.
+ * @param maxInputLength
+ * the maximum length of the input strings. If it's set <= 0,
+ * there is no limit. The number of characters of an input string
+ * must be < maxInputLength for it to be concatenated.
+ * @return
+ *
+ * @deprecated Use {@link Aggregators#STRING_CONCAT(String, boolean, long, long)}
+ */
+ public static final <K> CombineFn<K, String> STRING_CONCAT(final String separator, final boolean skipNull, final long maxOutputLength, final long maxInputLength) {
+ return aggregator(new StringConcatAggregator(separator, skipNull, maxOutputLength, maxInputLength));
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#SUM_LONGS()}
+ */
+ public static class SumLongs extends SimpleAggregator<Long> {
+ private long sum = 0;
+
+ @Override
+ public void reset() {
+ sum = 0;
+ }
+
+ @Override
+ public void update(Long next) {
+ sum += next;
+ }
+
+ @Override
+ public Iterable<Long> results() {
+ return ImmutableList.of(sum);
+ }
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#SUM_LONGS()}
+ */
+ public static AggregatorFactory<Long> SUM_LONGS = new AggregatorFactory<Long>() {
+ public Aggregator<Long> create() {
+ return new SumLongs();
+ }
+ };
+
+ /**
+ * @deprecated Use {@link Aggregators#SUM_INTS()}
+ */
+ public static class SumInts extends SimpleAggregator<Integer> {
+ private int sum = 0;
+
+ @Override
+ public void reset() {
+ sum = 0;
+ }
+
+ @Override
+ public void update(Integer next) {
+ sum += next;
+ }
+
+ @Override
+ public Iterable<Integer> results() {
+ return ImmutableList.of(sum);
+ }
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#SUM_INTS()}
+ */
+ public static AggregatorFactory<Integer> SUM_INTS = new AggregatorFactory<Integer>() {
+ public Aggregator<Integer> create() {
+ return new SumInts();
+ }
+ };
+
+ /**
+ * @deprecated Use {@link Aggregators#SUM_FLOATS()}
+ */
+ public static class SumFloats extends SimpleAggregator<Float> {
+ private float sum = 0;
+
+ @Override
+ public void reset() {
+ sum = 0f;
+ }
+
+ @Override
+ public void update(Float next) {
+ sum += next;
+ }
+
+ @Override
+ public Iterable<Float> results() {
+ return ImmutableList.of(sum);
+ }
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#SUM_FLOATS()}
+ */
+ public static AggregatorFactory<Float> SUM_FLOATS = new AggregatorFactory<Float>() {
+ public Aggregator<Float> create() {
+ return new SumFloats();
+ }
+ };
+
+ /**
+ * @deprecated Use {@link Aggregators#SUM_DOUBLES()}
+ */
+ public static class SumDoubles extends SimpleAggregator<Double> {
+ private double sum = 0;
+
+ @Override
+ public void reset() {
+ sum = 0f;
+ }
+
+ @Override
+ public void update(Double next) {
+ sum += next;
+ }
+
+ @Override
+ public Iterable<Double> results() {
+ return ImmutableList.of(sum);
+ }
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#SUM_DOUBLES()}
+ */
+ public static AggregatorFactory<Double> SUM_DOUBLES = new AggregatorFactory<Double>() {
+ public Aggregator<Double> create() {
+ return new SumDoubles();
+ }
+ };
+
+ /**
+ * @deprecated Use {@link Aggregators#SUM_BIGINTS()}
+ */
+ public static class SumBigInts extends SimpleAggregator<BigInteger> {
+ private BigInteger sum = BigInteger.ZERO;
+
+ @Override
+ public void reset() {
+ sum = BigInteger.ZERO;
+ }
+
+ @Override
+ public void update(BigInteger next) {
+ sum = sum.add(next);
+ }
+
+ @Override
+ public Iterable<BigInteger> results() {
+ return ImmutableList.of(sum);
+ }
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#SUM_BIGINTS()}
+ */
+ public static AggregatorFactory<BigInteger> SUM_BIGINTS = new AggregatorFactory<BigInteger>() {
+ public Aggregator<BigInteger> create() {
+ return new SumBigInts();
+ }
+ };
+
+ /**
+ * @deprecated Use {@link Aggregators#MAX_LONGS()}
+ */
+ public static class MaxLongs extends SimpleAggregator<Long> {
+ private Long max = null;
+
+ @Override
+ public void reset() {
+ max = null;
+ }
+
+ @Override
+ public void update(Long next) {
+ if (max == null || max < next) {
+ max = next;
+ }
+ }
+
+ @Override
+ public Iterable<Long> results() {
+ return ImmutableList.of(max);
+ }
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#MAX_LONGS()}
+ */
+ public static AggregatorFactory<Long> MAX_LONGS = new AggregatorFactory<Long>() {
+ public Aggregator<Long> create() {
+ return new MaxLongs();
+ }
+ };
+
+ /**
+ * @deprecated Use {@link Aggregators#MAX_INTS()}
+ */
+ public static class MaxInts extends SimpleAggregator<Integer> {
+ private Integer max = null;
+
+ @Override
+ public void reset() {
+ max = null;
+ }
+
+ @Override
+ public void update(Integer next) {
+ if (max == null || max < next) {
+ max = next;
+ }
+ }
+
+ @Override
+ public Iterable<Integer> results() {
+ return ImmutableList.of(max);
+ }
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#MAX_INTS()}
+ */
+ public static AggregatorFactory<Integer> MAX_INTS = new AggregatorFactory<Integer>() {
+ public Aggregator<Integer> create() {
+ return new MaxInts();
+ }
+ };
+
+ /**
+ * @deprecated Use {@link Aggregators#MAX_FLOATS()}
+ */
+ public static class MaxFloats extends SimpleAggregator<Float> {
+ private Float max = null;
+
+ @Override
+ public void reset() {
+ max = null;
+ }
+
+ @Override
+ public void update(Float next) {
+ if (max == null || max < next) {
+ max = next;
+ }
+ }
+
+ @Override
+ public Iterable<Float> results() {
+ return ImmutableList.of(max);
+ }
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#MAX_FLOATS()}
+ */
+ public static AggregatorFactory<Float> MAX_FLOATS = new AggregatorFactory<Float>() {
+ public Aggregator<Float> create() {
+ return new MaxFloats();
+ }
+ };
+
+ /**
+ * @deprecated Use {@link Aggregators#MAX_DOUBLES()}
+ */
+ public static class MaxDoubles extends SimpleAggregator<Double> {
+ private Double max = null;
+
+ @Override
+ public void reset() {
+ max = null;
+ }
+
+ @Override
+ public void update(Double next) {
+ if (max == null || max < next) {
+ max = next;
+ }
+ }
+
+ @Override
+ public Iterable<Double> results() {
+ return ImmutableList.of(max);
+ }
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#MAX_DOUBLES()}
+ */
+ public static AggregatorFactory<Double> MAX_DOUBLES = new AggregatorFactory<Double>() {
+ public Aggregator<Double> create() {
+ return new MaxDoubles();
+ }
+ };
+
+ /**
+  * Aggregator that remembers the largest {@code BigInteger} handed to
+  * {@link #update(BigInteger)} since the last {@link #reset()}.
+  *
+  * @deprecated Use {@link Aggregators#MAX_BIGINTS()}
+  */
+ public static class MaxBigInts extends SimpleAggregator<BigInteger> {
+   /** Running maximum; {@code null} until the first value arrives. */
+   private BigInteger max = null;
+
+   @Override
+   public void reset() {
+     max = null;
+   }
+
+   @Override
+   public void update(BigInteger next) {
+     // Nothing to do when a maximum exists and it is not smaller than the new value.
+     if (max != null && max.compareTo(next) >= 0) {
+       return;
+     }
+     max = next;
+   }
+
+   /** Returns the running maximum wrapped in a single-element list. */
+   @Override
+   public Iterable<BigInteger> results() {
+     return ImmutableList.of(max);
+   }
+ }
+
+ /**
+ * Factory whose {@code create()} returns a new {@link MaxBigInts} instance on
+ * every call.
+ *
+ * @deprecated Use {@link Aggregators#MAX_BIGINTS()}
+ */
+ public static AggregatorFactory<BigInteger> MAX_BIGINTS = new AggregatorFactory<BigInteger>() {
+ public Aggregator<BigInteger> create() {
+ return new MaxBigInts();
+ }
+ };
+
+ /**
+  * Aggregator that remembers the smallest {@code Long} handed to
+  * {@link #update(Long)} since the last {@link #reset()}.
+  *
+  * @deprecated Use {@link Aggregators#MIN_LONGS()}
+  */
+ public static class MinLongs extends SimpleAggregator<Long> {
+   /** Running minimum; {@code null} until the first value arrives. */
+   private Long min = null;
+
+   @Override
+   public void reset() {
+     min = null;
+   }
+
+   @Override
+   public void update(Long next) {
+     // Nothing to do when a minimum exists and the new value is not below it.
+     if (min != null && min <= next) {
+       return;
+     }
+     min = next;
+   }
+
+   /** Returns the running minimum wrapped in a single-element list. */
+   @Override
+   public Iterable<Long> results() {
+     return ImmutableList.of(min);
+   }
+ }
+
+ /**
+ * Factory whose {@code create()} returns a new {@link MinLongs} instance on
+ * every call.
+ *
+ * @deprecated Use {@link Aggregators#MIN_LONGS()}
+ */
+ public static AggregatorFactory<Long> MIN_LONGS = new AggregatorFactory<Long>() {
+ public Aggregator<Long> create() {
+ return new MinLongs();
+ }
+ };
+
+ /**
+ * @deprecated Use {@link Aggregators#MIN_INTS()}
+ */
+ public static class MinInts extends SimpleAggregator<Integer> {
+ private Integer min = null;
+
+ @Override
+ public void reset() {
+ min = null;
+ }
+
+ @Override
+ public void update(Integer next) {
+ if (min == null || min > next) {
+ min = next;
+ }
+ }
+
+ @Override
+ public Iterable<Integer> results() {
+ return ImmutableList.of(min);
+ }
+ }
+
+ /**
+ * Factory whose {@code create()} returns a new {@link MinInts} instance on
+ * every call.
+ *
+ * @deprecated Use {@link Aggregators#MIN_INTS()}
+ */
+ public static AggregatorFactory<Integer> MIN_INTS = new AggregatorFactory<Integer>() {
+ public Aggregator<Integer> create() {
+ return new MinInts();
+ }
+ };
+
+ /**
+  * Aggregator that remembers the smallest {@code Float} handed to
+  * {@link #update(Float)} since the last {@link #reset()}.
+  *
+  * @deprecated Use {@link Aggregators#MIN_FLOATS()}
+  */
+ public static class MinFloats extends SimpleAggregator<Float> {
+   /** Running minimum; {@code null} until the first value arrives. */
+   private Float min = null;
+
+   @Override
+   public void reset() {
+     min = null;
+   }
+
+   @Override
+   public void update(Float next) {
+     // Take the new value when it is the first one or strictly smaller; otherwise keep the old one.
+     min = (min == null || min > next) ? next : min;
+   }
+
+   /** Returns the running minimum wrapped in a single-element list. */
+   @Override
+   public Iterable<Float> results() {
+     return ImmutableList.of(min);
+   }
+ }
+
+ /**
+ * Factory whose {@code create()} returns a new {@link MinFloats} instance on
+ * every call.
+ *
+ * @deprecated Use {@link Aggregators#MIN_FLOATS()}
+ */
+ public static AggregatorFactory<Float> MIN_FLOATS = new AggregatorFactory<Float>() {
+ public Aggregator<Float> create() {
+ return new MinFloats();
+ }
+ };
+
+ /**
+ * @deprecated Use {@link Aggregators#MIN_DOUBLES()}
+ */
+ public static class MinDoubles extends SimpleAggregator<Double> {
+ private Double min = null;
+
+ @Override
+ public void reset() {
+ min = null;
+ }
+
+ @Override
+ public void update(Double next) {
+ if (min == null || min > next) {
+ min = next;
+ }
+ }
+
+ @Override
+ public Iterable<Double> results() {
+ return ImmutableList.of(min);
+ }
+ }
+
+ /**
+ * Factory whose {@code create()} returns a new {@link MinDoubles} instance on
+ * every call.
+ *
+ * @deprecated Use {@link Aggregators#MIN_DOUBLES()}
+ */
+ public static AggregatorFactory<Double> MIN_DOUBLES = new AggregatorFactory<Double>() {
+ public Aggregator<Double> create() {
+ return new MinDoubles();
+ }
+ };
+
+ /**
+  * Aggregator that remembers the smallest {@code BigInteger} handed to
+  * {@link #update(BigInteger)} since the last {@link #reset()}.
+  *
+  * @deprecated Use {@link Aggregators#MIN_BIGINTS()}
+  */
+ public static class MinBigInts extends SimpleAggregator<BigInteger> {
+   /** Running minimum; {@code null} until the first value arrives. */
+   private BigInteger min = null;
+
+   @Override
+   public void reset() {
+     min = null;
+   }
+
+   @Override
+   public void update(BigInteger next) {
+     // Nothing to do when a minimum exists and it is not larger than the new value.
+     if (min != null && min.compareTo(next) <= 0) {
+       return;
+     }
+     min = next;
+   }
+
+   /** Returns the running minimum wrapped in a single-element list. */
+   @Override
+   public Iterable<BigInteger> results() {
+     return ImmutableList.of(min);
+   }
+ }
+
+ /**
+ * Factory whose {@code create()} returns a new {@link MinBigInts} instance on
+ * every call.
+ *
+ * @deprecated Use {@link Aggregators#MIN_BIGINTS()}
+ */
+ public static AggregatorFactory<BigInteger> MIN_BIGINTS = new AggregatorFactory<BigInteger>() {
+ public Aggregator<BigInteger> create() {
+ return new MinBigInts();
+ }
+ };
+
+ /**
+  * Keeps the {@code arity} largest distinct values seen since the last
+  * {@link #reset()}. Values are held in a sorted set, so values that compare
+  * as equal are retained only once.
+  *
+  * <p>
+  * {@link #reset()} must be called before the first {@link #update(Comparable)}:
+  * the backing set is transient and is only created there.
+  *
+  * @deprecated Use {@link Aggregators#MAX_N(int, Class)}
+  */
+ public static class MaxNAggregator<V extends Comparable<V>> extends SimpleAggregator<V> {
+   /** Maximum number of values to retain. */
+   private final int arity;
+   /** Retained values in ascending order; created lazily by {@link #reset()}. */
+   private transient SortedSet<V> elements;
+
+   /**
+    * @param arity
+    *          how many of the largest values to retain
+    */
+   public MaxNAggregator(int arity) {
+     this.arity = arity;
+   }
+
+   @Override
+   public void reset() {
+     if (elements == null) {
+       elements = Sets.newTreeSet();
+     } else {
+       elements.clear();
+     }
+   }
+
+   @Override
+   public void update(V value) {
+     // add() returns false when the value is already retained. Evicting in that
+     // case would shrink the set, because the re-insert that follows is a no-op.
+     // Evicting only after a successful add also covers a non-positive arity:
+     // the value is inserted and removed again instead of calling first() on
+     // an empty set.
+     if (elements.add(value) && elements.size() > arity) {
+       elements.remove(elements.first());
+     }
+   }
+
+   /** Returns a snapshot of the retained values in ascending order. */
+   @Override
+   public Iterable<V> results() {
+     return ImmutableList.copyOf(elements);
+   }
+ }
+
+ /**
+  * Keeps the {@code arity} smallest distinct values seen since the last
+  * {@link #reset()}. Values are held in a sorted set, so values that compare
+  * as equal are retained only once.
+  *
+  * <p>
+  * {@link #reset()} must be called before the first {@link #update(Comparable)}:
+  * the backing set is transient and is only created there.
+  *
+  * @deprecated Use {@link Aggregators#MIN_N(int, Class)}
+  */
+ public static class MinNAggregator<V extends Comparable<V>> extends SimpleAggregator<V> {
+   /** Maximum number of values to retain. */
+   private final int arity;
+   /** Retained values in ascending order; created lazily by {@link #reset()}. */
+   private transient SortedSet<V> elements;
+
+   /**
+    * @param arity
+    *          how many of the smallest values to retain
+    */
+   public MinNAggregator(int arity) {
+     this.arity = arity;
+   }
+
+   @Override
+   public void reset() {
+     if (elements == null) {
+       elements = Sets.newTreeSet();
+     } else {
+       elements.clear();
+     }
+   }
+
+   @Override
+   public void update(V value) {
+     // add() returns false when the value is already retained. Evicting in that
+     // case would shrink the set, because the re-insert that follows is a no-op.
+     // Evicting only after a successful add also covers a non-positive arity:
+     // the value is inserted and removed again instead of calling last() on
+     // an empty set.
+     if (elements.add(value) && elements.size() > arity) {
+       elements.remove(elements.last());
+     }
+   }
+
+   /** Returns a snapshot of the retained values in ascending order. */
+   @Override
+   public Iterable<V> results() {
+     return ImmutableList.copyOf(elements);
+   }
+ }
+
+ /**
+  * Retains the first {@code arity} values passed to {@link #update(Object)}
+  * after a {@link #reset()} and ignores the rest.
+  *
+  * @deprecated Use {@link Aggregators#FIRST_N(int)}
+  */
+ public static class FirstNAggregator<V> extends SimpleAggregator<V> {
+   /** Maximum number of values to retain. */
+   private final int arity;
+   /** Retained values, in arrival order. */
+   private final List<V> elements;
+
+   /**
+    * @param arity
+    *          how many leading values to retain
+    */
+   public FirstNAggregator(int arity) {
+     this.arity = arity;
+     this.elements = Lists.newArrayList();
+   }
+
+   @Override
+   public void reset() {
+     elements.clear();
+   }
+
+   @Override
+   public void update(V value) {
+     // Once the quota is filled, later values are dropped.
+     if (elements.size() >= arity) {
+       return;
+     }
+     elements.add(value);
+   }
+
+   /** Returns a snapshot of the retained values in arrival order. */
+   @Override
+   public Iterable<V> results() {
+     return ImmutableList.copyOf(elements);
+   }
+ }
+
+ /**
+ * @deprecated Use {@link Aggregators#LAST_N(int)}
+ */
+ public static class LastNAggregator<V> extends SimpleAggregator<V> {
+ private final int arity;
+ private final LinkedList<V> elements;
+
+ public LastNAggregator(int arity) {
+ this.arity = arity;
+ this.elements = Lists.newLinkedList();
+ }
+
+ @Override
+ public void reset() {
+ elements.clear();
+ }
+
+ @Override
+ public void update(V value) {
+ elements.add(value);
+ if (elements.size() == arity + 1) {
+ elements.removeFirst();
+ }
+ }
+
+ @Override
+ public Iterable<V> results() {
+ return ImmutableList.copyOf(elements);
+ }
+ }
+
+ /**
+  * Collects the strings passed to {@link #update(String)} and joins them with
+  * a separator when {@link #results()} is called.
+  *
+  * <p>
+  * Two optional limits apply; a limit that is zero or negative is disabled.
+  * An input longer than {@code maxInputLength} is dropped. An input that would
+  * push the joined length (inputs plus separators) past {@code maxOutputLength}
+  * is dropped as well.
+  *
+  * <p>
+  * {@link #reset()} must be called before {@link #results()}: the joiner is
+  * transient and is only created there.
+  *
+  * @deprecated Use {@link Aggregators#STRING_CONCAT(String, boolean, long, long)}
+  */
+ public static class StringConcatAggregator extends SimpleAggregator<String> {
+   private final String separator;
+   private final boolean skipNulls;
+   private final long maxOutputLength;
+   private final long maxInputLength;
+   /**
+    * Length the joined output would have so far. It starts at
+    * {@code -separator.length()} because every accepted input is charged one
+    * separator, and n inputs are joined with only n - 1 separators.
+    */
+   private long currentLength;
+   private final LinkedList<String> list = new LinkedList<String>();
+
+   /** Created lazily in {@link #reset()}. */
+   private transient Joiner joiner;
+
+   /**
+    * Creates an aggregator with both length limits disabled.
+    *
+    * @param separator
+    *          text placed between joined values
+    * @param skipNulls
+    *          whether the joiner is configured to skip {@code null} values
+    */
+   public StringConcatAggregator(final String separator, final boolean skipNulls) {
+     this.separator = separator;
+     this.skipNulls = skipNulls;
+     this.maxInputLength = 0;
+     this.maxOutputLength = 0;
+   }
+
+   /**
+    * @param separator
+    *          text placed between joined values
+    * @param skipNull
+    *          whether the joiner is configured to skip {@code null} values
+    * @param maxOutputLength
+    *          upper bound on the joined length; values {@code <= 0} disable it
+    * @param maxInputLength
+    *          upper bound on a single input's length; values {@code <= 0} disable it
+    */
+   public StringConcatAggregator(final String separator, final boolean skipNull, final long maxOutputLength, final long maxInputLength) {
+     this.separator = separator;
+     this.skipNulls = skipNull;
+     this.maxOutputLength = maxOutputLength;
+     this.maxInputLength = maxInputLength;
+     this.currentLength = -separator.length();
+   }
+
+   @Override
+   public void reset() {
+     if (joiner == null) {
+       joiner = skipNulls ? Joiner.on(separator).skipNulls() : Joiner.on(separator);
+     }
+     currentLength = -separator.length();
+     list.clear();
+   }
+
+   @Override
+   public void update(final String next) {
+     // A null input has no length, so the input-length limit cannot reject it.
+     // The original dereferenced next here and threw NullPointerException
+     // whenever maxInputLength was enabled.
+     if (next != null && maxInputLength > 0 && next.length() > maxInputLength) {
+       return;
+     }
+     long length = (next == null) ? 0 : next.length() + separator.length();
+     if (maxOutputLength > 0 && currentLength + length > maxOutputLength) {
+       return;
+     }
+     if (maxOutputLength > 0) {
+       currentLength += length;
+     }
+     // Nulls are stored as-is; how they are treated is left to the joiner.
+     list.add(next);
+   }
+
+   /** Returns the joined string wrapped in a single-element list. */
+   @Override
+   public Iterable<String> results() {
+     return ImmutableList.of(joiner.join(list));
+   }
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/CrunchRuntimeException.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/CrunchRuntimeException.java b/crunch-core/src/main/java/org/apache/crunch/CrunchRuntimeException.java
new file mode 100644
index 0000000..044f600
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/CrunchRuntimeException.java
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+/**
+ * Unchecked exception used by the Crunch execution engine. On top of the
+ * usual message and cause it carries a flag recording whether the exception
+ * has already been written to the debug logs. Clients may throw instances of
+ * this class from their own {@code DoFn} implementations.
+ */
+public class CrunchRuntimeException extends RuntimeException {
+
+  /** Set once by {@link #markLogged()}; never cleared. */
+  private boolean logged;
+
+  /**
+   * @param msg
+   *          the detail message
+   */
+  public CrunchRuntimeException(String msg) {
+    super(msg);
+  }
+
+  /**
+   * @param e
+   *          the underlying cause
+   */
+  public CrunchRuntimeException(Exception e) {
+    super(e);
+  }
+
+  /**
+   * @param msg
+   *          the detail message
+   * @param e
+   *          the underlying cause
+   */
+  public CrunchRuntimeException(String msg, Exception e) {
+    super(msg, e);
+  }
+
+  /**
+   * Reports whether {@link #markLogged()} has been called on this exception.
+   *
+   * @return {@code true} if this exception was written to the debug logs
+   */
+  public boolean wasLogged() {
+    return logged;
+  }
+
+  /**
+   * Records that this exception has been written to the debug logs.
+   */
+  public void markLogged() {
+    logged = true;
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/DoFn.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/DoFn.java b/crunch-core/src/main/java/org/apache/crunch/DoFn.java
new file mode 100644
index 0000000..2c6389a
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/DoFn.java
@@ -0,0 +1,162 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import java.io.Serializable;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.Counter;
+import org.apache.hadoop.mapreduce.TaskAttemptID;
+import org.apache.hadoop.mapreduce.TaskInputOutputContext;
+
+/**
+ * Base class for all data processing functions in Crunch.
+ *
+ * <p>
+ * Note that all {@code DoFn} instances implement {@link Serializable}, and thus
+ * all of their non-transient member variables must implement
+ * {@code Serializable} as well. If your DoFn depends on non-serializable
+ * classes for data processing, they may be declared as {@code transient} and
+ * initialized in the DoFn's {@code initialize} method.
+ *
+ */
+public abstract class DoFn<S, T> implements Serializable {
+ private transient TaskInputOutputContext<?, ?, ?, ?> context;
+
+ /**
+ * Configure this DoFn. Subclasses may override this method to modify the
+ * configuration of the Job that this DoFn instance belongs to.
+ *
+ * <p>
+ * Called during the job planning phase by the crunch-client.
+ * </p>
+ *
+ * @param conf
+ * The Configuration instance for the Job.
+ */
+ public void configure(Configuration conf) {
+ }
+
+ /**
+ * Initialize this DoFn. This initialization will happen before the actual
+ * {@link #process(Object, Emitter)} is triggered. Subclasses may override
+ * this method to do appropriate initialization.
+ *
+ * <p>
+ * Called during the setup of the job instance this {@code DoFn} is associated
+ * with.
+ * </p>
+ *
+ */
+ public void initialize() {
+ }
+
+ /**
+ * Processes the records from a {@link PCollection}.
+ *
+ * <br/>
+ * <br/>
+ * <b>Note:</b> Crunch can reuse a single input record object whose content
+ * changes on each {@link #process(Object, Emitter)} method call. This
+ * functionality is imposed by Hadoop's <a href=
+ * "http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/mapred/Reducer.html"
+ * >Reducer</a> implementation: <i>The framework will reuse the key and value
+ * objects that are passed into the reduce, therefore the application should
+ * clone the objects they want to keep a copy of.</i>
+ *
+ * @param input
+ * The input record.
+ * @param emitter
+ * The emitter to send the output to
+ */
+ public abstract void process(S input, Emitter<T> emitter);
+
+ /**
+ * Called during the cleanup of the MapReduce job this {@code DoFn} is
+ * associated with. Subclasses may override this method to do appropriate
+ * cleanup.
+ *
+ * @param emitter
+ * The emitter that was used for output
+ */
+ public void cleanup(Emitter<T> emitter) {
+ }
+
+ /**
+ * Called during setup to pass the {@link TaskInputOutputContext} to this
+ * {@code DoFn} instance.
+ */
+ public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
+ this.context = context;
+ }
+
+ /**
+ * Returns an estimate of how applying this function to a {@link PCollection}
+ * will cause it to change in side. The optimizer uses these estimates to
+ * decide where to break up dependent MR jobs into separate Map and Reduce
+ * phases in order to minimize I/O.
+ *
+ * <p>
+ * Subclasses of {@code DoFn} that will substantially alter the size of the
+ * resulting {@code PCollection} should override this method.
+ */
+ public float scaleFactor() {
+ return 1.2f;
+ }
+
+ protected TaskInputOutputContext<?, ?, ?, ?> getContext() {
+ return context;
+ }
+
+ protected Configuration getConfiguration() {
+ return context.getConfiguration();
+ }
+
+ protected Counter getCounter(Enum<?> counterName) {
+ return context.getCounter(counterName);
+ }
+
+ protected Counter getCounter(String groupName, String counterName) {
+ return context.getCounter(groupName, counterName);
+ }
+
+ protected void increment(Enum<?> counterName) {
+ increment(counterName, 1);
+ }
+
+ protected void increment(Enum<?> counterName, long value) {
+ getCounter(counterName).increment(value);
+ }
+
+ protected void progress() {
+ context.progress();
+ }
+
+ protected TaskAttemptID getTaskAttemptID() {
+ return context.getTaskAttemptID();
+ }
+
+ protected void setStatus(String status) {
+ context.setStatus(status);
+ }
+
+ protected String getStatus() {
+ return context.getStatus();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/Emitter.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/Emitter.java b/crunch-core/src/main/java/org/apache/crunch/Emitter.java
new file mode 100644
index 0000000..d104a09
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/Emitter.java
@@ -0,0 +1,37 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+/**
+ * Interface for writing outputs from a {@link DoFn}.
+ *
+ * @param <T>
+ *          the type of value this emitter accepts
+ */
+public interface Emitter<T> {
+ /**
+ * Write the emitted value to the next stage of the pipeline.
+ *
+ * @param emitted
+ * The value to write
+ */
+ void emit(T emitted);
+
+ /**
+ * Flushes any values cached by this emitter. Called during the cleanup stage.
+ */
+ void flush();
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/FilterFn.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/FilterFn.java b/crunch-core/src/main/java/org/apache/crunch/FilterFn.java
new file mode 100644
index 0000000..440f122
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/FilterFn.java
@@ -0,0 +1,244 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import java.util.List;
+
+import org.apache.crunch.fn.FilterFns;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.TaskInputOutputContext;
+
+import com.google.common.collect.ImmutableList;
+
+/**
+ * A {@link DoFn} specialization that keeps or drops each member of a
+ * {@link PCollection} according to a boolean condition.
+ */
+public abstract class FilterFn<T> extends DoFn<T, T> {
+
+  /**
+   * Decides whether a record is kept.
+   *
+   * @param input
+   *          the record under test
+   * @return {@code true} to emit the record, {@code false} to drop it
+   */
+  public abstract boolean accept(T input);
+
+  @Override
+  public void process(T input, Emitter<T> emitter) {
+    if (!accept(input)) {
+      return;
+    }
+    emitter.emit(input);
+  }
+
+  /** Delegates to the no-argument {@link #cleanup()}; the emitter is not used. */
+  @Override
+  public final void cleanup(Emitter<T> emitter) {
+    cleanup();
+  }
+
+  /**
+   * Called during the cleanup of the MapReduce job this {@code FilterFn} is
+   * associated with. Subclasses may override this method; the default
+   * implementation does nothing.
+   */
+  public void cleanup() {
+  }
+
+  @Override
+  public float scaleFactor() {
+    return 0.5f;
+  }
+
+  /**
+   * Builds a filter that accepts a record only when every given filter does.
+   *
+   * @deprecated Use {@link FilterFns#and(FilterFn...)}
+   */
+  public static <S> FilterFn<S> and(FilterFn<S>... fns) {
+    return new AndFn<S>(fns);
+  }
+
+  /**
+   * Conjunction of filters. Lifecycle calls are forwarded to every member in
+   * the order given.
+   *
+   * @deprecated Use {@link FilterFns#and(FilterFn...)}
+   */
+  public static class AndFn<S> extends FilterFn<S> {
+
+    private final List<FilterFn<S>> fns;
+
+    public AndFn(FilterFn<S>... fns) {
+      this.fns = ImmutableList.<FilterFn<S>> copyOf(fns);
+    }
+
+    @Override
+    public void configure(Configuration conf) {
+      for (FilterFn<S> member : fns) {
+        member.configure(conf);
+      }
+    }
+
+    @Override
+    public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
+      for (FilterFn<S> member : fns) {
+        member.setContext(context);
+      }
+    }
+
+    @Override
+    public void initialize() {
+      for (FilterFn<S> member : fns) {
+        member.initialize();
+      }
+    }
+
+    @Override
+    public void cleanup() {
+      for (FilterFn<S> member : fns) {
+        member.cleanup();
+      }
+    }
+
+    @Override
+    public boolean accept(S input) {
+      // Stop at the first member that rejects; later members are not consulted.
+      boolean accepted = true;
+      for (FilterFn<S> member : fns) {
+        if (!member.accept(input)) {
+          accepted = false;
+          break;
+        }
+      }
+      return accepted;
+    }
+
+    /** Product of the members' scale factors. */
+    @Override
+    public float scaleFactor() {
+      float product = 1.0f;
+      for (FilterFn<S> member : fns) {
+        product = product * member.scaleFactor();
+      }
+      return product;
+    }
+  }
+
+  /**
+   * Builds a filter that accepts a record when at least one given filter does.
+   *
+   * @deprecated Use {@link FilterFns#or(FilterFn...)}
+   */
+  public static <S> FilterFn<S> or(FilterFn<S>... fns) {
+    return new OrFn<S>(fns);
+  }
+
+  /**
+   * Disjunction of filters. Lifecycle calls are forwarded to every member in
+   * the order given.
+   *
+   * @deprecated Use {@link FilterFns#or(FilterFn...)}
+   */
+  public static class OrFn<S> extends FilterFn<S> {
+
+    private final List<FilterFn<S>> fns;
+
+    public OrFn(FilterFn<S>... fns) {
+      this.fns = ImmutableList.<FilterFn<S>> copyOf(fns);
+    }
+
+    @Override
+    public void configure(Configuration conf) {
+      for (FilterFn<S> member : fns) {
+        member.configure(conf);
+      }
+    }
+
+    @Override
+    public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
+      for (FilterFn<S> member : fns) {
+        member.setContext(context);
+      }
+    }
+
+    @Override
+    public void initialize() {
+      for (FilterFn<S> member : fns) {
+        member.initialize();
+      }
+    }
+
+    @Override
+    public void cleanup() {
+      for (FilterFn<S> member : fns) {
+        member.cleanup();
+      }
+    }
+
+    @Override
+    public boolean accept(S input) {
+      // Stop at the first member that accepts; later members are not consulted.
+      boolean accepted = false;
+      for (FilterFn<S> member : fns) {
+        if (member.accept(input)) {
+          accepted = true;
+          break;
+        }
+      }
+      return accepted;
+    }
+
+    /** Sum of the members' scale factors, capped at {@code 1.0f}. */
+    @Override
+    public float scaleFactor() {
+      float sum = 0.0f;
+      for (FilterFn<S> member : fns) {
+        sum = sum + member.scaleFactor();
+      }
+      return Math.min(sum, 1.0f);
+    }
+  }
+
+  /**
+   * Builds a filter that accepts exactly the records the given filter rejects.
+   *
+   * @deprecated Use {@link FilterFns#not(FilterFn)}
+   */
+  public static <S> FilterFn<S> not(FilterFn<S> fn) {
+    return new NotFn<S>(fn);
+  }
+
+  /**
+   * Negation of a single filter. Lifecycle calls are forwarded to the wrapped
+   * filter.
+   *
+   * @deprecated Use {@link FilterFns#not(FilterFn)}
+   */
+  public static class NotFn<S> extends FilterFn<S> {
+
+    private final FilterFn<S> base;
+
+    public NotFn(FilterFn<S> base) {
+      this.base = base;
+    }
+
+    @Override
+    public void configure(Configuration conf) {
+      base.configure(conf);
+    }
+
+    @Override
+    public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
+      base.setContext(context);
+    }
+
+    @Override
+    public void initialize() {
+      base.initialize();
+    }
+
+    @Override
+    public void cleanup() {
+      base.cleanup();
+    }
+
+    @Override
+    public boolean accept(S input) {
+      boolean baseAccepts = base.accept(input);
+      return !baseAccepts;
+    }
+
+    /** Complement of the wrapped filter's scale factor. */
+    @Override
+    public float scaleFactor() {
+      float baseFactor = base.scaleFactor();
+      return 1.0f - baseFactor;
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/GroupingOptions.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/GroupingOptions.java b/crunch-core/src/main/java/org/apache/crunch/GroupingOptions.java
new file mode 100644
index 0000000..4aa1343
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/GroupingOptions.java
@@ -0,0 +1,167 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.hadoop.io.RawComparator;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Partitioner;
+
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+
+/**
+ * Options that can be passed to a {@code groupByKey} operation in order to
+ * exercise finer control over how the partitioning, grouping, and sorting of
+ * keys is performed.
+ *
+ * <p>
+ * Instances are created through {@link #builder()}.
+ * </p>
+ */
+public class GroupingOptions {
+
+  private final Class<? extends Partitioner> partitionerClass;
+  private final Class<? extends RawComparator> groupingComparatorClass;
+  private final Class<? extends RawComparator> sortComparatorClass;
+  /** Zero when the builder never set it; {@link #configure(Job)} then leaves the job's value alone. */
+  private final int numReducers;
+  private final Map<String, String> extraConf;
+  private final Set<SourceTarget<?>> sourceTargets;
+
+  private GroupingOptions(Class<? extends Partitioner> partitionerClass,
+      Class<? extends RawComparator> groupingComparatorClass, Class<? extends RawComparator> sortComparatorClass,
+      int numReducers, Map<String, String> extraConf, Set<SourceTarget<?>> sourceTargets) {
+    this.partitionerClass = partitionerClass;
+    this.groupingComparatorClass = groupingComparatorClass;
+    this.sortComparatorClass = sortComparatorClass;
+    this.numReducers = numReducers;
+    this.extraConf = extraConf;
+    this.sourceTargets = sourceTargets;
+  }
+
+  public int getNumReducers() {
+    return numReducers;
+  }
+
+  public Class<? extends RawComparator> getSortComparatorClass() {
+    return sortComparatorClass;
+  }
+
+  public Class<? extends RawComparator> getGroupingComparatorClass() {
+    return groupingComparatorClass;
+  }
+
+  public Class<? extends Partitioner> getPartitionerClass() {
+    return partitionerClass;
+  }
+
+  public Set<SourceTarget<?>> getSourceTargets() {
+    return sourceTargets;
+  }
+
+  /**
+   * Applies these options to the given job. Only values that were set are
+   * applied: classes that are {@code null} and a non-positive reducer count
+   * are skipped. Every extra configuration entry is copied into the job's
+   * configuration.
+   *
+   * @param job
+   *          the job to configure
+   */
+  public void configure(Job job) {
+    if (partitionerClass != null) {
+      job.setPartitionerClass(partitionerClass);
+    }
+    if (groupingComparatorClass != null) {
+      job.setGroupingComparatorClass(groupingComparatorClass);
+    }
+    if (sortComparatorClass != null) {
+      job.setSortComparatorClass(sortComparatorClass);
+    }
+    if (numReducers > 0) {
+      job.setNumReduceTasks(numReducers);
+    }
+    for (Map.Entry<String, String> e : extraConf.entrySet()) {
+      job.getConfiguration().set(e.getKey(), e.getValue());
+    }
+  }
+
+  /**
+   * Reports whether {@code other} uses the same partitioner, grouping
+   * comparator, sort comparator and extra configuration as this instance.
+   * The reducer count and source targets are not compared.
+   *
+   * @param other
+   *          the options to compare against
+   * @return {@code true} if all compared settings match
+   */
+  public boolean isCompatibleWith(GroupingOptions other) {
+    return partitionerClass == other.partitionerClass
+        && groupingComparatorClass == other.groupingComparatorClass
+        && sortComparatorClass == other.sortComparatorClass
+        && extraConf.equals(other.extraConf);
+  }
+
+  /** Returns a new, empty {@link Builder}. */
+  public static Builder builder() {
+    return new Builder();
+  }
+
+  /**
+   * Builder class for creating {@code GroupingOptions} instances.
+   */
+  public static class Builder {
+    private Class<? extends Partitioner> partitionerClass;
+    private Class<? extends RawComparator> groupingComparatorClass;
+    private Class<? extends RawComparator> sortComparatorClass;
+    private int numReducers;
+    private Map<String, String> extraConf = Maps.newHashMap();
+    private Set<SourceTarget<?>> sourceTargets = Sets.newHashSet();
+
+    public Builder() {
+    }
+
+    public Builder partitionerClass(Class<? extends Partitioner> partitionerClass) {
+      this.partitionerClass = partitionerClass;
+      return this;
+    }
+
+    public Builder groupingComparatorClass(Class<? extends RawComparator> groupingComparatorClass) {
+      this.groupingComparatorClass = groupingComparatorClass;
+      return this;
+    }
+
+    public Builder sortComparatorClass(Class<? extends RawComparator> sortComparatorClass) {
+      this.sortComparatorClass = sortComparatorClass;
+      return this;
+    }
+
+    /**
+     * @param numReducers
+     *          the number of reduce tasks; must be positive
+     * @throws IllegalArgumentException
+     *           if {@code numReducers} is zero or negative
+     */
+    public Builder numReducers(int numReducers) {
+      if (numReducers <= 0) {
+        throw new IllegalArgumentException("Invalid number of reducers: " + numReducers);
+      }
+      this.numReducers = numReducers;
+      return this;
+    }
+
+    /** Adds one entry to the extra job configuration, replacing any earlier value for the key. */
+    public Builder conf(String confKey, String confValue) {
+      this.extraConf.put(confKey, confValue);
+      return this;
+    }
+
+    public Builder sourceTarget(SourceTarget<?> st) {
+      this.sourceTargets.add(st);
+      return this;
+    }
+
+    /**
+     * Creates a {@code GroupingOptions} from the current builder state.
+     *
+     * @return a new instance that is unaffected by later calls on this builder
+     */
+    public GroupingOptions build() {
+      // Hand over copies of the collections. Sharing the builder's own map and
+      // set would let later conf()/sourceTarget() calls change options that
+      // were already built.
+      return new GroupingOptions(partitionerClass, groupingComparatorClass, sortComparatorClass,
+          numReducers, Maps.<String, String> newHashMap(extraConf),
+          Sets.<SourceTarget<?>> newHashSet(sourceTargets));
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/MapFn.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/MapFn.java b/crunch-core/src/main/java/org/apache/crunch/MapFn.java
new file mode 100644
index 0000000..dbf172e
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/MapFn.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+/**
+ * A {@link DoFn} for the common case of emitting exactly one value for each
+ * input record.
+ *
+ */
+public abstract class MapFn<S, T> extends DoFn<S, T> {
+
+ /**
+ * Maps the given input into an instance of the output type.
+ */
+ public abstract T map(S input);
+
+ @Override
+ public void process(S input, Emitter<T> emitter) {
+ emitter.emit(map(input));
+ }
+
+ @Override
+ public float scaleFactor() {
+ return 1.0f;
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/PCollection.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/PCollection.java b/crunch-core/src/main/java/org/apache/crunch/PCollection.java
new file mode 100644
index 0000000..6f5abf6
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/PCollection.java
@@ -0,0 +1,245 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import java.util.Collection;
+
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.PTypeFamily;
+
+/**
+ * A representation of an immutable, distributed collection of elements that is
+ * the fundamental target of computations in Crunch.
+ *
+ * @param <S> The type of the elements in this collection
+ */
+public interface PCollection<S> {
+ /**
+ * Returns the {@code Pipeline} associated with this PCollection.
+ */
+ Pipeline getPipeline();
+
+ /**
+ * Returns a {@code PCollection} instance that acts as the union of this
+ * {@code PCollection} and the given {@code PCollection}.
+ */
+ PCollection<S> union(PCollection<S> other);
+
+ /**
+ * Returns a {@code PCollection} instance that acts as the union of this
+ * {@code PCollection} and the input {@code PCollection}s.
+ */
+ PCollection<S> union(PCollection<S>... collections);
+
+ /**
+ * Applies the given doFn to the elements of this {@code PCollection} and
+ * returns a new {@code PCollection} that is the output of this processing.
+ *
+ * @param doFn
+ * The {@code DoFn} to apply
+ * @param type
+ * The {@link PType} of the resulting {@code PCollection}
+ * @return a new {@code PCollection}
+ */
+ <T> PCollection<T> parallelDo(DoFn<S, T> doFn, PType<T> type);
+
+ /**
+ * Applies the given doFn to the elements of this {@code PCollection} and
+ * returns a new {@code PCollection} that is the output of this processing.
+ *
+ * @param name
+ * An identifier for this processing step, useful for debugging
+ * @param doFn
+ * The {@code DoFn} to apply
+ * @param type
+ * The {@link PType} of the resulting {@code PCollection}
+ * @return a new {@code PCollection}
+ */
+ <T> PCollection<T> parallelDo(String name, DoFn<S, T> doFn, PType<T> type);
+
+ /**
+ * Applies the given doFn to the elements of this {@code PCollection} and
+ * returns a new {@code PCollection} that is the output of this processing.
+ *
+ * @param name
+ * An identifier for this processing step, useful for debugging
+ * @param doFn
+ * The {@code DoFn} to apply
+ * @param type
+ * The {@link PType} of the resulting {@code PCollection}
+ * @param options
+ * Optional information that is needed for certain pipeline operations
+ * @return a new {@code PCollection}
+ */
+ <T> PCollection<T> parallelDo(String name, DoFn<S, T> doFn, PType<T> type,
+ ParallelDoOptions options);
+
+ /**
+ * Similar to the {@code parallelDo} overloads that take a {@link PType}, but returns a
+ * {@code PTable} instance instead of a {@code PCollection}.
+ *
+ * @param doFn
+ * The {@code DoFn} to apply
+ * @param type
+ * The {@link PTableType} of the resulting {@code PTable}
+ * @return a new {@code PTable}
+ */
+ <K, V> PTable<K, V> parallelDo(DoFn<S, Pair<K, V>> doFn, PTableType<K, V> type);
+
+ /**
+ * Similar to the {@code parallelDo} overloads that take a {@link PType}, but returns a
+ * {@code PTable} instance instead of a {@code PCollection}.
+ *
+ * @param name
+ * An identifier for this processing step
+ * @param doFn
+ * The {@code DoFn} to apply
+ * @param type
+ * The {@link PTableType} of the resulting {@code PTable}
+ * @return a new {@code PTable}
+ */
+ <K, V> PTable<K, V> parallelDo(String name, DoFn<S, Pair<K, V>> doFn, PTableType<K, V> type);
+
+ /**
+ * Similar to the {@code parallelDo} overloads that take a {@link PType}, but returns a
+ * {@code PTable} instance instead of a {@code PCollection}.
+ *
+ * @param name
+ * An identifier for this processing step
+ * @param doFn
+ * The {@code DoFn} to apply
+ * @param type
+ * The {@link PTableType} of the resulting {@code PTable}
+ * @param options
+ * Optional information that is needed for certain pipeline operations
+ * @return a new {@code PTable}
+ */
+ <K, V> PTable<K, V> parallelDo(String name, DoFn<S, Pair<K, V>> doFn, PTableType<K, V> type,
+ ParallelDoOptions options);
+
+ /**
+ * Write the contents of this {@code PCollection} to the given {@code Target},
+ * using the storage format specified by the target.
+ *
+ * @param target
+ * The target to write to
+ */
+ PCollection<S> write(Target target);
+
+ /**
+ * Write the contents of this {@code PCollection} to the given {@code Target},
+ * using the given {@code Target.WriteMode} to handle existing
+ * targets.
+ *
+ * @param target
+ * The target
+ * @param writeMode
+ * The rule for handling existing outputs at the target location
+ */
+ PCollection<S> write(Target target, Target.WriteMode writeMode);
+
+ /**
+ * Returns a reference to the data set represented by this PCollection that
+ * may be used by the client to read the data locally.
+ */
+ Iterable<S> materialize();
+
+ /**
+ * @return A {@code PObject} encapsulating an in-memory {@link Collection} containing the values
+ * of this {@code PCollection}.
+ */
+ PObject<Collection<S>> asCollection();
+
+ /**
+ * Returns the {@code PType} of this {@code PCollection}.
+ */
+ PType<S> getPType();
+
+ /**
+ * Returns the {@code PTypeFamily} of this {@code PCollection}.
+ */
+ PTypeFamily getTypeFamily();
+
+ /**
+ * Returns the size of the data represented by this {@code PCollection} in
+ * bytes.
+ */
+ long getSize();
+
+ /**
+ * Returns the number of elements represented by this {@code PCollection}.
+ *
+ * @return A {@code PObject} containing the number of elements in this {@code PCollection}.
+ */
+ PObject<Long> length();
+
+ /**
+ * Returns a shorthand name for this PCollection.
+ */
+ String getName();
+
+ /**
+ * Apply the given filter function to this instance and return the resulting
+ * {@code PCollection}.
+ */
+ PCollection<S> filter(FilterFn<S> filterFn);
+
+ /**
+ * Apply the given filter function to this instance and return the resulting
+ * {@code PCollection}.
+ *
+ * @param name
+ * An identifier for this processing step
+ * @param filterFn
+ * The {@code FilterFn} to apply
+ */
+ PCollection<S> filter(String name, FilterFn<S> filterFn);
+
+ /**
+ * Apply the given map function to each element of this instance in order to
+ * create a {@code PTable} keyed by the function's results.
+ */
+ <K> PTable<K, S> by(MapFn<S, K> extractKeyFn, PType<K> keyType);
+
+ /**
+ * Apply the given map function to each element of this instance in order to
+ * create a {@code PTable} keyed by the function's results.
+ *
+ * @param name
+ * An identifier for this processing step
+ * @param extractKeyFn
+ * The {@code MapFn} to apply
+ */
+ <K> PTable<K, S> by(String name, MapFn<S, K> extractKeyFn, PType<K> keyType);
+
+ /**
+ * Returns a {@code PTable} instance that contains the counts of each unique
+ * element of this PCollection.
+ */
+ PTable<S, Long> count();
+
+ /**
+ * Returns a {@code PObject} of the maximum element of this instance.
+ */
+ PObject<S> max();
+
+ /**
+ * Returns a {@code PObject} of the minimum element of this instance.
+ */
+ PObject<S> min();
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/PGroupedTable.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/PGroupedTable.java b/crunch-core/src/main/java/org/apache/crunch/PGroupedTable.java
new file mode 100644
index 0000000..d77ffdb
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/PGroupedTable.java
@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import org.apache.crunch.Aggregator;
+
+/**
+ * The Crunch representation of a grouped {@link PTable}.
+ * Each element pairs a key with an {@code Iterable} of the values grouped under it.
+ */
+public interface PGroupedTable<K, V> extends PCollection<Pair<K, Iterable<V>>> {
+
+ /**
+ * Combines the values of this grouping using the given {@link CombineFn}.
+ *
+ * @param combineFn
+ * The combiner function
+ * @return A {@code PTable} where each key has a single value
+ */
+ PTable<K, V> combineValues(CombineFn<K, V> combineFn);
+
+ /**
+ * Combine the values in each group using the given {@link Aggregator}.
+ *
+ * @param aggregator The function to use
+ * @return A {@link PTable} where each group key maps to an aggregated
+ * value. Group keys may be repeated if an aggregator returns
+ * more than one value.
+ */
+ PTable<K, V> combineValues(Aggregator<V> aggregator);
+
+ /**
+ * Convert this grouping back into a multimap.
+ *
+ * @return an ungrouped version of the data in this {@code PGroupedTable}.
+ */
+ PTable<K, V> ungroup();
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/PObject.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/PObject.java b/crunch-core/src/main/java/org/apache/crunch/PObject.java
new file mode 100644
index 0000000..897a01f
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/PObject.java
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+/**
+ * A {@code PObject} represents a singleton object value that results from a distributed
+ * computation. Computation producing the value is deferred until
+ * {@link #getValue()} is called.
+ *
+ * @param <T> The type of value encapsulated by this {@code PObject}.
+ */
+public interface PObject<T> {
+ /**
+ * Gets the value associated with this {@code PObject}. Calling this method will trigger
+ * whatever computation is necessary to obtain the value and block until that computation
+ * succeeds. The method takes no arguments and declares no checked exceptions.
+ *
+ * @return The value associated with this {@code PObject}.
+ */
+ T getValue();
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/PTable.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/PTable.java b/crunch-core/src/main/java/org/apache/crunch/PTable.java
new file mode 100644
index 0000000..8df9853
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/PTable.java
@@ -0,0 +1,181 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import java.util.Collection;
+import java.util.Map;
+
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+
+/**
+ * A sub-interface of {@code PCollection} that represents an immutable,
+ * distributed multi-map of keys of type {@code K} and values of type {@code V}.
+ *
+ */
+public interface PTable<K, V> extends PCollection<Pair<K, V>> {
+
+ /**
+ * Returns a {@code PTable} instance that acts as the union of this
+ * {@code PTable} and the given {@code PTable}.
+ */
+ PTable<K, V> union(PTable<K, V> other);
+
+ /**
+ * Returns a {@code PTable} instance that acts as the union of this
+ * {@code PTable} and the input {@code PTable}s.
+ */
+ PTable<K, V> union(PTable<K, V>... others);
+
+ /**
+ * Performs a grouping operation on the keys of this table.
+ *
+ * @return a {@code PGroupedTable} instance that represents the grouping
+ */
+ PGroupedTable<K, V> groupByKey();
+
+ /**
+ * Performs a grouping operation on the keys of this table, using the given
+ * number of partitions.
+ *
+ * @param numPartitions
+ * The number of partitions for the data.
+ * @return a {@code PGroupedTable} instance that represents this grouping
+ */
+ PGroupedTable<K, V> groupByKey(int numPartitions);
+
+ /**
+ * Performs a grouping operation on the keys of this table, using the
+ * additional {@code GroupingOptions} to control how the grouping is executed.
+ *
+ * @param options
+ * The grouping options to use
+ * @return a {@code PGroupedTable} instance that represents the grouping
+ */
+ PGroupedTable<K, V> groupByKey(GroupingOptions options);
+
+ /**
+ * Writes this {@code PTable} to the given {@code Target}.
+ */
+ PTable<K, V> write(Target target);
+
+ /**
+ * Writes this {@code PTable} to the given {@code Target}, using the
+ * given {@code Target.WriteMode} to handle existing targets.
+ */
+ PTable<K, V> write(Target target, Target.WriteMode writeMode);
+
+ /**
+ * Returns the {@code PTableType} of this {@code PTable}.
+ */
+ PTableType<K, V> getPTableType();
+
+ /**
+ * Returns the {@code PType} of the key.
+ */
+ PType<K> getKeyType();
+
+ /**
+ * Returns the {@code PType} of the value.
+ */
+ PType<V> getValueType();
+
+ /**
+ * Aggregate all of the values with the same key into a single key-value pair
+ * in the returned PTable.
+ */
+ PTable<K, Collection<V>> collectValues();
+
+ /**
+ * Apply the given filter function to this instance and return the resulting
+ * {@code PTable}.
+ */
+ PTable<K, V> filter(FilterFn<Pair<K, V>> filterFn);
+
+ /**
+ * Apply the given filter function to this instance and return the resulting
+ * {@code PTable}.
+ *
+ * @param name
+ * An identifier for this processing step
+ * @param filterFn
+ * The {@code FilterFn} to apply
+ */
+ PTable<K, V> filter(String name, FilterFn<Pair<K, V>> filterFn);
+
+ /**
+ * Returns a PTable made up of the pairs in this PTable with the largest value
+ * field.
+ *
+ * @param count
+ * The number of pairs to return
+ */
+ PTable<K, V> top(int count);
+
+ /**
+ * Returns a PTable made up of the pairs in this PTable with the smallest
+ * value field.
+ *
+ * @param count
+ * The number of pairs to return
+ */
+ PTable<K, V> bottom(int count);
+
+ /**
+ * Perform an inner join on this table and the one passed in as an argument on
+ * their common keys.
+ */
+ <U> PTable<K, Pair<V, U>> join(PTable<K, U> other);
+
+ /**
+ * Co-group operation with the given table on common keys.
+ */
+ <U> PTable<K, Pair<Collection<V>, Collection<U>>> cogroup(PTable<K, U> other);
+
+ /**
+ * Returns a {@link PCollection} made up of the keys in this PTable.
+ */
+ PCollection<K> keys();
+
+ /**
+ * Returns a {@link PCollection} made up of the values in this PTable.
+ */
+ PCollection<V> values();
+
+ /**
+ * Returns a {@code Map<K, V>} made up of the keys and values in this PTable.
+ * <p>
+ * <b>Note:</b> The contents of the returned map may not be exactly the same
+ * as this PTable, as a PTable is a multi-map (i.e. can contain multiple
+ * values for a single key).
+ */
+ Map<K, V> materializeToMap();
+
+ /**
+ * Returns a {@link PObject} encapsulating a {@link Map} made up of the keys and values in this
+ * {@code PTable}.
+ * <p><b>Note:</b> The contents of the returned map may not be exactly the same as this PTable,
+ * as a PTable is a multi-map (i.e. can contain multiple values for a single key).
+ * </p>
+ *
+ * @return The {@code PObject} encapsulating a {@code Map} made up of the keys and values in
+ * this {@code PTable}.
+ */
+ PObject<Map<K, V>> asMap();
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/Pair.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/Pair.java b/crunch-core/src/main/java/org/apache/crunch/Pair.java
new file mode 100644
index 0000000..fd058b6
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/Pair.java
@@ -0,0 +1,105 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import org.apache.commons.lang.builder.HashCodeBuilder;
+
+/**
+ * A convenience class for two-element {@link Tuple}s, ordered by first and then second element.
+ */
+public class Pair<K, V> implements Tuple, Comparable<Pair<K, V>> {
+
+  private final K first;
+  private final V second;
+
+  /** Creates a {@code Pair} holding the given elements; no null checks are applied. */
+  public static <T, U> Pair<T, U> of(T first, U second) {
+    return new Pair<T, U>(first, second);
+  }
+
+  public Pair(K first, V second) {
+    this.first = first;
+    this.second = second;
+  }
+
+  public K first() {
+    return first;
+  }
+
+  public V second() {
+    return second;
+  }
+
+  /** Returns the element at {@code index} (0 = first, 1 = second); other indices throw. */
+  public Object get(int index) {
+    switch (index) {
+    case 0:
+      return first;
+    case 1:
+      return second;
+    default:
+      throw new ArrayIndexOutOfBoundsException();
+    }
+  }
+
+  public int size() {
+    return 2;
+  }
+
+  @Override
+  public int hashCode() {
+    return new HashCodeBuilder().append(first).append(second).toHashCode();
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (obj == null || getClass() != obj.getClass()) {
+      return false;
+    }
+    Pair<?, ?> other = (Pair<?, ?>) obj;
+    return (first == other.first || (first != null && first.equals(other.first)))
+        && (second == other.second || (second != null && second.equals(other.second)));
+  }
+
+  @Override
+  public String toString() {
+    return "[" + first + "," + second + "]";
+  }
+
+  // Orders null before non-null, Comparables by natural order, and everything else by
+  // hash code, compared without the overflow that plain subtraction would risk.
+  private int cmp(Object lhs, Object rhs) {
+    if (lhs == rhs) {
+      return 0;
+    } else if (lhs == null || rhs == null) {
+      return lhs == null ? -1 : 1;
+    } else if (lhs instanceof Comparable) {
+      return ((Comparable) lhs).compareTo(rhs);
+    }
+    return Integer.valueOf(lhs.hashCode()).compareTo(rhs.hashCode());
+  }
+
+  @Override
+  public int compareTo(Pair<K, V> o) {
+    int diff = cmp(first, o.first);
+    return diff != 0 ? diff : cmp(second, o.second);
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/ParallelDoOptions.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/ParallelDoOptions.java b/crunch-core/src/main/java/org/apache/crunch/ParallelDoOptions.java
new file mode 100644
index 0000000..2407b3a
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/ParallelDoOptions.java
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import java.util.Collections;
+import java.util.Set;
+
+import com.google.common.collect.Sets;
+
+/**
+ * Container class that includes optional information about a {@code parallelDo} operation
+ * applied to a {@code PCollection}. Primarily used within the Crunch framework
+ * itself for certain types of advanced processing operations, such as in-memory joins
+ * that require reading a file from the filesystem into a {@code DoFn}.
+ */
+public class ParallelDoOptions {
+  private final Set<SourceTarget<?>> sourceTargets;
+
+  private ParallelDoOptions(Set<SourceTarget<?>> sourceTargets) {
+    this.sourceTargets = sourceTargets;
+  }
+
+  /** Returns the {@code SourceTarget}s that were registered through the {@link Builder}. */
+  public Set<SourceTarget<?>> getSourceTargets() {
+    return sourceTargets;
+  }
+
+  public static Builder builder() {
+    return new Builder();
+  }
+
+  /** Collects {@code SourceTarget}s and creates {@code ParallelDoOptions} instances from them. */
+  public static class Builder {
+    private final Set<SourceTarget<?>> sourceTargets = Sets.newHashSet();
+
+    /** Adds the given targets to this builder and returns the builder for chaining. */
+    public Builder sourceTargets(SourceTarget<?>... sourceTargets) {
+      Collections.addAll(this.sourceTargets, sourceTargets);
+      return this;
+    }
+
+    /** Builds options from a copy of the targets, so later builder changes do not leak in. */
+    public ParallelDoOptions build() {
+      return new ParallelDoOptions(Sets.newHashSet(sourceTargets));
+    }
+  }
+}
[12/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/collect/PCollectionImpl.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/collect/PCollectionImpl.java b/crunch/src/main/java/org/apache/crunch/impl/mr/collect/PCollectionImpl.java
deleted file mode 100644
index 6ea9c4c..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/collect/PCollectionImpl.java
+++ /dev/null
@@ -1,295 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.collect;
-
-import java.util.Collection;
-import java.util.Collections;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.crunch.DoFn;
-import org.apache.crunch.FilterFn;
-import org.apache.crunch.MapFn;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PObject;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.ParallelDoOptions;
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.SourceTarget;
-import org.apache.crunch.Target;
-import org.apache.crunch.fn.ExtractKeyFn;
-import org.apache.crunch.fn.IdentityFn;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.impl.mr.plan.DoNode;
-import org.apache.crunch.lib.Aggregate;
-import org.apache.crunch.materialize.pobject.CollectionPObject;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.PTypeFamily;
-
-import com.google.common.collect.Lists;
-import com.google.common.collect.Sets;
-
-public abstract class PCollectionImpl<S> implements PCollection<S> {
-
- private static final Log LOG = LogFactory.getLog(PCollectionImpl.class);
-
- private final String name;
- protected MRPipeline pipeline;
- protected SourceTarget<S> materializedAt;
- private final ParallelDoOptions options;
-
- public PCollectionImpl(String name) {
- this(name, ParallelDoOptions.builder().build());
- }
-
- public PCollectionImpl(String name, ParallelDoOptions options) {
- this.name = name;
- this.options = options;
- }
-
- @Override
- public String getName() {
- return name;
- }
-
- @Override
- public String toString() {
- return getName();
- }
-
- @Override
- public PCollection<S> union(PCollection<S> other) {
- return union(new PCollection[] { other });
- }
-
- @Override
- public PCollection<S> union(PCollection<S>... collections) {
- List<PCollectionImpl<S>> internal = Lists.newArrayList();
- internal.add(this);
- for (PCollection<S> collection : collections) {
- internal.add((PCollectionImpl<S>) collection.parallelDo(IdentityFn.<S>getInstance(), collection.getPType()));
- }
- return new UnionCollection<S>(internal);
- }
-
- @Override
- public <T> PCollection<T> parallelDo(DoFn<S, T> fn, PType<T> type) {
- MRPipeline pipeline = (MRPipeline) getPipeline();
- return parallelDo("S" + pipeline.getNextAnonymousStageId(), fn, type);
- }
-
- @Override
- public <T> PCollection<T> parallelDo(String name, DoFn<S, T> fn, PType<T> type) {
- return new DoCollectionImpl<T>(name, getChainingCollection(), fn, type);
- }
-
- @Override
- public <T> PCollection<T> parallelDo(String name, DoFn<S, T> fn, PType<T> type,
- ParallelDoOptions options) {
- return new DoCollectionImpl<T>(name, getChainingCollection(), fn, type, options);
- }
-
- @Override
- public <K, V> PTable<K, V> parallelDo(DoFn<S, Pair<K, V>> fn, PTableType<K, V> type) {
- MRPipeline pipeline = (MRPipeline) getPipeline();
- return parallelDo("S" + pipeline.getNextAnonymousStageId(), fn, type);
- }
-
- @Override
- public <K, V> PTable<K, V> parallelDo(String name, DoFn<S, Pair<K, V>> fn, PTableType<K, V> type) {
- return new DoTableImpl<K, V>(name, getChainingCollection(), fn, type);
- }
-
- @Override
- public <K, V> PTable<K, V> parallelDo(String name, DoFn<S, Pair<K, V>> fn, PTableType<K, V> type,
- ParallelDoOptions options) {
- return new DoTableImpl<K, V>(name, getChainingCollection(), fn, type, options);
- }
-
- public PCollection<S> write(Target target) {
- if (materializedAt != null) {
- getPipeline().write(new InputCollection<S>(materializedAt, (MRPipeline) getPipeline()), target);
- } else {
- getPipeline().write(this, target);
- }
- return this;
- }
-
- @Override
- public PCollection<S> write(Target target, Target.WriteMode writeMode) {
- if (materializedAt != null) {
- getPipeline().write(new InputCollection<S>(materializedAt, (MRPipeline) getPipeline()), target,
- writeMode);
- } else {
- getPipeline().write(this, target, writeMode);
- }
- return this;
- }
-
- @Override
- public Iterable<S> materialize() {
- if (getSize() == 0) {
- LOG.warn("Materializing an empty PCollection: " + this.getName());
- return Collections.emptyList();
- }
- return getPipeline().materialize(this);
- }
-
- /** {@inheritDoc} */
- @Override
- public PObject<Collection<S>> asCollection() {
- return new CollectionPObject<S>(this);
- }
-
- public SourceTarget<S> getMaterializedAt() {
- return materializedAt;
- }
-
- public void materializeAt(SourceTarget<S> sourceTarget) {
- this.materializedAt = sourceTarget;
- }
-
- @Override
- public PCollection<S> filter(FilterFn<S> filterFn) {
- return parallelDo(filterFn, getPType());
- }
-
- @Override
- public PCollection<S> filter(String name, FilterFn<S> filterFn) {
- return parallelDo(name, filterFn, getPType());
- }
-
- @Override
- public <K> PTable<K, S> by(MapFn<S, K> mapFn, PType<K> keyType) {
- return parallelDo(new ExtractKeyFn<K, S>(mapFn), getTypeFamily().tableOf(keyType, getPType()));
- }
-
- @Override
- public <K> PTable<K, S> by(String name, MapFn<S, K> mapFn, PType<K> keyType) {
- return parallelDo(name, new ExtractKeyFn<K, S>(mapFn), getTypeFamily().tableOf(keyType, getPType()));
- }
-
- @Override
- public PTable<S, Long> count() {
- return Aggregate.count(this);
- }
-
- @Override
- public PObject<Long> length() {
- return Aggregate.length(this);
- }
-
- @Override
- public PObject<S> max() {
- return Aggregate.max(this);
- }
-
- @Override
- public PObject<S> min() {
- return Aggregate.min(this);
- }
-
- @Override
- public PTypeFamily getTypeFamily() {
- return getPType().getFamily();
- }
-
- public abstract DoNode createDoNode();
-
- public abstract List<PCollectionImpl<?>> getParents();
-
- public PCollectionImpl<?> getOnlyParent() {
- List<PCollectionImpl<?>> parents = getParents();
- if (parents.size() != 1) {
- throw new IllegalArgumentException("Expected exactly one parent PCollection");
- }
- return parents.get(0);
- }
-
- @Override
- public Pipeline getPipeline() {
- if (pipeline == null) {
- pipeline = (MRPipeline) getParents().get(0).getPipeline();
- }
- return pipeline;
- }
-
- public Set<SourceTarget<?>> getTargetDependencies() {
- Set<SourceTarget<?>> targetDeps = options.getSourceTargets();
- for (PCollectionImpl<?> parent : getParents()) {
- targetDeps = Sets.union(targetDeps, parent.getTargetDependencies());
- }
- return targetDeps;
- }
-
- public int getDepth() {
- int parentMax = 0;
- for (PCollectionImpl parent : getParents()) {
- parentMax = Math.max(parent.getDepth(), parentMax);
- }
- return 1 + parentMax;
- }
-
- public interface Visitor {
- void visitInputCollection(InputCollection<?> collection);
-
- void visitUnionCollection(UnionCollection<?> collection);
-
- void visitDoFnCollection(DoCollectionImpl<?> collection);
-
- void visitDoTable(DoTableImpl<?, ?> collection);
-
- void visitGroupedTable(PGroupedTableImpl<?, ?> collection);
- }
-
- public void accept(Visitor visitor) {
- if (materializedAt != null) {
- visitor.visitInputCollection(new InputCollection<S>(materializedAt, (MRPipeline) getPipeline()));
- } else {
- acceptInternal(visitor);
- }
- }
-
- protected abstract void acceptInternal(Visitor visitor);
-
- @Override
- public long getSize() {
- if (materializedAt != null) {
- long sz = materializedAt.getSize(getPipeline().getConfiguration());
- if (sz > 0) {
- return sz;
- }
- }
- return getSizeInternal();
- }
-
- protected abstract long getSizeInternal();
-
- /**
- * Retrieve the PCollectionImpl to be used for chaining within PCollectionImpls further down the pipeline.
- * @return The PCollectionImpl instance to be chained
- */
- protected PCollectionImpl<S> getChainingCollection(){
- return this;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/collect/PGroupedTableImpl.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/collect/PGroupedTableImpl.java b/crunch/src/main/java/org/apache/crunch/impl/mr/collect/PGroupedTableImpl.java
deleted file mode 100644
index ccac5d5..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/collect/PGroupedTableImpl.java
+++ /dev/null
@@ -1,144 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.collect;
-
-import java.util.List;
-import java.util.Set;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.crunch.Aggregator;
-import org.apache.crunch.CombineFn;
-import org.apache.crunch.DoFn;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.GroupingOptions;
-import org.apache.crunch.PGroupedTable;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.SourceTarget;
-import org.apache.crunch.fn.Aggregators;
-import org.apache.crunch.impl.mr.plan.DoNode;
-import org.apache.crunch.types.PGroupedTableType;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.util.PartitionUtils;
-import org.apache.hadoop.mapreduce.Job;
-
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.ImmutableSet;
-import com.google.common.collect.Sets;
-
-public class PGroupedTableImpl<K, V> extends PCollectionImpl<Pair<K, Iterable<V>>> implements PGroupedTable<K, V> {
-
- private static final Log LOG = LogFactory.getLog(PGroupedTableImpl.class);
-
- private final PTableBase<K, V> parent;
- private final GroupingOptions groupingOptions;
- private final PGroupedTableType<K, V> ptype;
-
- PGroupedTableImpl(PTableBase<K, V> parent) {
- this(parent, null);
- }
-
- PGroupedTableImpl(PTableBase<K, V> parent, GroupingOptions groupingOptions) {
- super("GBK");
- this.parent = parent;
- this.groupingOptions = groupingOptions;
- this.ptype = parent.getPTableType().getGroupedTableType();
- }
-
- public void configureShuffle(Job job) {
- ptype.configureShuffle(job, groupingOptions);
- if (groupingOptions == null || groupingOptions.getNumReducers() <= 0) {
- int numReduceTasks = PartitionUtils.getRecommendedPartitions(this, getPipeline().getConfiguration());
- if (numReduceTasks > 0) {
- job.setNumReduceTasks(numReduceTasks);
- LOG.info(String.format("Setting num reduce tasks to %d", numReduceTasks));
- } else {
- LOG.warn("Attempted to set a negative number of reduce tasks");
- }
- }
- }
-
- @Override
- protected long getSizeInternal() {
- return parent.getSizeInternal();
- }
-
- @Override
- public PType<Pair<K, Iterable<V>>> getPType() {
- return ptype;
- }
-
- @Override
- public PTable<K, V> combineValues(CombineFn<K, V> combineFn) {
- return new DoTableImpl<K, V>("combine", getChainingCollection(), combineFn, parent.getPTableType());
- }
-
- @Override
- public PTable<K, V> combineValues(Aggregator<V> agg) {
- return combineValues(Aggregators.<K, V>toCombineFn(agg));
- }
-
- private static class Ungroup<K, V> extends DoFn<Pair<K, Iterable<V>>, Pair<K, V>> {
- @Override
- public void process(Pair<K, Iterable<V>> input, Emitter<Pair<K, V>> emitter) {
- for (V v : input.second()) {
- emitter.emit(Pair.of(input.first(), v));
- }
- }
- }
-
- public PTable<K, V> ungroup() {
- return parallelDo("ungroup", new Ungroup<K, V>(), parent.getPTableType());
- }
-
- @Override
- protected void acceptInternal(PCollectionImpl.Visitor visitor) {
- visitor.visitGroupedTable(this);
- }
-
- @Override
- public Set<SourceTarget<?>> getTargetDependencies() {
- Set<SourceTarget<?>> td = Sets.newHashSet(super.getTargetDependencies());
- if (groupingOptions != null) {
- td.addAll(groupingOptions.getSourceTargets());
- }
- return ImmutableSet.copyOf(td);
- }
-
- @Override
- public List<PCollectionImpl<?>> getParents() {
- return ImmutableList.<PCollectionImpl<?>> of(parent);
- }
-
- @Override
- public DoNode createDoNode() {
- return DoNode.createFnNode(getName(), ptype.getInputMapFn(), ptype);
- }
-
- public DoNode getGroupingNode() {
- return DoNode.createGroupingNode("", ptype);
- }
-
- @Override
- protected PCollectionImpl<Pair<K, Iterable<V>>> getChainingCollection() {
- // Use a copy for chaining to allow sending the output of a single grouped table to multiple outputs
- // TODO This should be implemented in a cleaner way in the planner
- return new PGroupedTableImpl<K, V>(parent, groupingOptions);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/collect/PTableBase.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/collect/PTableBase.java b/crunch/src/main/java/org/apache/crunch/impl/mr/collect/PTableBase.java
deleted file mode 100644
index 3c2393d..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/collect/PTableBase.java
+++ /dev/null
@@ -1,169 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.collect;
-
-import java.util.Collection;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.crunch.FilterFn;
-import org.apache.crunch.GroupingOptions;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PObject;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.ParallelDoOptions;
-import org.apache.crunch.TableSource;
-import org.apache.crunch.Target;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.lib.Aggregate;
-import org.apache.crunch.lib.Cogroup;
-import org.apache.crunch.lib.Join;
-import org.apache.crunch.lib.PTables;
-import org.apache.crunch.materialize.MaterializableMap;
-import org.apache.crunch.materialize.pobject.MapPObject;
-import org.apache.crunch.types.PType;
-
-import com.google.common.collect.Lists;
-
-abstract class PTableBase<K, V> extends PCollectionImpl<Pair<K, V>> implements PTable<K, V> {
-
- public PTableBase(String name) {
- super(name);
- }
-
- public PTableBase(String name, ParallelDoOptions options) {
- super(name, options);
- }
-
- public PType<K> getKeyType() {
- return getPTableType().getKeyType();
- }
-
- public PType<V> getValueType() {
- return getPTableType().getValueType();
- }
-
- public PGroupedTableImpl<K, V> groupByKey() {
- return new PGroupedTableImpl<K, V>(this);
- }
-
- public PGroupedTableImpl<K, V> groupByKey(int numReduceTasks) {
- return new PGroupedTableImpl<K, V>(this, GroupingOptions.builder().numReducers(numReduceTasks).build());
- }
-
- public PGroupedTableImpl<K, V> groupByKey(GroupingOptions groupingOptions) {
- return new PGroupedTableImpl<K, V>(this, groupingOptions);
- }
-
- @Override
- public PTable<K, V> union(PTable<K, V> other) {
- return union(new PTable[] { other });
- }
-
- @Override
- public PTable<K, V> union(PTable<K, V>... others) {
- List<PTableBase<K, V>> internal = Lists.newArrayList();
- internal.add(this);
- for (PTable<K, V> table : others) {
- internal.add((PTableBase<K, V>) table);
- }
- return new UnionTable<K, V>(internal);
- }
-
- @Override
- public PTable<K, V> write(Target target) {
- if (getMaterializedAt() != null) {
- getPipeline().write(new InputTable<K, V>(
- (TableSource<K, V>) getMaterializedAt(), (MRPipeline) getPipeline()), target);
- } else {
- getPipeline().write(this, target);
- }
- return this;
- }
-
- @Override
- public PTable<K, V> write(Target target, Target.WriteMode writeMode) {
- if (getMaterializedAt() != null) {
- getPipeline().write(new InputTable<K, V>(
- (TableSource<K, V>) getMaterializedAt(), (MRPipeline) getPipeline()), target, writeMode);
- } else {
- getPipeline().write(this, target, writeMode);
- }
- return this;
- }
-
- @Override
- public PTable<K, V> filter(FilterFn<Pair<K, V>> filterFn) {
- return parallelDo(filterFn, getPTableType());
- }
-
- @Override
- public PTable<K, V> filter(String name, FilterFn<Pair<K, V>> filterFn) {
- return parallelDo(name, filterFn, getPTableType());
- }
-
- @Override
- public PTable<K, V> top(int count) {
- return Aggregate.top(this, count, true);
- }
-
- @Override
- public PTable<K, V> bottom(int count) {
- return Aggregate.top(this, count, false);
- }
-
- @Override
- public PTable<K, Collection<V>> collectValues() {
- return Aggregate.collectValues(this);
- }
-
- @Override
- public <U> PTable<K, Pair<V, U>> join(PTable<K, U> other) {
- return Join.join(this, other);
- }
-
- @Override
- public <U> PTable<K, Pair<Collection<V>, Collection<U>>> cogroup(PTable<K, U> other) {
- return Cogroup.cogroup(this, other);
- }
-
- @Override
- public PCollection<K> keys() {
- return PTables.keys(this);
- }
-
- @Override
- public PCollection<V> values() {
- return PTables.values(this);
- }
-
- /**
- * Returns a Map<K, V> made up of the keys and values in this PTable.
- */
- @Override
- public Map<K, V> materializeToMap() {
- return new MaterializableMap<K, V>(this.materialize());
- }
-
- /** {@inheritDoc} */
- @Override
- public PObject<Map<K, V>> asMap() {
- return new MapPObject<K, V>(this);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/collect/UnionCollection.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/collect/UnionCollection.java b/crunch/src/main/java/org/apache/crunch/impl/mr/collect/UnionCollection.java
deleted file mode 100644
index 7b3dd7b..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/collect/UnionCollection.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.collect;
-
-import java.util.List;
-
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.impl.mr.plan.DoNode;
-import org.apache.crunch.types.PType;
-
-import com.google.common.collect.ImmutableList;
-
-public class UnionCollection<S> extends PCollectionImpl<S> {
-
- private List<PCollectionImpl<S>> parents;
- private long size = 0;
-
- private static <S> String flatName(List<PCollectionImpl<S>> collections) {
- StringBuilder sb = new StringBuilder("union(");
- for (int i = 0; i < collections.size(); i++) {
- if (i != 0) {
- sb.append(',');
- }
- sb.append(collections.get(i).getName());
- }
- return sb.append(')').toString();
- }
-
- UnionCollection(List<PCollectionImpl<S>> collections) {
- super(flatName(collections));
- this.parents = ImmutableList.copyOf(collections);
- this.pipeline = (MRPipeline) parents.get(0).getPipeline();
- for (PCollectionImpl<S> parent : parents) {
- if (this.pipeline != parent.getPipeline()) {
- throw new IllegalStateException("Cannot union PCollections from different Pipeline instances");
- }
- size += parent.getSize();
- }
- }
-
- @Override
- protected long getSizeInternal() {
- return size;
- }
-
- @Override
- protected void acceptInternal(PCollectionImpl.Visitor visitor) {
- visitor.visitUnionCollection(this);
- }
-
- @Override
- public PType<S> getPType() {
- return parents.get(0).getPType();
- }
-
- @Override
- public List<PCollectionImpl<?>> getParents() {
- return ImmutableList.<PCollectionImpl<?>> copyOf(parents);
- }
-
- @Override
- public DoNode createDoNode() {
- throw new UnsupportedOperationException("Unioned collection does not support DoNodes");
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/collect/UnionTable.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/collect/UnionTable.java b/crunch/src/main/java/org/apache/crunch/impl/mr/collect/UnionTable.java
deleted file mode 100644
index a369432..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/collect/UnionTable.java
+++ /dev/null
@@ -1,92 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.collect;
-
-import java.util.List;
-
-import org.apache.crunch.Pair;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.impl.mr.plan.DoNode;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Lists;
-
-public class UnionTable<K, V> extends PTableBase<K, V> {
-
- private PTableType<K, V> ptype;
- private List<PCollectionImpl<Pair<K, V>>> parents;
- private long size;
-
- private static <K, V> String flatName(List<PTableBase<K, V>> tables) {
- StringBuilder sb = new StringBuilder("union(");
- for (int i = 0; i < tables.size(); i++) {
- if (i != 0) {
- sb.append(',');
- }
- sb.append(tables.get(i).getName());
- }
- return sb.append(')').toString();
- }
-
- public UnionTable(List<PTableBase<K, V>> tables) {
- super(flatName(tables));
- this.ptype = tables.get(0).getPTableType();
- this.pipeline = (MRPipeline) tables.get(0).getPipeline();
- this.parents = Lists.newArrayList();
- for (PTableBase<K, V> parent : tables) {
- if (pipeline != parent.getPipeline()) {
- throw new IllegalStateException("Cannot union PTables from different Pipeline instances");
- }
- this.parents.add(parent);
- size += parent.getSize();
- }
- }
-
- @Override
- protected long getSizeInternal() {
- return size;
- }
-
- @Override
- public PTableType<K, V> getPTableType() {
- return ptype;
- }
-
- @Override
- public PType<Pair<K, V>> getPType() {
- return ptype;
- }
-
- @Override
- public List<PCollectionImpl<?>> getParents() {
- return ImmutableList.<PCollectionImpl<?>> copyOf(parents);
- }
-
- @Override
- protected void acceptInternal(PCollectionImpl.Visitor visitor) {
- visitor.visitUnionCollection(new UnionCollection<Pair<K, V>>(parents));
- }
-
- @Override
- public DoNode createDoNode() {
- throw new UnsupportedOperationException("Unioned table does not support do nodes");
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/emit/IntermediateEmitter.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/emit/IntermediateEmitter.java b/crunch/src/main/java/org/apache/crunch/impl/mr/emit/IntermediateEmitter.java
deleted file mode 100644
index b6df98b..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/emit/IntermediateEmitter.java
+++ /dev/null
@@ -1,64 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.emit;
-
-import java.util.List;
-
-import org.apache.crunch.DoFn;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.impl.mr.run.RTNode;
-import org.apache.crunch.types.PType;
-import org.apache.hadoop.conf.Configuration;
-
-import com.google.common.collect.ImmutableList;
-
-/**
- * An {@link Emitter} implementation that links the output of one {@link DoFn} to the input of
- * another {@code DoFn}.
- *
- */
-public class IntermediateEmitter implements Emitter<Object> {
-
- private final List<RTNode> children;
- private final Configuration conf;
- private final PType<Object> outputPType;
- private final boolean needDetachedValues;
-
- public IntermediateEmitter(PType<Object> outputPType, List<RTNode> children, Configuration conf) {
- this.outputPType = outputPType;
- this.children = ImmutableList.copyOf(children);
- this.conf = conf;
-
- outputPType.initialize(conf);
- needDetachedValues = this.children.size() > 1;
- }
-
- public void emit(Object emitted) {
- for (RTNode child : children) {
- Object value = emitted;
- if (needDetachedValues) {
- value = this.outputPType.getDetachedValue(emitted);
- }
- child.process(value);
- }
- }
-
- public void flush() {
- // No-op
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/emit/MultipleOutputEmitter.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/emit/MultipleOutputEmitter.java b/crunch/src/main/java/org/apache/crunch/impl/mr/emit/MultipleOutputEmitter.java
deleted file mode 100644
index 2e58fed..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/emit/MultipleOutputEmitter.java
+++ /dev/null
@@ -1,56 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.emit;
-
-import java.io.IOException;
-
-import org.apache.crunch.CrunchRuntimeException;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.io.CrunchOutputs;
-import org.apache.crunch.types.Converter;
-
-public class MultipleOutputEmitter<T, K, V> implements Emitter<T> {
-
- private final Converter converter;
- private final CrunchOutputs<K, V> outputs;
- private final String outputName;
-
- public MultipleOutputEmitter(Converter converter, CrunchOutputs<K, V> outputs,
- String outputName) {
- this.converter = converter;
- this.outputs = outputs;
- this.outputName = outputName;
- }
-
- @Override
- public void emit(T emitted) {
- try {
- this.outputs.write(outputName,
- (K) converter.outputKey(emitted),
- (V) converter.outputValue(emitted));
- } catch (Exception e) {
- throw new CrunchRuntimeException(e);
- }
- }
-
- @Override
- public void flush() {
- // No-op
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/emit/OutputEmitter.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/emit/OutputEmitter.java b/crunch/src/main/java/org/apache/crunch/impl/mr/emit/OutputEmitter.java
deleted file mode 100644
index bc3ae0d..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/emit/OutputEmitter.java
+++ /dev/null
@@ -1,52 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.emit;
-
-import java.io.IOException;
-
-import org.apache.crunch.CrunchRuntimeException;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.types.Converter;
-import org.apache.hadoop.mapreduce.TaskInputOutputContext;
-
-public class OutputEmitter<T, K, V> implements Emitter<T> {
-
- private final Converter<K, V, Object, Object> converter;
- private final TaskInputOutputContext<?, ?, K, V> context;
-
- public OutputEmitter(Converter<K, V, Object, Object> converter, TaskInputOutputContext<?, ?, K, V> context) {
- this.converter = converter;
- this.context = context;
- }
-
- public void emit(T emitted) {
- try {
- K key = converter.outputKey(emitted);
- V value = converter.outputValue(emitted);
- this.context.write(key, value);
- } catch (IOException e) {
- throw new CrunchRuntimeException(e);
- } catch (InterruptedException e) {
- throw new CrunchRuntimeException(e);
- }
- }
-
- public void flush() {
- // No-op
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/exec/CappedExponentialCounter.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/exec/CappedExponentialCounter.java b/crunch/src/main/java/org/apache/crunch/impl/mr/exec/CappedExponentialCounter.java
deleted file mode 100644
index d90f2e8..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/exec/CappedExponentialCounter.java
+++ /dev/null
@@ -1,40 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.exec;
-
-/**
- * Generate a series of capped numbers exponentially.
- *
- * It is used for creating retry intervals. It is NOT thread-safe.
- */
-public class CappedExponentialCounter {
-
- private long current;
- private final long limit;
-
- public CappedExponentialCounter(long start, long limit) {
- this.current = start;
- this.limit = limit;
- }
-
- public long get() {
- long result = current;
- current = Math.min(current * 2, limit);
- return result;
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/exec/CrunchJobHooks.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/exec/CrunchJobHooks.java b/crunch/src/main/java/org/apache/crunch/impl/mr/exec/CrunchJobHooks.java
deleted file mode 100644
index 74bc9ac..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/exec/CrunchJobHooks.java
+++ /dev/null
@@ -1,153 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.exec;
-
-import java.io.IOException;
-import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.crunch.hadoop.mapreduce.lib.jobcontrol.CrunchControlledJob;
-import org.apache.crunch.impl.mr.plan.PlanningParameters;
-import org.apache.crunch.impl.mr.run.RuntimeParameters;
-import org.apache.crunch.io.FileNamingScheme;
-import org.apache.crunch.io.PathTarget;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.FileUtil;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-
-public final class CrunchJobHooks {
-
- private CrunchJobHooks() {}
-
- /** Creates missing input directories before job is submitted. */
- public static final class PrepareHook implements CrunchControlledJob.Hook {
- private final Job job;
-
- public PrepareHook(Job job) {
- this.job = job;
- }
-
- @Override
- public void run() throws IOException {
- Configuration conf = job.getConfiguration();
- if (conf.getBoolean(RuntimeParameters.CREATE_DIR, false)) {
- Path[] inputPaths = FileInputFormat.getInputPaths(job);
- for (Path inputPath : inputPaths) {
- FileSystem fs = inputPath.getFileSystem(conf);
- if (!fs.exists(inputPath)) {
- try {
- fs.mkdirs(inputPath);
- } catch (IOException e) {
- }
- }
- }
- }
- }
- }
-
- /** Moving output files produced by the MapReduce job to specified directories. */
- public static final class CompletionHook implements CrunchControlledJob.Hook {
- private final Job job;
- private final Path workingPath;
- private final Map<Integer, PathTarget> multiPaths;
- private final boolean mapOnlyJob;
-
- public CompletionHook(Job job, Path workingPath, Map<Integer, PathTarget> multiPaths, boolean mapOnlyJob) {
- this.job = job;
- this.workingPath = workingPath;
- this.multiPaths = multiPaths;
- this.mapOnlyJob = mapOnlyJob;
- }
-
- @Override
- public void run() throws IOException {
- handleMultiPaths();
- }
-
- private synchronized void handleMultiPaths() throws IOException {
- if (!multiPaths.isEmpty()) {
- // Need to handle moving the data from the output directory of the
- // job to the output locations specified in the paths.
- FileSystem srcFs = workingPath.getFileSystem(job.getConfiguration());
- for (Map.Entry<Integer, PathTarget> entry : multiPaths.entrySet()) {
- final int i = entry.getKey();
- final Path dst = entry.getValue().getPath();
- FileNamingScheme fileNamingScheme = entry.getValue().getFileNamingScheme();
-
- Path src = new Path(workingPath, PlanningParameters.MULTI_OUTPUT_PREFIX + i + "-*");
- Path[] srcs = FileUtil.stat2Paths(srcFs.globStatus(src), src);
- Configuration conf = job.getConfiguration();
- FileSystem dstFs = dst.getFileSystem(conf);
- if (!dstFs.exists(dst)) {
- dstFs.mkdirs(dst);
- }
- boolean sameFs = isCompatible(srcFs, dst);
- for (Path s : srcs) {
- Path d = getDestFile(conf, s, dst, fileNamingScheme);
- if (sameFs) {
- srcFs.rename(s, d);
- } else {
- FileUtil.copy(srcFs, s, dstFs, d, true, true, job.getConfiguration());
- }
- }
- }
- }
- }
-
- private boolean isCompatible(FileSystem fs, Path path) {
- try {
- fs.makeQualified(path);
- return true;
- } catch (IllegalArgumentException e) {
- return false;
- }
- }
- private Path getDestFile(Configuration conf, Path src, Path dir, FileNamingScheme fileNamingScheme)
- throws IOException {
- String outputFilename = null;
- if (mapOnlyJob) {
- outputFilename = fileNamingScheme.getMapOutputName(conf, dir);
- } else {
- outputFilename = fileNamingScheme.getReduceOutputName(conf, dir, extractPartitionNumber(src.getName()));
- }
- if (src.getName().endsWith(org.apache.avro.mapred.AvroOutputFormat.EXT)) {
- outputFilename += org.apache.avro.mapred.AvroOutputFormat.EXT;
- }
- return new Path(dir, outputFilename);
- }
- }
-
- /**
- * Extract the partition number from a raw reducer output filename.
- *
- * @param reduceOutputFileName The raw reducer output file name
- * @return The partition number encoded in the filename
- */
- static int extractPartitionNumber(String reduceOutputFileName) {
- Matcher matcher = Pattern.compile(".*-r-(\\d{5})").matcher(reduceOutputFileName);
- if (matcher.find()) {
- return Integer.parseInt(matcher.group(1), 10);
- } else {
- throw new IllegalArgumentException("Reducer output name '" + reduceOutputFileName + "' cannot be parsed");
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/exec/MRExecutor.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/exec/MRExecutor.java b/crunch/src/main/java/org/apache/crunch/impl/mr/exec/MRExecutor.java
deleted file mode 100644
index 4c7b7ea..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/exec/MRExecutor.java
+++ /dev/null
@@ -1,198 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.exec;
-
-import java.io.IOException;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.concurrent.CountDownLatch;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.atomic.AtomicReference;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.crunch.PipelineExecution;
-import org.apache.crunch.PipelineResult;
-import org.apache.crunch.SourceTarget;
-import org.apache.crunch.Target;
-import org.apache.crunch.hadoop.mapreduce.lib.jobcontrol.CrunchControlledJob;
-import org.apache.crunch.hadoop.mapreduce.lib.jobcontrol.CrunchJobControl;
-import org.apache.crunch.impl.mr.collect.PCollectionImpl;
-import org.apache.crunch.materialize.MaterializableIterable;
-import org.apache.hadoop.conf.Configuration;
-
-import com.google.common.collect.Lists;
-
-/**
- * Provides APIs for job control at runtime to clients.
- *
- * This class has a thread that submits jobs when they become ready, monitors
- * the states of the running jobs, and updates the states of jobs based on the
- * state changes of their depending jobs states.
- *
- * It is thread-safe.
- */
-public class MRExecutor implements PipelineExecution {
-
- private static final Log LOG = LogFactory.getLog(MRExecutor.class);
-
- private final CrunchJobControl control;
- private final Map<PCollectionImpl<?>, Set<Target>> outputTargets;
- private final Map<PCollectionImpl<?>, MaterializableIterable> toMaterialize;
- private final CountDownLatch doneSignal = new CountDownLatch(1);
- private final CountDownLatch killSignal = new CountDownLatch(1);
- private final CappedExponentialCounter pollInterval;
- private AtomicReference<Status> status = new AtomicReference<Status>(Status.READY);
- private PipelineResult result;
- private Thread monitorThread;
-
- private String planDotFile;
-
- public MRExecutor(Class<?> jarClass, Map<PCollectionImpl<?>, Set<Target>> outputTargets,
- Map<PCollectionImpl<?>, MaterializableIterable> toMaterialize) {
- this.control = new CrunchJobControl(jarClass.toString());
- this.outputTargets = outputTargets;
- this.toMaterialize = toMaterialize;
- this.monitorThread = new Thread(new Runnable() {
- @Override
- public void run() {
- monitorLoop();
- }
- });
- this.pollInterval = isLocalMode()
- ? new CappedExponentialCounter(50, 1000)
- : new CappedExponentialCounter(500, 10000);
- }
-
- public void addJob(CrunchControlledJob job) {
- this.control.addJob(job);
- }
-
- public void setPlanDotFile(String planDotFile) {
- this.planDotFile = planDotFile;
- }
-
- public PipelineExecution execute() {
- monitorThread.start();
- return this;
- }
-
- /** Monitors running status. It is called in {@code MonitorThread}. */
- private void monitorLoop() {
- try {
- while (killSignal.getCount() > 0 && !control.allFinished()) {
- control.pollJobStatusAndStartNewOnes();
- killSignal.await(pollInterval.get(), TimeUnit.MILLISECONDS);
- }
- control.killAllRunningJobs();
-
- List<CrunchControlledJob> failures = control.getFailedJobList();
- if (!failures.isEmpty()) {
- System.err.println(failures.size() + " job failure(s) occurred:");
- for (CrunchControlledJob job : failures) {
- System.err.println(job.getJobName() + "(" + job.getJobID() + "): " + job.getMessage());
- }
- }
- List<PipelineResult.StageResult> stages = Lists.newArrayList();
- for (CrunchControlledJob job : control.getSuccessfulJobList()) {
- stages.add(new PipelineResult.StageResult(job.getJobName(), job.getJob().getCounters()));
- }
-
- for (PCollectionImpl<?> c : outputTargets.keySet()) {
- if (toMaterialize.containsKey(c)) {
- MaterializableIterable iter = toMaterialize.get(c);
- if (iter.isSourceTarget()) {
- iter.materialize();
- c.materializeAt((SourceTarget) iter.getSource());
- }
- } else {
- boolean materialized = false;
- for (Target t : outputTargets.get(c)) {
- if (!materialized) {
- if (t instanceof SourceTarget) {
- c.materializeAt((SourceTarget) t);
- materialized = true;
- } else {
- SourceTarget st = t.asSourceTarget(c.getPType());
- if (st != null) {
- c.materializeAt(st);
- materialized = true;
- }
- }
- }
- }
- }
- }
-
- synchronized (this) {
- result = new PipelineResult(stages);
- if (killSignal.getCount() == 0) {
- status.set(Status.KILLED);
- } else {
- status.set(result.succeeded() ? Status.SUCCEEDED : Status.FAILED);
- }
- }
- } catch (InterruptedException e) {
- throw new AssertionError(e); // Nobody should interrupt us.
- } catch (IOException e) {
- LOG.error("Pipeline failed due to exception", e);
- status.set(Status.FAILED);
- } finally {
- doneSignal.countDown();
- }
- }
-
- @Override
- public String getPlanDotFile() {
- return planDotFile;
- }
-
- @Override
- public void waitFor(long timeout, TimeUnit timeUnit) throws InterruptedException {
- doneSignal.await(timeout, timeUnit);
- }
-
- @Override
- public void waitUntilDone() throws InterruptedException {
- doneSignal.await();
- }
-
- @Override
- public synchronized Status getStatus() {
- return status.get();
- }
-
- @Override
- public synchronized PipelineResult getResult() {
- return result;
- }
-
- @Override
- public void kill() throws InterruptedException {
- killSignal.countDown();
- }
-
- private static boolean isLocalMode() {
- Configuration conf = new Configuration();
- // Try to handle MapReduce version 0.20 or 0.22
- String jobTrackerAddress = conf.get("mapreduce.jobtracker.address",
- conf.get("mapred.job.tracker", "local"));
- return "local".equals(jobTrackerAddress);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/package-info.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/package-info.java b/crunch/src/main/java/org/apache/crunch/impl/mr/package-info.java
deleted file mode 100644
index 7e403c3..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * A Pipeline implementation that runs on Hadoop MapReduce.
- */
-package org.apache.crunch.impl.mr;
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/plan/DoNode.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/plan/DoNode.java b/crunch/src/main/java/org/apache/crunch/impl/mr/plan/DoNode.java
deleted file mode 100644
index 865369c..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/plan/DoNode.java
+++ /dev/null
@@ -1,163 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.plan;
-
-import java.util.List;
-
-import org.apache.commons.lang.builder.HashCodeBuilder;
-import org.apache.crunch.DoFn;
-import org.apache.crunch.Source;
-import org.apache.crunch.impl.mr.run.NodeContext;
-import org.apache.crunch.impl.mr.run.RTNode;
-import org.apache.crunch.types.Converter;
-import org.apache.crunch.types.PGroupedTableType;
-import org.apache.crunch.types.PType;
-import org.apache.hadoop.conf.Configuration;
-
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Lists;
-
-public class DoNode {
-
- private static final List<DoNode> NO_CHILDREN = ImmutableList.of();
-
- private final DoFn fn;
- private final String name;
- private final PType<?> ptype;
- private final List<DoNode> children;
- private final Converter outputConverter;
- private final Source<?> source;
- private String outputName;
-
- private DoNode(DoFn fn, String name, PType<?> ptype, List<DoNode> children, Converter outputConverter,
- Source<?> source) {
- this.fn = fn;
- this.name = name;
- this.ptype = ptype;
- this.children = children;
- this.outputConverter = outputConverter;
- this.source = source;
- }
-
- private static List<DoNode> allowsChildren() {
- return Lists.newArrayList();
- }
-
- public static <K, V> DoNode createGroupingNode(String name, PGroupedTableType<K, V> ptype) {
- DoFn<?, ?> fn = ptype.getOutputMapFn();
- return new DoNode(fn, name, ptype, NO_CHILDREN, ptype.getGroupingConverter(), null);
- }
-
- public static <S> DoNode createOutputNode(String name, PType<S> ptype) {
- Converter outputConverter = ptype.getConverter();
- DoFn<?, ?> fn = ptype.getOutputMapFn();
- return new DoNode(fn, name, ptype, NO_CHILDREN, outputConverter, null);
- }
-
- public static DoNode createFnNode(String name, DoFn<?, ?> function, PType<?> ptype) {
- return new DoNode(function, name, ptype, allowsChildren(), null, null);
- }
-
- public static <S> DoNode createInputNode(Source<S> source) {
- PType<?> ptype = source.getType();
- DoFn<?, ?> fn = ptype.getInputMapFn();
- return new DoNode(fn, source.toString(), ptype, allowsChildren(), null, source);
- }
-
- public boolean isInputNode() {
- return source != null;
- }
-
- public boolean isOutputNode() {
- return outputConverter != null;
- }
-
- public String getName() {
- return name;
- }
-
- public List<DoNode> getChildren() {
- return children;
- }
-
- public Source<?> getSource() {
- return source;
- }
-
- public PType<?> getPType() {
- return ptype;
- }
-
- public DoNode addChild(DoNode node) {
- // TODO: This is sort of terrible, refactor the code to make this make more sense.
- boolean exists = false;
- for (DoNode child : children) {
- if (node == child) {
- exists = true;
- break;
- }
- }
- if (!exists) {
- children.add(node);
- }
- return this;
- }
-
- public void setOutputName(String outputName) {
- if (outputConverter == null) {
- throw new IllegalStateException("Cannot set output name w/o output converter: " + outputName);
- }
- this.outputName = outputName;
- }
-
- public RTNode toRTNode(boolean inputNode, Configuration conf, NodeContext nodeContext) {
- List<RTNode> childRTNodes = Lists.newArrayList();
- fn.configure(conf);
- for (DoNode child : children) {
- childRTNodes.add(child.toRTNode(false, conf, nodeContext));
- }
-
- Converter inputConverter = null;
- if (inputNode) {
- if (nodeContext == NodeContext.MAP) {
- inputConverter = ptype.getConverter();
- } else {
- inputConverter = ((PGroupedTableType<?, ?>) ptype).getGroupingConverter();
- }
- }
- return new RTNode(fn, (PType<Object>) getPType(), name, childRTNodes, inputConverter, outputConverter, outputName);
- }
-
- @Override
- public boolean equals(Object other) {
- if (other == null || !(other instanceof DoNode)) {
- return false;
- }
- if (this == other) {
- return true;
- }
- DoNode o = (DoNode) other;
- return (name.equals(o.name) && fn.equals(o.fn) && source == o.source && outputConverter == o.outputConverter);
- }
-
- @Override
- public int hashCode() {
- HashCodeBuilder hcb = new HashCodeBuilder();
- return hcb.append(name).append(fn).append(source).append(outputConverter).toHashCode();
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/plan/DotfileWriter.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/plan/DotfileWriter.java b/crunch/src/main/java/org/apache/crunch/impl/mr/plan/DotfileWriter.java
deleted file mode 100644
index 46d8c53..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/plan/DotfileWriter.java
+++ /dev/null
@@ -1,238 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.plan;
-
-import java.util.List;
-import java.util.Set;
-
-import org.apache.crunch.Pair;
-import org.apache.crunch.Target;
-import org.apache.crunch.impl.mr.collect.InputCollection;
-import org.apache.crunch.impl.mr.collect.PCollectionImpl;
-import org.apache.crunch.impl.mr.collect.PGroupedTableImpl;
-
-import com.google.common.base.Joiner;
-import com.google.common.collect.HashMultimap;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Sets;
-
-/**
- * Writes <a href="http://www.graphviz.org">Graphviz</a> dot files to illustrate
- * the topology of Crunch pipelines.
- */
-public class DotfileWriter {
-
- /** The types of tasks within a MapReduce job. */
- enum MRTaskType { MAP, REDUCE };
-
- private Set<JobPrototype> jobPrototypes = Sets.newHashSet();
- private HashMultimap<Pair<JobPrototype, MRTaskType>, String> jobNodeDeclarations = HashMultimap.create();
- private Set<String> globalNodeDeclarations = Sets.newHashSet();
- private Set<String> nodePathChains = Sets.newHashSet();
-
- /**
- * Format the declaration of a node based on a PCollection.
- *
- * @param pcollectionImpl PCollection for which a node will be declared
- * @param jobPrototype The job containing the PCollection
- * @return The node declaration
- */
- String formatPCollectionNodeDeclaration(PCollectionImpl<?> pcollectionImpl, JobPrototype jobPrototype) {
- String shape = "box";
- if (pcollectionImpl instanceof InputCollection) {
- shape = "folder";
- }
- return String.format("%s [label=\"%s\" shape=%s];", formatPCollection(pcollectionImpl, jobPrototype), pcollectionImpl.getName(),
- shape);
- }
-
- /**
- * Format a Target as a node declaration.
- *
- * @param target A Target used within a MapReduce pipeline
- * @return The global node declaration for the Target
- */
- String formatTargetNodeDeclaration(Target target) {
- return String.format("\"%s\" [label=\"%s\" shape=folder];", target.toString(), target.toString());
- }
-
- /**
- * Format a PCollectionImpl into a format to be used for dot files.
- *
- * @param pcollectionImpl The PCollectionImpl to be formatted
- * @param jobPrototype The job containing the PCollection
- * @return The dot file formatted representation of the PCollectionImpl
- */
- String formatPCollection(PCollectionImpl<?> pcollectionImpl, JobPrototype jobPrototype) {
- if (pcollectionImpl instanceof InputCollection) {
- InputCollection<?> inputCollection = (InputCollection<?>) pcollectionImpl;
- return String.format("\"%s\"", inputCollection.getSource());
- }
- return String.format("\"%s@%d@%d\"", pcollectionImpl.getName(), pcollectionImpl.hashCode(), jobPrototype.hashCode());
- }
-
- /**
- * Format a collection of node strings into dot file syntax.
- *
- * @param nodeCollection Collection of chained node strings
- * @return The dot-formatted chain of nodes
- */
- String formatNodeCollection(List<String> nodeCollection) {
- return String.format("%s;", Joiner.on(" -> ").join(nodeCollection));
- }
-
- /**
- * Format a NodePath in dot file syntax.
- *
- * @param nodePath The node path to be formatted
- * @param jobPrototype The job containing the NodePath
- * @return The dot file representation of the node path
- */
- List<String> formatNodePath(NodePath nodePath, JobPrototype jobPrototype) {
- List<String> formattedNodePaths = Lists.newArrayList();
-
- List<PCollectionImpl<?>> pcollections = Lists.newArrayList(nodePath);
- for (int collectionIndex = 1; collectionIndex < pcollections.size(); collectionIndex++){
- String fromNode = formatPCollection(pcollections.get(collectionIndex - 1), jobPrototype);
- String toNode = formatPCollection(pcollections.get(collectionIndex), jobPrototype);
- formattedNodePaths.add(formatNodeCollection(Lists.newArrayList(fromNode, toNode)));
- }
- return formattedNodePaths;
- }
-
- /**
- * Add a NodePath to be formatted as a list of node declarations within a
- * single job.
- *
- * @param jobPrototype The job containing the node path
- * @param nodePath The node path to be formatted
- */
- void addNodePathDeclarations(JobPrototype jobPrototype, NodePath nodePath) {
- boolean groupingEncountered = false;
- for (PCollectionImpl<?> pcollectionImpl : nodePath) {
- if (pcollectionImpl instanceof InputCollection) {
- globalNodeDeclarations.add(formatPCollectionNodeDeclaration(pcollectionImpl, jobPrototype));
- } else {
- if (!groupingEncountered){
- groupingEncountered = (pcollectionImpl instanceof PGroupedTableImpl);
- }
-
- MRTaskType taskType = groupingEncountered ? MRTaskType.REDUCE : MRTaskType.MAP;
- jobNodeDeclarations.put(Pair.of(jobPrototype, taskType), formatPCollectionNodeDeclaration(pcollectionImpl, jobPrototype));
- }
- }
- }
-
- /**
- * Add the chaining of a NodePath to the graph.
- *
- * @param nodePath The path to be formatted as a node chain in the dot file
- * @param jobPrototype The job containing the NodePath
- */
- void addNodePathChain(NodePath nodePath, JobPrototype jobPrototype) {
- for (String nodePathChain : formatNodePath(nodePath, jobPrototype)){
- this.nodePathChains.add(nodePathChain);
- }
- }
-
- /**
- * Get the graph attributes for a task-specific subgraph.
- *
- * @param taskType The type of task in the subgraph
- * @return Graph attributes
- */
- String getTaskGraphAttributes(MRTaskType taskType) {
- if (taskType == MRTaskType.MAP) {
- return "label = Map; color = blue;";
- } else {
- return "label = Reduce; color = red;";
- }
- }
-
- /**
- * Add the contents of a {@link JobPrototype} to the graph describing a
- * pipeline.
- *
- * @param jobPrototype A JobPrototype representing a portion of a MapReduce
- * pipeline
- */
- public void addJobPrototype(JobPrototype jobPrototype) {
- jobPrototypes.add(jobPrototype);
- if (!jobPrototype.isMapOnly()) {
- for (NodePath nodePath : jobPrototype.getMapNodePaths()) {
- addNodePathDeclarations(jobPrototype, nodePath);
- addNodePathChain(nodePath, jobPrototype);
- }
- }
-
- HashMultimap<Target, NodePath> targetsToNodePaths = jobPrototype.getTargetsToNodePaths();
- for (Target target : targetsToNodePaths.keySet()) {
- globalNodeDeclarations.add(formatTargetNodeDeclaration(target));
- for (NodePath nodePath : targetsToNodePaths.get(target)) {
- addNodePathDeclarations(jobPrototype, nodePath);
- addNodePathChain(nodePath, jobPrototype);
- nodePathChains.add(formatNodeCollection(Lists.newArrayList(formatPCollection(nodePath.descendingIterator()
- .next(), jobPrototype), String.format("\"%s\"", target.toString()))));
- }
- }
- }
-
- /**
- * Build up the full dot file containing the description of a MapReduce
- * pipeline.
- *
- * @return Graphviz dot file contents
- */
- public String buildDotfile() {
- StringBuilder stringBuilder = new StringBuilder();
- stringBuilder.append("digraph G {\n");
- int clusterIndex = 0;
-
- for (String globalDeclaration : globalNodeDeclarations) {
- stringBuilder.append(String.format(" %s\n", globalDeclaration));
- }
-
- for (JobPrototype jobPrototype : jobPrototypes){
- StringBuilder jobProtoStringBuilder = new StringBuilder();
- jobProtoStringBuilder.append(String.format(" subgraph cluster%d {\n", clusterIndex++));
- for (MRTaskType taskType : MRTaskType.values()){
- Pair<JobPrototype,MRTaskType> jobTaskKey = Pair.of(jobPrototype, taskType);
- if (jobNodeDeclarations.containsKey(jobTaskKey)){
- jobProtoStringBuilder.append(String.format(" subgraph cluster%d {\n", clusterIndex++));
- jobProtoStringBuilder.append(String.format(" %s\n", getTaskGraphAttributes(taskType)));
- for (String declarationEntry : jobNodeDeclarations.get(jobTaskKey)){
- jobProtoStringBuilder.append(String.format(" %s\n", declarationEntry));
- }
- jobProtoStringBuilder.append(" }\n");
- }
- }
- jobProtoStringBuilder.append(" }\n");
- stringBuilder.append(jobProtoStringBuilder.toString());
- }
-
- for (String nodePathChain : nodePathChains) {
- stringBuilder.append(String.format(" %s\n", nodePathChain));
- }
-
- stringBuilder.append("}\n");
- return stringBuilder.toString();
- }
-
-
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/plan/Edge.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/plan/Edge.java b/crunch/src/main/java/org/apache/crunch/impl/mr/plan/Edge.java
deleted file mode 100644
index 1e59df0..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/plan/Edge.java
+++ /dev/null
@@ -1,125 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.plan;
-
-import java.util.Collection;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.commons.lang.builder.HashCodeBuilder;
-import org.apache.commons.lang.builder.ReflectionToStringBuilder;
-import org.apache.commons.lang.builder.ToStringStyle;
-import org.apache.crunch.impl.mr.collect.PCollectionImpl;
-import org.apache.crunch.impl.mr.collect.PGroupedTableImpl;
-
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Sets;
-
-/**
- *
- */
-class Edge {
- private final Vertex head;
- private final Vertex tail;
- private final Set<NodePath> paths;
-
- public Edge(Vertex head, Vertex tail) {
- this.head = head;
- this.tail = tail;
- this.paths = Sets.newHashSet();
- }
-
- public Vertex getHead() {
- return head;
- }
-
- public Vertex getTail() {
- return tail;
- }
-
- public void addNodePath(NodePath path) {
- this.paths.add(path);
- }
-
- public void addAllNodePaths(Collection<NodePath> paths) {
- this.paths.addAll(paths);
- }
-
- public Set<NodePath> getNodePaths() {
- return paths;
- }
-
- public PCollectionImpl getSplit() {
- List<Iterator<PCollectionImpl<?>>> iters = Lists.newArrayList();
- for (NodePath nodePath : paths) {
- Iterator<PCollectionImpl<?>> iter = nodePath.iterator();
- iter.next(); // prime this past the initial NGroupedTableImpl
- iters.add(iter);
- }
-
- // Find the lowest point w/the lowest cost to be the split point for
- // all of the dependent paths.
- boolean end = false;
- int splitIndex = -1;
- while (!end) {
- splitIndex++;
- PCollectionImpl<?> current = null;
- for (Iterator<PCollectionImpl<?>> iter : iters) {
- if (iter.hasNext()) {
- PCollectionImpl<?> next = iter.next();
- if (next instanceof PGroupedTableImpl) {
- end = true;
- break;
- } else if (current == null) {
- current = next;
- } else if (current != next) {
- end = true;
- break;
- }
- } else {
- end = true;
- break;
- }
- }
- }
- // TODO: Add costing calcs here.
-
- return Iterables.getFirst(paths, null).get(splitIndex);
- }
-
- @Override
- public boolean equals(Object other) {
- if (other == null || !(other instanceof Edge)) {
- return false;
- }
- Edge e = (Edge) other;
- return head.equals(e.head) && tail.equals(e.tail) && paths.equals(e.paths);
- }
-
- @Override
- public int hashCode() {
- return new HashCodeBuilder().append(head).append(tail).toHashCode();
- }
-
- @Override
- public String toString() {
- return ReflectionToStringBuilder.toString(this, ToStringStyle.SHORT_PREFIX_STYLE);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/plan/Graph.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/plan/Graph.java b/crunch/src/main/java/org/apache/crunch/impl/mr/plan/Graph.java
deleted file mode 100644
index ce0a847..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/plan/Graph.java
+++ /dev/null
@@ -1,133 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.plan;
-
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.crunch.Pair;
-import org.apache.crunch.impl.mr.collect.PCollectionImpl;
-
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-import com.google.common.collect.Sets;
-
-/**
- *
- */
-class Graph implements Iterable<Vertex> {
-
- private final Map<PCollectionImpl, Vertex> vertices;
- private final Map<Pair<Vertex, Vertex>, Edge> edges;
- private final Map<Vertex, List<Vertex>> dependencies;
-
- public Graph() {
- this.vertices = Maps.newHashMap();
- this.edges = Maps.newHashMap();
- this.dependencies = Maps.newHashMap();
- }
-
- public Vertex getVertexAt(PCollectionImpl impl) {
- return vertices.get(impl);
- }
-
- public Vertex addVertex(PCollectionImpl impl, boolean output) {
- if (vertices.containsKey(impl)) {
- Vertex v = vertices.get(impl);
- if (output) {
- v.setOutput();
- }
- return v;
- }
- Vertex v = new Vertex(impl);
- vertices.put(impl, v);
- if (output) {
- v.setOutput();
- }
- return v;
- }
-
- public Edge getEdge(Vertex head, Vertex tail) {
- Pair<Vertex, Vertex> p = Pair.of(head, tail);
- if (edges.containsKey(p)) {
- return edges.get(p);
- }
-
- Edge e = new Edge(head, tail);
- edges.put(p, e);
- tail.addIncoming(e);
- head.addOutgoing(e);
- return e;
- }
-
- @Override
- public Iterator<Vertex> iterator() {
- return Sets.newHashSet(vertices.values()).iterator();
- }
-
- public Set<Edge> getAllEdges() {
- return Sets.newHashSet(edges.values());
- }
-
- public void markDependency(Vertex child, Vertex parent) {
- List<Vertex> parents = dependencies.get(child);
- if (parents == null) {
- parents = Lists.newArrayList();
- dependencies.put(child, parents);
- }
- parents.add(parent);
- }
-
- public List<Vertex> getParents(Vertex child) {
- if (dependencies.containsKey(child)) {
- return dependencies.get(child);
- }
- return ImmutableList.of();
- }
-
- public List<List<Vertex>> connectedComponents() {
- List<List<Vertex>> components = Lists.newArrayList();
- Set<Vertex> unassigned = Sets.newHashSet(vertices.values());
- while (!unassigned.isEmpty()) {
- Vertex base = unassigned.iterator().next();
- List<Vertex> component = Lists.newArrayList();
- component.add(base);
- unassigned.remove(base);
- Set<Vertex> working = Sets.newHashSet(base.getAllNeighbors());
- while (!working.isEmpty()) {
- Vertex n = working.iterator().next();
- working.remove(n);
- if (unassigned.contains(n)) {
- component.add(n);
- unassigned.remove(n);
- for (Vertex n2 : n.getAllNeighbors()) {
- if (unassigned.contains(n2)) {
- working.add(n2);
- }
- }
- }
- }
- components.add(component);
- }
-
- return components;
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/plan/GraphBuilder.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/plan/GraphBuilder.java b/crunch/src/main/java/org/apache/crunch/impl/mr/plan/GraphBuilder.java
deleted file mode 100644
index 925c39a..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/plan/GraphBuilder.java
+++ /dev/null
@@ -1,92 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.plan;
-
-import org.apache.crunch.impl.mr.collect.DoCollectionImpl;
-import org.apache.crunch.impl.mr.collect.DoTableImpl;
-import org.apache.crunch.impl.mr.collect.InputCollection;
-import org.apache.crunch.impl.mr.collect.PCollectionImpl;
-import org.apache.crunch.impl.mr.collect.PGroupedTableImpl;
-import org.apache.crunch.impl.mr.collect.UnionCollection;
-
-/**
- *
- */
-class GraphBuilder implements PCollectionImpl.Visitor {
-
- private Graph graph = new Graph();
- private Vertex workingVertex;
- private NodePath workingPath;
-
- public Graph getGraph() {
- return graph;
- }
-
- public void visitOutput(PCollectionImpl<?> output) {
- workingVertex = graph.addVertex(output, true);
- workingPath = new NodePath();
- output.accept(this);
- }
-
- @Override
- public void visitInputCollection(InputCollection<?> collection) {
- Vertex v = graph.addVertex(collection, false);
- graph.getEdge(v, workingVertex).addNodePath(workingPath.close(collection));
- }
-
- @Override
- public void visitUnionCollection(UnionCollection<?> collection) {
- Vertex baseVertex = workingVertex;
- NodePath basePath = workingPath;
- for (PCollectionImpl<?> parent : collection.getParents()) {
- workingPath = new NodePath(basePath);
- workingVertex = baseVertex;
- processParent(parent);
- }
- }
-
- @Override
- public void visitDoFnCollection(DoCollectionImpl<?> collection) {
- workingPath.push(collection);
- processParent(collection.getOnlyParent());
- }
-
- @Override
- public void visitDoTable(DoTableImpl<?, ?> collection) {
- workingPath.push(collection);
- processParent(collection.getOnlyParent());
- }
-
- @Override
- public void visitGroupedTable(PGroupedTableImpl<?, ?> collection) {
- Vertex v = graph.addVertex(collection, false);
- graph.getEdge(v, workingVertex).addNodePath(workingPath.close(collection));
- workingVertex = v;
- workingPath = new NodePath(collection);
- processParent(collection.getOnlyParent());
- }
-
- private void processParent(PCollectionImpl<?> parent) {
- Vertex v = graph.getVertexAt(parent);
- if (v == null) {
- parent.accept(this);
- } else {
- graph.getEdge(v, workingVertex).addNodePath(workingPath.close(parent));
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/plan/JobNameBuilder.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/plan/JobNameBuilder.java b/crunch/src/main/java/org/apache/crunch/impl/mr/plan/JobNameBuilder.java
deleted file mode 100644
index 9ad7300..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/plan/JobNameBuilder.java
+++ /dev/null
@@ -1,79 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.plan;
-
-import java.util.List;
-
-import com.google.common.base.Joiner;
-import com.google.common.collect.Lists;
-
-/**
- * Visitor that traverses the {@code DoNode} instances in a job and builds a
- * String that identifies the stages of the pipeline that belong to this job.
- */
-class JobNameBuilder {
-
- private static final Joiner JOINER = Joiner.on("+");
- private static final Joiner CHILD_JOINER = Joiner.on("/");
-
- private String pipelineName;
- List<String> rootStack = Lists.newArrayList();
-
- public JobNameBuilder(final String pipelineName) {
- this.pipelineName = pipelineName;
- }
-
- public void visit(DoNode node) {
- visit(node, rootStack);
- }
-
- public void visit(List<DoNode> nodes) {
- visit(nodes, rootStack);
- }
-
- private void visit(List<DoNode> nodes, List<String> stack) {
- if (nodes.size() == 1) {
- visit(nodes.get(0), stack);
- } else {
- List<String> childStack = Lists.newArrayList();
- for (int i = 0; i < nodes.size(); i++) {
- DoNode node = nodes.get(i);
- List<String> subStack = Lists.newArrayList();
- visit(node, subStack);
- if (!subStack.isEmpty()) {
- childStack.add("[" + JOINER.join(subStack) + "]");
- }
- }
- if (!childStack.isEmpty()) {
- stack.add("[" + CHILD_JOINER.join(childStack) + "]");
- }
- }
- }
-
- private void visit(DoNode node, List<String> stack) {
- String name = node.getName();
- if (!name.isEmpty()) {
- stack.add(node.getName());
- }
- visit(node.getChildren(), stack);
- }
-
- public String build() {
- return String.format("%s: %s", pipelineName, JOINER.join(rootStack));
- }
-}
[35/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/Pipeline.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/Pipeline.java b/crunch-core/src/main/java/org/apache/crunch/Pipeline.java
new file mode 100644
index 0000000..84c720c
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/Pipeline.java
@@ -0,0 +1,138 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Manages the state of a pipeline execution.
+ *
+ */
+public interface Pipeline {
+
+ /**
+ * Set the {@code Configuration} to use with this pipeline.
+ */
+ void setConfiguration(Configuration conf);
+
+ /**
+ * Returns the name of this pipeline.
+ *
+ * @return Name of the pipeline
+ */
+ String getName();
+
+ /**
+ * Returns the {@code Configuration} instance associated with this pipeline.
+ */
+ Configuration getConfiguration();
+
+ /**
+ * Converts the given {@code Source} into a {@code PCollection} that is
+ * available to jobs run using this {@code Pipeline} instance.
+ *
+ * @param source
+ * The source of data
+ * @return A PCollection that references the given source
+ */
+ <T> PCollection<T> read(Source<T> source);
+
+ /**
+ * A version of the read method for {@code TableSource} instances that map to
+ * {@code PTable}s.
+ *
+ * @param tableSource
+ * The source of the data
+ * @return A PTable that references the given source
+ */
+ <K, V> PTable<K, V> read(TableSource<K, V> tableSource);
+
+ /**
+ * Write the given collection to the given target on the next pipeline run. The
+ * system will check to see if the target's location already exists using the
+ * {@code WriteMode.DEFAULT} rule for the given {@code Target}.
+ *
+ * @param collection
+ * The collection
+ * @param target
+ * The output target
+ */
+ void write(PCollection<?> collection, Target target);
+
+ /**
+ * Write the contents of the {@code PCollection} to the given {@code Target},
+ * using the storage format specified by the target and the given
+ * {@code WriteMode} for cases where the referenced {@code Target}
+ * already exists.
+ *
+ * @param collection
+ * The collection
+ * @param target
+ * The target to write to
+ * @param writeMode
+ * The strategy to use for handling existing outputs
+ */
+ void write(PCollection<?> collection, Target target,
+ Target.WriteMode writeMode);
+
+ /**
+ * Create the given PCollection and read the data it contains into the
+ * returned {@code Iterable} instance for client use.
+ *
+ * @param pcollection
+ * The PCollection to materialize
+ * @return the data from the PCollection as a read-only {@code Iterable}
+ */
+ <T> Iterable<T> materialize(PCollection<T> pcollection);
+
+ /**
+ * Constructs and executes a series of MapReduce jobs in order to write data
+ * to the output targets.
+ */
+ PipelineResult run();
+
+ /**
+ * Constructs and starts a series of MapReduce jobs in order to write data to
+ * the output targets, but returns a {@code PipelineExecution} to allow clients to control
+ * job execution.
+ * @return a handle that allows the client to control the running pipeline
+ */
+ PipelineExecution runAsync();
+
+ /**
+ * Run any remaining jobs required to generate outputs and then clean up any
+ * intermediate data files that were created in this run or previous calls to
+ * {@code run}.
+ */
+ PipelineResult done();
+
+ /**
+ * A convenience method for reading a text file.
+ */
+ PCollection<String> readTextFile(String pathName);
+
+ /**
+ * A convenience method for writing a text file.
+ */
+ <T> void writeTextFile(PCollection<T> collection, String pathName);
+
+ /**
+ * Turn on debug logging for jobs that are run from this pipeline.
+ */
+ void enableDebug();
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/PipelineExecution.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/PipelineExecution.java b/crunch-core/src/main/java/org/apache/crunch/PipelineExecution.java
new file mode 100644
index 0000000..fc6bb91
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/PipelineExecution.java
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * A handle to allow clients to control a Crunch pipeline as it runs.
+ *
+ * This interface is thread-safe.
+ */
+public interface PipelineExecution {
+
+ enum Status { READY, RUNNING, SUCCEEDED, FAILED, KILLED }
+
+ /** Returns the .dot file that allows a client to graph the Crunch execution plan for this
+ * pipeline.
+ */
+ String getPlanDotFile();
+
+ /** Blocks until the pipeline completes or the specified waiting time elapses. */
+ void waitFor(long timeout, TimeUnit timeUnit) throws InterruptedException;
+
+ /** Blocks until pipeline completes, i.e. {@code SUCCEEDED}, {@code FAILED} or {@code KILLED}. */
+ void waitUntilDone() throws InterruptedException;
+
+ Status getStatus();
+
+ /** Retrieve the result of a pipeline if it has been completed, otherwise {@code null}. */
+ PipelineResult getResult();
+
+ /**
+ * Kills the pipeline if it is running, no-op otherwise.
+ *
+ * This method only delivers a kill signal to the pipeline, and does not guarantee the pipeline exits on return.
+ * To wait for the pipeline to exit completely, use {@link #waitUntilDone()} after this call.
+ */
+ void kill() throws InterruptedException;
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/PipelineResult.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/PipelineResult.java b/crunch-core/src/main/java/org/apache/crunch/PipelineResult.java
new file mode 100644
index 0000000..90b1067
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/PipelineResult.java
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import java.util.List;
+
+import org.apache.hadoop.mapreduce.Counter;
+import org.apache.hadoop.mapreduce.Counters;
+
+import com.google.common.collect.ImmutableList;
+
+/**
+ * Container for the results of a call to {@code run} or {@code done} on the
+ * Pipeline interface that includes details and statistics about the component
+ * stages of the data pipeline.
+ */
+public class PipelineResult {
+
+ public static class StageResult {
+
+ private final String stageName;
+ private final Counters counters;
+
+ public StageResult(String stageName, Counters counters) {
+ this.stageName = stageName;
+ this.counters = counters;
+ }
+
+ public String getStageName() {
+ return stageName;
+ }
+
+ public Counters getCounters() {
+ return counters;
+ }
+
+ public Counter findCounter(Enum<?> key) {
+ return counters.findCounter(key);
+ }
+
+ public long getCounterValue(Enum<?> key) {
+ return findCounter(key).getValue();
+ }
+ }
+
+ public static final PipelineResult EMPTY = new PipelineResult(ImmutableList.<StageResult> of());
+
+ private final List<StageResult> stageResults;
+
+ public PipelineResult(List<StageResult> stageResults) {
+ this.stageResults = ImmutableList.copyOf(stageResults);
+ }
+
+ public boolean succeeded() {
+ return !stageResults.isEmpty();
+ }
+
+ public List<StageResult> getStageResults() {
+ return stageResults;
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/Source.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/Source.java b/crunch-core/src/main/java/org/apache/crunch/Source.java
new file mode 100644
index 0000000..f54d135
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/Source.java
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import java.io.IOException;
+
+import org.apache.crunch.types.PType;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.Job;
+
+/**
+ * A {@code Source} represents an input data set that is an input to one or more
+ * MapReduce jobs.
+ *
+ */
+public interface Source<T> {
+ /**
+ * Returns the {@code PType} for this source.
+ */
+ PType<T> getType();
+
+ /**
+ * Configure the given job to use this source as an input.
+ *
+ * @param job
+ * The job to configure
+ * @param inputId
+ * For a multi-input job, an identifier for this input to the job
+ * @throws IOException
+ */
+ void configureSource(Job job, int inputId) throws IOException;
+
+ /**
+ * Returns the number of bytes in this {@code Source}.
+ */
+ long getSize(Configuration configuration);
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/SourceTarget.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/SourceTarget.java b/crunch-core/src/main/java/org/apache/crunch/SourceTarget.java
new file mode 100644
index 0000000..09c03c6
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/SourceTarget.java
@@ -0,0 +1,26 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+/**
+ * An interface for classes that implement both the {@code Source} and the
+ * {@code Target} interfaces.
+ *
+ */
+public interface SourceTarget<T> extends Source<T>, Target {
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/TableSource.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/TableSource.java b/crunch-core/src/main/java/org/apache/crunch/TableSource.java
new file mode 100644
index 0000000..ff27346
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/TableSource.java
@@ -0,0 +1,28 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import org.apache.crunch.types.PTableType;
+
+/**
+ * The interface for {@code Source} implementations that return a {@link PTable}.
+ *
+ */
+public interface TableSource<K, V> extends Source<Pair<K, V>> {
+ PTableType<K, V> getTableType();
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/TableSourceTarget.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/TableSourceTarget.java b/crunch-core/src/main/java/org/apache/crunch/TableSourceTarget.java
new file mode 100644
index 0000000..9b1ed34
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/TableSourceTarget.java
@@ -0,0 +1,25 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+/**
+ * An interface for classes that implement both the {@code TableSource} and the
+ * {@code Target} interfaces.
+ */
+public interface TableSourceTarget<K, V> extends TableSource<K, V>, SourceTarget<Pair<K, V>> {
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/Target.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/Target.java b/crunch-core/src/main/java/org/apache/crunch/Target.java
new file mode 100644
index 0000000..0a0c23d
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/Target.java
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import org.apache.crunch.io.OutputHandler;
+import org.apache.crunch.types.PType;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * A {@code Target} represents the output destination of a Crunch {@code PCollection}
+ * in the context of a Crunch job.
+ */
+public interface Target {
+
+ /**
+ * An enum to represent different options the client may specify
+ * for handling the case where the output path, table, etc. referenced
+ * by a {@code Target} already exists.
+ */
+ enum WriteMode {
+ /**
+ * Check to see if the output target already exists before running
+ * the pipeline, and if it does, print an error and throw an exception.
+ */
+ DEFAULT,
+
+ /**
+ * Check to see if the output target already exists, and if it does,
+ * delete it and overwrite it with the new output (if any).
+ */
+ OVERWRITE,
+
+ /**
+ * If the output target does not exist, create it. If it does exist,
+ * add the output of this pipeline to the target. This was the
+ * behavior in Crunch up to version 0.4.0.
+ */
+ APPEND
+ }
+
+ /**
+ * Apply the given {@code WriteMode} to this {@code Target} instance.
+ *
+ * @param writeMode The strategy for handling existing outputs
+ * @param conf The ever-useful {@code Configuration} instance
+ */
+ void handleExisting(WriteMode writeMode, Configuration conf);
+
+ /**
+ * Checks to see if this {@code Target} instance is compatible with the
+ * given {@code PType}.
+ *
+ * @param handler The {@link OutputHandler} that is managing the output for the job
+ * @param ptype The {@code PType} to check
+ * @return True if this Target can write data in the form of the given {@code PType},
+ * false otherwise
+ */
+ boolean accept(OutputHandler handler, PType<?> ptype);
+
+ /**
+ * Attempt to create the {@code SourceTarget} type that corresponds to this {@code Target}
+ * for the given {@code PType}, if possible. If it is not possible, return {@code null}.
+ *
+ * @param ptype The {@code PType} to use in constructing the {@code SourceTarget}
+ * @return A new {@code SourceTarget} or null if such a {@code SourceTarget} does not exist
+ */
+ <T> SourceTarget<T> asSourceTarget(PType<T> ptype);
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/Tuple.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/Tuple.java b/crunch-core/src/main/java/org/apache/crunch/Tuple.java
new file mode 100644
index 0000000..4e602ff
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/Tuple.java
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+/**
+ * A fixed-size collection of Objects, used in Crunch for representing joins
+ * between {@code PCollection}s.
+ *
+ */
+public interface Tuple {
+
+ /**
+ * Returns the Object at the given index.
+ */
+ Object get(int index);
+
+ /**
+ * Returns the number of elements in this Tuple.
+ */
+ int size();
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/Tuple3.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/Tuple3.java b/crunch-core/src/main/java/org/apache/crunch/Tuple3.java
new file mode 100644
index 0000000..4372811
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/Tuple3.java
@@ -0,0 +1,96 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import org.apache.commons.lang.builder.HashCodeBuilder;
+
+/**
+ * A convenience class for three-element {@link Tuple}s.
+ */
+public class Tuple3<V1, V2, V3> implements Tuple {
+
+ private final V1 first;
+ private final V2 second;
+ private final V3 third;
+
+ public static <A, B, C> Tuple3<A, B, C> of(A a, B b, C c) {
+ return new Tuple3<A, B, C>(a, b, c);
+ }
+
+ public Tuple3(V1 first, V2 second, V3 third) {
+ this.first = first;
+ this.second = second;
+ this.third = third;
+ }
+
+ public V1 first() {
+ return first;
+ }
+
+ public V2 second() {
+ return second;
+ }
+
+ public V3 third() {
+ return third;
+ }
+
+ public Object get(int index) {
+ switch (index) {
+ case 0:
+ return first;
+ case 1:
+ return second;
+ case 2:
+ return third;
+ default:
+ throw new ArrayIndexOutOfBoundsException();
+ }
+ }
+
+ public int size() {
+ return 3;
+ }
+
+ @Override
+ public int hashCode() {
+ HashCodeBuilder hcb = new HashCodeBuilder();
+ return hcb.append(first).append(second).append(third).toHashCode();
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj)
+ return true;
+ if (obj == null)
+ return false;
+ if (getClass() != obj.getClass())
+ return false;
+ Tuple3<?, ?, ?> other = (Tuple3<?, ?, ?>) obj;
+ return (first == other.first || (first != null && first.equals(other.first)))
+ && (second == other.second || (second != null && second.equals(other.second)))
+ && (third == other.third || (third != null && third.equals(other.third)));
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder("Tuple3[");
+ sb.append(first).append(",").append(second).append(",").append(third);
+ return sb.append("]").toString();
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/Tuple4.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/Tuple4.java b/crunch-core/src/main/java/org/apache/crunch/Tuple4.java
new file mode 100644
index 0000000..f161371
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/Tuple4.java
@@ -0,0 +1,105 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import org.apache.commons.lang.builder.HashCodeBuilder;
+
+/**
+ * A convenience class for four-element {@link Tuple}s.
+ */
+public class Tuple4<V1, V2, V3, V4> implements Tuple {
+
+  private final V1 first;
+  private final V2 second;
+  private final V3 third;
+  private final V4 fourth;
+
+  /**
+   * Creates a {@code Tuple4} holding the four given values, in order.
+   */
+  public static <A, B, C, D> Tuple4<A, B, C, D> of(A a, B b, C c, D d) {
+    return new Tuple4<A, B, C, D>(a, b, c, d);
+  }
+
+  /**
+   * Stores the four values as given; nothing is copied or validated.
+   */
+  public Tuple4(V1 first, V2 second, V3 third, V4 fourth) {
+    this.first = first;
+    this.second = second;
+    this.third = third;
+    this.fourth = fourth;
+  }
+
+  /** Returns the value at position 0. */
+  public V1 first() {
+    return this.first;
+  }
+
+  /** Returns the value at position 1. */
+  public V2 second() {
+    return this.second;
+  }
+
+  /** Returns the value at position 2. */
+  public V3 third() {
+    return this.third;
+  }
+
+  /** Returns the value at position 3. */
+  public V4 fourth() {
+    return this.fourth;
+  }
+
+  /**
+   * Returns the value at the given zero-based position.
+   *
+   * @throws ArrayIndexOutOfBoundsException if {@code index} is not in 0..3
+   */
+  public Object get(int index) {
+    if (index == 0) {
+      return first;
+    } else if (index == 1) {
+      return second;
+    } else if (index == 2) {
+      return third;
+    } else if (index == 3) {
+      return fourth;
+    }
+    throw new ArrayIndexOutOfBoundsException();
+  }
+
+  /** Returns the number of values in this tuple, which is always 4. */
+  public int size() {
+    return 4;
+  }
+
+  @Override
+  public int hashCode() {
+    return new HashCodeBuilder()
+        .append(first)
+        .append(second)
+        .append(third)
+        .append(fourth)
+        .toHashCode();
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (obj == this) {
+      return true;
+    }
+    if (obj == null || obj.getClass() != getClass()) {
+      return false;
+    }
+    Tuple4<?, ?, ?, ?> that = (Tuple4<?, ?, ?, ?>) obj;
+    return sameValue(first, that.first)
+        && sameValue(second, that.second)
+        && sameValue(third, that.third)
+        && sameValue(fourth, that.fourth);
+  }
+
+  /** Null-tolerant equality: same reference, or {@code a.equals(b)} when {@code a} is non-null. */
+  private static boolean sameValue(Object a, Object b) {
+    return a == b || (a != null && a.equals(b));
+  }
+
+  @Override
+  public String toString() {
+    return "Tuple4[" + first + "," + second + "," + third + "," + fourth + "]";
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/TupleN.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/TupleN.java b/crunch-core/src/main/java/org/apache/crunch/TupleN.java
new file mode 100644
index 0000000..e5eceb5
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/TupleN.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import java.util.Arrays;
+
+import org.apache.commons.lang.builder.HashCodeBuilder;
+
+/**
+ * A {@link Tuple} instance for an arbitrary number of values.
+ */
+public class TupleN implements Tuple {
+
+ private final Object values[];
+
+ public static TupleN of(Object... values) {
+ return new TupleN(values);
+ }
+
+ public TupleN(Object... values) {
+ this.values = new Object[values.length];
+ System.arraycopy(values, 0, this.values, 0, values.length);
+ }
+
+ public Object get(int index) {
+ return values[index];
+ }
+
+ public int size() {
+ return values.length;
+ }
+
+ @Override
+ public int hashCode() {
+ HashCodeBuilder hcb = new HashCodeBuilder();
+ for (Object v : values) {
+ hcb.append(v);
+ }
+ return hcb.toHashCode();
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj)
+ return true;
+ if (obj == null)
+ return false;
+ if (getClass() != obj.getClass())
+ return false;
+ TupleN other = (TupleN) obj;
+ return Arrays.equals(this.values, other.values);
+ }
+
+ @Override
+ public String toString() {
+ return Arrays.toString(values);
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/fn/Aggregators.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/fn/Aggregators.java b/crunch-core/src/main/java/org/apache/crunch/fn/Aggregators.java
new file mode 100644
index 0000000..0ac79e2
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/fn/Aggregators.java
@@ -0,0 +1,1111 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.fn;
+
+import java.math.BigInteger;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+import java.util.SortedSet;
+
+import org.apache.crunch.Aggregator;
+import org.apache.crunch.CombineFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.PGroupedTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Tuple;
+import org.apache.crunch.Tuple3;
+import org.apache.crunch.Tuple4;
+import org.apache.crunch.TupleN;
+import org.apache.crunch.util.Tuples;
+import org.apache.hadoop.conf.Configuration;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+
+
+/**
+ * A collection of pre-defined {@link org.apache.crunch.Aggregator}s.
+ *
+ * <p>The factory methods of this class return {@link org.apache.crunch.Aggregator}
+ * instances that you can use to combine the values of a {@link PGroupedTable}.
+ * In most cases, they turn a multimap (multiple entries per key) into a map (one
+ * entry per key).</p>
+ *
+ * <p><strong>Note</strong>: When using composed aggregators, like those built by the
+ * {@link #pairAggregator(Aggregator, Aggregator) pairAggregator()}
+ * factory method, you typically don't want to put in the same child aggregator more than once,
+ * even if all child aggregators have the same type. In most cases, this is what you want:</p>
+ *
+ * <pre>
+ * PTable&lt;K, Pair&lt;Long, Long&gt;&gt; result = groupedTable.combineValues(
+ * pairAggregator(SUM_LONGS(), SUM_LONGS())
+ * );
+ * </pre>
+ */
+public final class Aggregators {
+
+ private Aggregators() {
+ // utility class, not for instantiation
+ }
+
+ /**
+ * Sum up all {@code long} values.
+ * @return The newly constructed instance
+ */
+ public static Aggregator<Long> SUM_LONGS() {
+ return new SumLongs();
+ }
+
+ /**
+ * Sum up all {@code int} values.
+ * @return The newly constructed instance
+ */
+ public static Aggregator<Integer> SUM_INTS() {
+ return new SumInts();
+ }
+
+ /**
+ * Sum up all {@code float} values.
+ * @return The newly constructed instance
+ */
+ public static Aggregator<Float> SUM_FLOATS() {
+ return new SumFloats();
+ }
+
+ /**
+ * Sum up all {@code double} values.
+ * @return The newly constructed instance
+ */
+ public static Aggregator<Double> SUM_DOUBLES() {
+ return new SumDoubles();
+ }
+
+ /**
+ * Sum up all {@link BigInteger} values.
+ * @return The newly constructed instance
+ */
+ public static Aggregator<BigInteger> SUM_BIGINTS() {
+ return new SumBigInts();
+ }
+
+ /**
+ * Return the maximum of all given {@code long} values.
+ * @return The newly constructed instance
+ */
+ public static Aggregator<Long> MAX_LONGS() {
+ return new MaxLongs();
+ }
+
+  /**
+   * Return the {@code n} largest {@code long} values (or fewer if there are fewer
+   * values than {@code n}).
+   * @param n The number of values to return
+   * @return The newly constructed instance
+   */
+  public static Aggregator<Long> MAX_LONGS(int n) {
+    // Must honor n like the other MAX_*(int) factories; the single-value
+    // MaxLongs aggregator would ignore it and return only one value.
+    return new MaxNAggregator<Long>(n);
+  }
+
+ /**
+ * Return the maximum of all given {@code int} values.
+ * @return The newly constructed instance
+ */
+ public static Aggregator<Integer> MAX_INTS() {
+ return new MaxInts();
+ }
+
+ /**
+ * Return the {@code n} largest {@code int} values (or fewer if there are fewer
+ * values than {@code n}).
+ * @param n The number of values to return
+ * @return The newly constructed instance
+ */
+ public static Aggregator<Integer> MAX_INTS(int n) {
+ return new MaxNAggregator<Integer>(n);
+ }
+
+ /**
+ * Return the maximum of all given {@code float} values.
+ * @return The newly constructed instance
+ */
+ public static Aggregator<Float> MAX_FLOATS() {
+ return new MaxFloats();
+ }
+
+ /**
+ * Return the {@code n} largest {@code float} values (or fewer if there are fewer
+ * values than {@code n}).
+ * @param n The number of values to return
+ * @return The newly constructed instance
+ */
+ public static Aggregator<Float> MAX_FLOATS(int n) {
+ return new MaxNAggregator<Float>(n);
+ }
+
+ /**
+ * Return the maximum of all given {@code double} values.
+ * @return The newly constructed instance
+ */
+ public static Aggregator<Double> MAX_DOUBLES() {
+ return new MaxDoubles();
+ }
+
+ /**
+ * Return the {@code n} largest {@code double} values (or fewer if there are fewer
+ * values than {@code n}).
+ * @param n The number of values to return
+ * @return The newly constructed instance
+ */
+ public static Aggregator<Double> MAX_DOUBLES(int n) {
+ return new MaxNAggregator<Double>(n);
+ }
+
+ /**
+ * Return the maximum of all given {@link BigInteger} values.
+ * @return The newly constructed instance
+ */
+ public static Aggregator<BigInteger> MAX_BIGINTS() {
+ return new MaxBigInts();
+ }
+
+ /**
+ * Return the {@code n} largest {@link BigInteger} values (or fewer if there are fewer
+ * values than {@code n}).
+ * @param n The number of values to return
+ * @return The newly constructed instance
+ */
+ public static Aggregator<BigInteger> MAX_BIGINTS(int n) {
+ return new MaxNAggregator<BigInteger>(n);
+ }
+
+ /**
+ * Return the {@code n} largest values (or fewer if there are fewer
+ * values than {@code n}).
+ * @param n The number of values to return
+ * @param cls The type of the values to aggregate (must implement {@link Comparable}!)
+ * @return The newly constructed instance
+ */
+ public static <V extends Comparable<V>> Aggregator<V> MAX_N(int n, Class<V> cls) {
+ return new MaxNAggregator<V>(n);
+ }
+
+ /**
+ * Return the minimum of all given {@code long} values.
+ * @return The newly constructed instance
+ */
+ public static Aggregator<Long> MIN_LONGS() {
+ return new MinLongs();
+ }
+
+ /**
+ * Return the {@code n} smallest {@code long} values (or fewer if there are fewer
+ * values than {@code n}).
+ * @param n The number of values to return
+ * @return The newly constructed instance
+ */
+ public static Aggregator<Long> MIN_LONGS(int n) {
+ return new MinNAggregator<Long>(n);
+ }
+
+ /**
+ * Return the minimum of all given {@code int} values.
+ * @return The newly constructed instance
+ */
+ public static Aggregator<Integer> MIN_INTS() {
+ return new MinInts();
+ }
+
+ /**
+ * Return the {@code n} smallest {@code int} values (or fewer if there are fewer
+ * values than {@code n}).
+ * @param n The number of values to return
+ * @return The newly constructed instance
+ */
+ public static Aggregator<Integer> MIN_INTS(int n) {
+ return new MinNAggregator<Integer>(n);
+ }
+
+ /**
+ * Return the minimum of all given {@code float} values.
+ * @return The newly constructed instance
+ */
+ public static Aggregator<Float> MIN_FLOATS() {
+ return new MinFloats();
+ }
+
+ /**
+ * Return the {@code n} smallest {@code float} values (or fewer if there are fewer
+ * values than {@code n}).
+ * @param n The number of values to return
+ * @return The newly constructed instance
+ */
+ public static Aggregator<Float> MIN_FLOATS(int n) {
+ return new MinNAggregator<Float>(n);
+ }
+
+ /**
+ * Return the minimum of all given {@code double} values.
+ * @return The newly constructed instance
+ */
+ public static Aggregator<Double> MIN_DOUBLES() {
+ return new MinDoubles();
+ }
+
+ /**
+ * Return the {@code n} smallest {@code double} values (or fewer if there are fewer
+ * values than {@code n}).
+ * @param n The number of values to return
+ * @return The newly constructed instance
+ */
+ public static Aggregator<Double> MIN_DOUBLES(int n) {
+ return new MinNAggregator<Double>(n);
+ }
+
+ /**
+ * Return the minimum of all given {@link BigInteger} values.
+ * @return The newly constructed instance
+ */
+ public static Aggregator<BigInteger> MIN_BIGINTS() {
+ return new MinBigInts();
+ }
+
+ /**
+ * Return the {@code n} smallest {@link BigInteger} values (or fewer if there are fewer
+ * values than {@code n}).
+ * @param n The number of values to return
+ * @return The newly constructed instance
+ */
+ public static Aggregator<BigInteger> MIN_BIGINTS(int n) {
+ return new MinNAggregator<BigInteger>(n);
+ }
+
+ /**
+ * Return the {@code n} smallest values (or fewer if there are fewer
+ * values than {@code n}).
+ * @param n The number of values to return
+ * @param cls The type of the values to aggregate (must implement {@link Comparable}!)
+ * @return The newly constructed instance
+ */
+ public static <V extends Comparable<V>> Aggregator<V> MIN_N(int n, Class<V> cls) {
+ return new MinNAggregator<V>(n);
+ }
+
+ /**
+ * Return the first {@code n} values (or fewer if there are fewer values than {@code n}).
+ *
+ * @param n The number of values to return
+ * @return The newly constructed instance
+ */
+ public static <V> Aggregator<V> FIRST_N(int n) {
+ return new FirstNAggregator<V>(n);
+ }
+
+ /**
+ * Return the last {@code n} values (or fewer if there are fewer values than {@code n}).
+ *
+ * @param n The number of values to return
+ * @return The newly constructed instance
+ */
+ public static <V> Aggregator<V> LAST_N(int n) {
+ return new LastNAggregator<V>(n);
+ }
+
+ /**
+ * Concatenate strings, with a separator between strings. There
+ * is no limit on the length of the concatenated string.
+ *
+ * <p><em>Note: String concatenation is not commutative, which means the
+ * result of the aggregation is not deterministic!</em></p>
+ *
+ * @param separator
+ * the separator which will be appended between each string
+ * @param skipNull
+ * define if we should skip null values. Throw
+ * NullPointerException if set to false and there is a null
+ * value.
+ * @return The newly constructed instance
+ */
+ public static Aggregator<String> STRING_CONCAT(String separator, boolean skipNull) {
+ return new StringConcatAggregator(separator, skipNull);
+ }
+
+ /**
+ * Concatenate strings, with a separator between strings. You can specify
+ * the maximum length of the output string and of the input strings, if
+ * they are > 0. If a value is <= 0, there is no limit.
+ *
+ * <p>Any string that is too long (or that would make the output too
+ * long) will be silently discarded.</p>
+ *
+ * <p><em>Note: String concatenation is not commutative, which means the
+ * result of the aggregation is not deterministic!</em></p>
+ *
+ * @param separator
+ * the separator which will be appended between each string
+ * @param skipNull
+ * define if we should skip null values. Throw
+ * NullPointerException if set to false and there is a null
+ * value.
+ * @param maxOutputLength
+ * the maximum length of the output string. If it's set <= 0,
+ * there is no limit. The number of characters of the output
+ * string will be at most maxOutputLength.
+ * @param maxInputLength
+ * the maximum length of the input strings. If it's set <= 0,
+ * there is no limit. An input string is concatenated only if its
+ * number of characters is at most maxInputLength.
+ * @return The newly constructed instance
+ */
+ public static Aggregator<String> STRING_CONCAT(String separator, boolean skipNull,
+ long maxOutputLength, long maxInputLength) {
+ return new StringConcatAggregator(separator, skipNull, maxOutputLength, maxInputLength);
+ }
+
+ /**
+ * Collect the unique elements of the input, as defined by the {@code equals} method for
+ * the input objects. No guarantees are made about the order in which the final elements
+ * will be returned.
+ *
+ * @return The newly constructed instance
+ */
+ public static <V> Aggregator<V> UNIQUE_ELEMENTS() {
+ return new SetAggregator<V>();
+ }
+
+ /**
+ * Collect a sample of unique elements from the input, where 'unique' is defined by
+ * the {@code equals} method for the input objects. No guarantees are made about which
+ * elements will be returned, simply that there will not be any more than the given sample
+ * size for any key.
+ *
+ * @param maximumSampleSize The maximum number of unique elements to return per key
+ * @return The newly constructed instance
+ */
+ public static <V> Aggregator<V> SAMPLE_UNIQUE_ELEMENTS(int maximumSampleSize) {
+ return new SetAggregator<V>(maximumSampleSize);
+ }
+
+ /**
+ * Apply separate aggregators to each component of a {@link Pair}.
+ */
+ public static <V1, V2> Aggregator<Pair<V1, V2>> pairAggregator(
+ Aggregator<V1> a1, Aggregator<V2> a2) {
+ return new PairAggregator<V1, V2>(a1, a2);
+ }
+
+ /**
+ * Apply separate aggregators to each component of a {@link Tuple3}.
+ */
+ public static <V1, V2, V3> Aggregator<Tuple3<V1, V2, V3>> tripAggregator(
+ Aggregator<V1> a1, Aggregator<V2> a2, Aggregator<V3> a3) {
+ return new TripAggregator<V1, V2, V3>(a1, a2, a3);
+ }
+
+ /**
+ * Apply separate aggregators to each component of a {@link Tuple4}.
+ */
+ public static <V1, V2, V3, V4> Aggregator<Tuple4<V1, V2, V3, V4>> quadAggregator(
+ Aggregator<V1> a1, Aggregator<V2> a2, Aggregator<V3> a3, Aggregator<V4> a4) {
+ return new QuadAggregator<V1, V2, V3, V4>(a1, a2, a3, a4);
+ }
+
+ /**
+ * Apply separate aggregators to each component of a {@link Tuple}.
+ */
+ public static Aggregator<TupleN> tupleAggregator(Aggregator<?>... aggregators) {
+ return new TupleNAggregator(aggregators);
+ }
+
+ /**
+ * Wrap a {@link CombineFn} adapter around the given aggregator.
+ *
+ * @param aggregator The instance to wrap
+ * @return A {@link CombineFn} delegating to {@code aggregator}
+ */
+ public static final <K, V> CombineFn<K, V> toCombineFn(Aggregator<V> aggregator) {
+ return new AggregatorCombineFn<K, V>(aggregator);
+ }
+
+ /**
+ * Base class for aggregators that do not require any initialization.
+ */
+ public static abstract class SimpleAggregator<T> implements Aggregator<T> {
+ @Override
+ public void initialize(Configuration conf) {
+ // No-op
+ }
+ }
+
+ /**
+ * A {@code CombineFn} that delegates all of the actual work to an
+ * {@code Aggregator} instance.
+ */
+ private static class AggregatorCombineFn<K, V> extends CombineFn<K, V> {
+ // TODO: Has to be fully qualified until CombineFn.Aggregator can be removed.
+ private final org.apache.crunch.Aggregator<V> aggregator;
+
+ public AggregatorCombineFn(org.apache.crunch.Aggregator<V> aggregator) {
+ this.aggregator = aggregator;
+ }
+
+ @Override
+ public void initialize() {
+ aggregator.initialize(getConfiguration());
+ }
+
+ @Override
+ public void process(Pair<K, Iterable<V>> input, Emitter<Pair<K, V>> emitter) {
+ aggregator.reset();
+ for (V v : input.second()) {
+ aggregator.update(v);
+ }
+ for (V v : aggregator.results()) {
+ emitter.emit(Pair.of(input.first(), v));
+ }
+ }
+ }
+
+  /** Adds up {@code Long} values into a primitive {@code long} total. */
+  private static class SumLongs extends SimpleAggregator<Long> {
+    // Running total of the values seen since the last reset().
+    private long total = 0L;
+
+    @Override
+    public void reset() {
+      total = 0L;
+    }
+
+    @Override
+    public void update(Long next) {
+      total = total + next.longValue();
+    }
+
+    @Override
+    public Iterable<Long> results() {
+      return ImmutableList.of(Long.valueOf(total));
+    }
+  }
+
+  /** Adds up {@code Integer} values into a primitive {@code int} total. */
+  private static class SumInts extends SimpleAggregator<Integer> {
+    // Running total of the values seen since the last reset().
+    private int total = 0;
+
+    @Override
+    public void reset() {
+      total = 0;
+    }
+
+    @Override
+    public void update(Integer next) {
+      total = total + next.intValue();
+    }
+
+    @Override
+    public Iterable<Integer> results() {
+      return ImmutableList.of(Integer.valueOf(total));
+    }
+  }
+
+  /** Adds up {@code Float} values into a primitive {@code float} total. */
+  private static class SumFloats extends SimpleAggregator<Float> {
+    // Running total of the values seen since the last reset().
+    private float total = 0f;
+
+    @Override
+    public void reset() {
+      total = 0f;
+    }
+
+    @Override
+    public void update(Float next) {
+      total = total + next.floatValue();
+    }
+
+    @Override
+    public Iterable<Float> results() {
+      return ImmutableList.of(Float.valueOf(total));
+    }
+  }
+
+  /** Adds up {@code Double} values into a primitive {@code double} total. */
+  private static class SumDoubles extends SimpleAggregator<Double> {
+    // Running total of the values seen since the last reset().
+    private double total = 0.0d;
+
+    @Override
+    public void reset() {
+      total = 0.0d;
+    }
+
+    @Override
+    public void update(Double next) {
+      total = total + next.doubleValue();
+    }
+
+    @Override
+    public Iterable<Double> results() {
+      return ImmutableList.of(Double.valueOf(total));
+    }
+  }
+
+ private static class SumBigInts extends SimpleAggregator<BigInteger> {
+ private BigInteger sum = BigInteger.ZERO;
+
+ @Override
+ public void reset() {
+ sum = BigInteger.ZERO;
+ }
+
+ @Override
+ public void update(BigInteger next) {
+ sum = sum.add(next);
+ }
+
+ @Override
+ public Iterable<BigInteger> results() {
+ return ImmutableList.of(sum);
+ }
+ }
+
+ private static class MaxLongs extends SimpleAggregator<Long> {
+ private Long max = null;
+
+ @Override
+ public void reset() {
+ max = null;
+ }
+
+ @Override
+ public void update(Long next) {
+ if (max == null || max < next) {
+ max = next;
+ }
+ }
+
+ @Override
+ public Iterable<Long> results() {
+ return ImmutableList.of(max);
+ }
+ }
+
+ private static class MaxInts extends SimpleAggregator<Integer> {
+ private Integer max = null;
+
+ @Override
+ public void reset() {
+ max = null;
+ }
+
+ @Override
+ public void update(Integer next) {
+ if (max == null || max < next) {
+ max = next;
+ }
+ }
+
+ @Override
+ public Iterable<Integer> results() {
+ return ImmutableList.of(max);
+ }
+ }
+
+ private static class MaxFloats extends SimpleAggregator<Float> {
+ private Float max = null;
+
+ @Override
+ public void reset() {
+ max = null;
+ }
+
+ @Override
+ public void update(Float next) {
+ if (max == null || max < next) {
+ max = next;
+ }
+ }
+
+ @Override
+ public Iterable<Float> results() {
+ return ImmutableList.of(max);
+ }
+ }
+
+ private static class MaxDoubles extends SimpleAggregator<Double> {
+ private Double max = null;
+
+ @Override
+ public void reset() {
+ max = null;
+ }
+
+ @Override
+ public void update(Double next) {
+ if (max == null || max < next) {
+ max = next;
+ }
+ }
+
+ @Override
+ public Iterable<Double> results() {
+ return ImmutableList.of(max);
+ }
+ }
+
+ private static class MaxBigInts extends SimpleAggregator<BigInteger> {
+ private BigInteger max = null;
+
+ @Override
+ public void reset() {
+ max = null;
+ }
+
+ @Override
+ public void update(BigInteger next) {
+ if (max == null || max.compareTo(next) < 0) {
+ max = next;
+ }
+ }
+
+ @Override
+ public Iterable<BigInteger> results() {
+ return ImmutableList.of(max);
+ }
+ }
+
+ private static class MinLongs extends SimpleAggregator<Long> {
+ private Long min = null;
+
+ @Override
+ public void reset() {
+ min = null;
+ }
+
+ @Override
+ public void update(Long next) {
+ if (min == null || min > next) {
+ min = next;
+ }
+ }
+
+ @Override
+ public Iterable<Long> results() {
+ return ImmutableList.of(min);
+ }
+ }
+
+ private static class MinInts extends SimpleAggregator<Integer> {
+ private Integer min = null;
+
+ @Override
+ public void reset() {
+ min = null;
+ }
+
+ @Override
+ public void update(Integer next) {
+ if (min == null || min > next) {
+ min = next;
+ }
+ }
+
+ @Override
+ public Iterable<Integer> results() {
+ return ImmutableList.of(min);
+ }
+ }
+
+ private static class MinFloats extends SimpleAggregator<Float> {
+ private Float min = null;
+
+ @Override
+ public void reset() {
+ min = null;
+ }
+
+ @Override
+ public void update(Float next) {
+ if (min == null || min > next) {
+ min = next;
+ }
+ }
+
+ @Override
+ public Iterable<Float> results() {
+ return ImmutableList.of(min);
+ }
+ }
+
+ private static class MinDoubles extends SimpleAggregator<Double> {
+ private Double min = null;
+
+ @Override
+ public void reset() {
+ min = null;
+ }
+
+ @Override
+ public void update(Double next) {
+ if (min == null || min > next) {
+ min = next;
+ }
+ }
+
+ @Override
+ public Iterable<Double> results() {
+ return ImmutableList.of(min);
+ }
+ }
+
+ private static class MinBigInts extends SimpleAggregator<BigInteger> {
+ private BigInteger min = null;
+
+ @Override
+ public void reset() {
+ min = null;
+ }
+
+ @Override
+ public void update(BigInteger next) {
+ if (min == null || min.compareTo(next) > 0) {
+ min = next;
+ }
+ }
+
+ @Override
+ public Iterable<BigInteger> results() {
+ return ImmutableList.of(min);
+ }
+ }
+
+  /**
+   * Keeps the {@code arity} largest distinct values seen since the last
+   * {@code reset()}.
+   *
+   * <p>Values are held in a sorted set, so values that compare as equal are
+   * kept only once. {@code reset()} must be called before the first
+   * {@code update()}, because the set is created lazily there.</p>
+   */
+  private static class MaxNAggregator<V extends Comparable<V>> extends SimpleAggregator<V> {
+    private final int arity;
+    private transient SortedSet<V> elements;
+
+    /**
+     * @param arity the maximum number of values to retain; with a value of
+     *        zero or less nothing is retained
+     */
+    public MaxNAggregator(int arity) {
+      this.arity = arity;
+    }
+
+    @Override
+    public void reset() {
+      if (elements == null) {
+        elements = Sets.newTreeSet();
+      } else {
+        elements.clear();
+      }
+    }
+
+    @Override
+    public void update(V value) {
+      // Evict the smallest element only when the set actually grew past the
+      // limit. Removing first and adding afterwards lost an element whenever
+      // the set was full and the value was already present (add() is then a
+      // no-op), and it failed on an empty set when arity <= 0.
+      if (elements.add(value) && elements.size() > arity) {
+        elements.remove(elements.first());
+      }
+    }
+
+    @Override
+    public Iterable<V> results() {
+      // Snapshot in ascending order, as iterated by the sorted set.
+      return ImmutableList.copyOf(elements);
+    }
+  }
+
+  /**
+   * Keeps the {@code arity} smallest distinct values seen since the last
+   * {@code reset()}.
+   *
+   * <p>Values are held in a sorted set, so values that compare as equal are
+   * kept only once. {@code reset()} must be called before the first
+   * {@code update()}, because the set is created lazily there.</p>
+   */
+  private static class MinNAggregator<V extends Comparable<V>> extends SimpleAggregator<V> {
+    private final int arity;
+    private transient SortedSet<V> elements;
+
+    /**
+     * @param arity the maximum number of values to retain; with a value of
+     *        zero or less nothing is retained
+     */
+    public MinNAggregator(int arity) {
+      this.arity = arity;
+    }
+
+    @Override
+    public void reset() {
+      if (elements == null) {
+        elements = Sets.newTreeSet();
+      } else {
+        elements.clear();
+      }
+    }
+
+    @Override
+    public void update(V value) {
+      // Evict the largest element only when the set actually grew past the
+      // limit. Removing first and adding afterwards lost an element whenever
+      // the set was full and the value was already present (add() is then a
+      // no-op), and it failed on an empty set when arity <= 0.
+      if (elements.add(value) && elements.size() > arity) {
+        elements.remove(elements.last());
+      }
+    }
+
+    @Override
+    public Iterable<V> results() {
+      // Snapshot in ascending order, as iterated by the sorted set.
+      return ImmutableList.copyOf(elements);
+    }
+  }
+
+  /** Retains the first {@code arity} values passed to {@code update} after a reset. */
+  private static class FirstNAggregator<V> extends SimpleAggregator<V> {
+    private final int arity;
+    private final List<V> elements;
+
+    public FirstNAggregator(int arity) {
+      this.elements = Lists.newArrayList();
+      this.arity = arity;
+    }
+
+    @Override
+    public void reset() {
+      elements.clear();
+    }
+
+    @Override
+    public void update(V value) {
+      // Once the quota is reached, every further value is ignored.
+      if (elements.size() >= arity) {
+        return;
+      }
+      elements.add(value);
+    }
+
+    @Override
+    public Iterable<V> results() {
+      return ImmutableList.copyOf(elements);
+    }
+  }
+
+ private static class LastNAggregator<V> extends SimpleAggregator<V> {
+ private final int arity;
+ private final LinkedList<V> elements;
+
+ public LastNAggregator(int arity) {
+ this.arity = arity;
+ this.elements = Lists.newLinkedList();
+ }
+
+ @Override
+ public void reset() {
+ elements.clear();
+ }
+
+ @Override
+ public void update(V value) {
+ elements.add(value);
+ if (elements.size() == arity + 1) {
+ elements.removeFirst();
+ }
+ }
+
+ @Override
+ public Iterable<V> results() {
+ return ImmutableList.copyOf(elements);
+ }
+ }
+
+  /**
+   * Joins the incoming strings with a separator, optionally limiting the
+   * length of each accepted input and of the joined output.
+   *
+   * <p>A limit that is zero or negative means "no limit". Strings that exceed
+   * the input limit, or that would push the joined output past the output
+   * limit, are dropped silently. {@code reset()} must be called before use,
+   * because the joiner is created lazily there.</p>
+   */
+  private static class StringConcatAggregator extends SimpleAggregator<String> {
+    private final String separator;
+    private final boolean skipNulls;
+    private final long maxOutputLength;
+    private final long maxInputLength;
+    // Length the joined output would have for the strings accepted so far;
+    // only maintained when an output limit is set.
+    private long currentLength;
+    private final LinkedList<String> list = new LinkedList<String>();
+
+    private transient Joiner joiner;
+
+    /** Creates an aggregator without any length limits. */
+    public StringConcatAggregator(final String separator, final boolean skipNulls) {
+      this.separator = separator;
+      this.skipNulls = skipNulls;
+      this.maxInputLength = 0;
+      this.maxOutputLength = 0;
+    }
+
+    /** Creates an aggregator with the given output and input length limits. */
+    public StringConcatAggregator(final String separator, final boolean skipNull, final long maxOutputLength, final long maxInputLength) {
+      this.separator = separator;
+      this.skipNulls = skipNull;
+      this.maxOutputLength = maxOutputLength;
+      this.maxInputLength = maxInputLength;
+      this.currentLength = -separator.length();
+    }
+
+    @Override
+    public void reset() {
+      if (joiner == null) {
+        joiner = skipNulls ? Joiner.on(separator).skipNulls() : Joiner.on(separator);
+      }
+      // Start one separator "in debt": the first string is joined without one.
+      currentLength = -separator.length();
+      list.clear();
+    }
+
+    @Override
+    public void update(final String next) {
+      long length = (next == null) ? 0 : next.length() + separator.length();
+      boolean outputTooLong = maxOutputLength > 0 && currentLength + length > maxOutputLength;
+      // A null has no length to check. Without the null test this line threw a
+      // NullPointerException whenever an input limit was set, even when nulls
+      // are supposed to be skipped. Nulls are left to the joiner instead.
+      boolean inputTooLong = maxInputLength > 0 && next != null && next.length() > maxInputLength;
+      if (outputTooLong || inputTooLong) {
+        return;
+      }
+      if (maxOutputLength > 0) {
+        currentLength += length;
+      }
+      list.add(next);
+    }
+
+    @Override
+    public Iterable<String> results() {
+      return ImmutableList.of(joiner.join(list));
+    }
+  }
+
+
+ private static abstract class TupleAggregator<T> implements Aggregator<T> {
+ private final List<Aggregator<Object>> aggregators;
+
+ @SuppressWarnings("unchecked")
+ public TupleAggregator(Aggregator<?>... aggregators) {
+ this.aggregators = Lists.newArrayList();
+ for (Aggregator<?> a : aggregators) {
+ this.aggregators.add((Aggregator<Object>) a);
+ }
+ }
+
+ @Override
+ public void initialize(Configuration configuration) {
+ for (Aggregator<?> a : aggregators) {
+ a.initialize(configuration);
+ }
+ }
+
+ @Override
+ public void reset() {
+ for (Aggregator<?> a : aggregators) {
+ a.reset();
+ }
+ }
+
+ protected void updateTuple(Tuple t) {
+ for (int i = 0; i < aggregators.size(); i++) {
+ aggregators.get(i).update(t.get(i));
+ }
+ }
+
+ protected Iterable<Object> results(int index) {
+ return aggregators.get(index).results();
+ }
+ }
+
+  /** Aggregates each component of a {@link Pair} with its own child aggregator. */
+  private static class PairAggregator<V1, V2> extends TupleAggregator<Pair<V1, V2>> {
+
+    public PairAggregator(Aggregator<V1> a1, Aggregator<V2> a2) {
+      super(a1, a2);
+    }
+
+    @Override
+    public void update(Pair<V1, V2> value) {
+      updateTuple(value);
+    }
+
+    @SuppressWarnings("unchecked")
+    @Override
+    public Iterable<Pair<V1, V2>> results() {
+      Iterable<V1> firsts = (Iterable<V1>) results(0);
+      Iterable<V2> seconds = (Iterable<V2>) results(1);
+      return new Tuples.PairIterable<V1, V2>(firsts, seconds);
+    }
+  }
+
+  /** Aggregates each component of a {@link Tuple3} with its own child aggregator. */
+  private static class TripAggregator<A, B, C> extends TupleAggregator<Tuple3<A, B, C>> {
+
+    public TripAggregator(Aggregator<A> a1, Aggregator<B> a2, Aggregator<C> a3) {
+      super(a1, a2, a3);
+    }
+
+    @Override
+    public void update(Tuple3<A, B, C> value) {
+      updateTuple(value);
+    }
+
+    @SuppressWarnings("unchecked")
+    @Override
+    public Iterable<Tuple3<A, B, C>> results() {
+      Iterable<A> firsts = (Iterable<A>) results(0);
+      Iterable<B> seconds = (Iterable<B>) results(1);
+      Iterable<C> thirds = (Iterable<C>) results(2);
+      return new Tuples.TripIterable<A, B, C>(firsts, seconds, thirds);
+    }
+  }
+
+  /** Aggregates each component of a {@link Tuple4} with its own child aggregator. */
+  private static class QuadAggregator<A, B, C, D> extends TupleAggregator<Tuple4<A, B, C, D>> {
+
+    public QuadAggregator(Aggregator<A> a1, Aggregator<B> a2, Aggregator<C> a3, Aggregator<D> a4) {
+      super(a1, a2, a3, a4);
+    }
+
+    @Override
+    public void update(Tuple4<A, B, C, D> value) {
+      updateTuple(value);
+    }
+
+    @SuppressWarnings("unchecked")
+    @Override
+    public Iterable<Tuple4<A, B, C, D>> results() {
+      Iterable<A> firsts = (Iterable<A>) results(0);
+      Iterable<B> seconds = (Iterable<B>) results(1);
+      Iterable<C> thirds = (Iterable<C>) results(2);
+      Iterable<D> fourths = (Iterable<D>) results(3);
+      return new Tuples.QuadIterable<A, B, C, D>(firsts, seconds, thirds, fourths);
+    }
+  }
+
+ private static class TupleNAggregator extends TupleAggregator<TupleN> {
+ private final int size;
+
+ public TupleNAggregator(Aggregator<?>... aggregators) {
+ super(aggregators);
+ size = aggregators.length;
+ }
+
+ @Override
+ public void update(TupleN value) {
+ updateTuple(value);
+ }
+
+ @Override
+ public Iterable<TupleN> results() {
+ Iterable<?>[] iterables = new Iterable[size];
+ for (int i = 0; i < size; i++) {
+ iterables[i] = results(i);
+ }
+ return new Tuples.TupleNIterable(iterables);
+ }
+ }
+
+  /**
+   * Collects distinct values in a hash set, optionally capped at a maximum
+   * number of elements. A size limit of {@code -1} means "no cap".
+   */
+  private static class SetAggregator<V> extends SimpleAggregator<V> {
+    private final Set<V> elements;
+    private final int sizeLimit;
+
+    /** Creates an aggregator without a size cap. */
+    public SetAggregator() {
+      this(-1);
+    }
+
+    /** Creates an aggregator that stops accepting values once it holds {@code sizeLimit} of them. */
+    public SetAggregator(int sizeLimit) {
+      this.sizeLimit = sizeLimit;
+      this.elements = Sets.newHashSet();
+    }
+
+    @Override
+    public void reset() {
+      elements.clear();
+    }
+
+    @Override
+    public void update(V value) {
+      // With a cap in force, ignore values once the set is full.
+      if (sizeLimit != -1 && elements.size() >= sizeLimit) {
+        return;
+      }
+      elements.add(value);
+    }
+
+    @Override
+    public Iterable<V> results() {
+      return ImmutableList.copyOf(elements);
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/fn/CompositeMapFn.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/fn/CompositeMapFn.java b/crunch-core/src/main/java/org/apache/crunch/fn/CompositeMapFn.java
new file mode 100644
index 0000000..2a8e7d9
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/fn/CompositeMapFn.java
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.fn;
+
+import org.apache.crunch.Emitter;
+import org.apache.crunch.MapFn;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.TaskInputOutputContext;
+
+public class CompositeMapFn<R, S, T> extends MapFn<R, T> {
+
+ private final MapFn<R, S> first;
+ private final MapFn<S, T> second;
+
+ public CompositeMapFn(MapFn<R, S> first, MapFn<S, T> second) {
+ this.first = first;
+ this.second = second;
+ }
+
+ @Override
+ public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
+ first.setContext(context);
+ second.setContext(context);
+ }
+
+ @Override
+ public void initialize() {
+ first.initialize();
+ second.initialize();
+ }
+
+ public MapFn<R, S> getFirst() {
+ return first;
+ }
+
+ public MapFn<S, T> getSecond() {
+ return second;
+ }
+
+ @Override
+ public T map(R input) {
+ return second.map(first.map(input));
+ }
+
+ @Override
+ public void cleanup(Emitter<T> emitter) {
+ first.cleanup(null);
+ second.cleanup(null);
+ }
+
+ @Override
+ public void configure(Configuration conf) {
+ first.configure(conf);
+ second.configure(conf);
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/fn/ExtractKeyFn.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/fn/ExtractKeyFn.java b/crunch-core/src/main/java/org/apache/crunch/fn/ExtractKeyFn.java
new file mode 100644
index 0000000..b8cc9df
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/fn/ExtractKeyFn.java
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.fn;
+
+import org.apache.crunch.MapFn;
+import org.apache.crunch.Pair;
+import org.apache.hadoop.mapreduce.TaskInputOutputContext;
+
+/**
+ * Wraps a key-extracting {@code MapFn} so that each input value is emitted as a
+ * key-value pair of the extracted key and the original value. This is what turns a
+ * {@code PCollection<V>} into a {@code PTable<K, V>}.
+ *
+ * <p>The {@code configure}, {@code setContext} and {@code initialize} callbacks are
+ * forwarded to the wrapped function.
+ */
+public class ExtractKeyFn<K, V> extends MapFn<V, Pair<K, V>> {
+
+  private final MapFn<V, K> mapFn;
+
+  /**
+   * @param mapFn the function that computes the key for a value
+   */
+  public ExtractKeyFn(MapFn<V, K> mapFn) {
+    this.mapFn = mapFn;
+  }
+
+  /**
+   * Forwards job configuration to the wrapped function, in the same way that
+   * {@code CompositeMapFn} and {@code PairMapFn} forward it to the functions they wrap.
+   * Without this the wrapped function's {@code configure} hook would never run.
+   */
+  @Override
+  public void configure(org.apache.hadoop.conf.Configuration conf) {
+    mapFn.configure(conf);
+  }
+
+  @Override
+  public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
+    mapFn.setContext(context);
+  }
+
+  @Override
+  public void initialize() {
+    mapFn.initialize();
+  }
+
+  /**
+   * @return a pair of the key computed from {@code input} and {@code input} itself
+   */
+  @Override
+  public Pair<K, V> map(V input) {
+    return Pair.of(mapFn.map(input), input);
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/fn/FilterFns.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/fn/FilterFns.java b/crunch-core/src/main/java/org/apache/crunch/fn/FilterFns.java
new file mode 100644
index 0000000..8dc4268
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/fn/FilterFns.java
@@ -0,0 +1,112 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.fn;
+
+import org.apache.crunch.FilterFn;
+import org.apache.crunch.FilterFn.AndFn;
+import org.apache.crunch.FilterFn.NotFn;
+import org.apache.crunch.FilterFn.OrFn;
+
+
+/**
+ * A collection of pre-defined {@link FilterFn} implementations.
+ */
+public final class FilterFns {
+  // Note: We delegate to the deprecated implementation classes in FilterFn. When their
+  // time is up, we just move them here.
+
+  private FilterFns() {
+    // static helpers only; never instantiated
+  }
+
+  /**
+   * Accept an entry if both of the given filters accept it, using short-circuit evaluation.
+   * @param fn1 The first function to delegate to
+   * @param fn2 The second function to delegate to
+   * @return The composed filter function
+   */
+  public static <S> FilterFn<S> and(FilterFn<S> fn1, FilterFn<S> fn2) {
+    FilterFn<S> conjunction = new AndFn<S>(fn1, fn2);
+    return conjunction;
+  }
+
+  /**
+   * Accept an entry if all of the given filters accept it, using short-circuit evaluation.
+   * @param fns The functions to delegate to (in the given order)
+   * @return The composed filter function
+   */
+  public static <S> FilterFn<S> and(FilterFn<S>... fns) {
+    FilterFn<S> conjunction = new AndFn<S>(fns);
+    return conjunction;
+  }
+
+  /**
+   * Accept an entry if at least one of the two given filters accepts it, using
+   * short-circuit evaluation.
+   * @param fn1 The first function to delegate to
+   * @param fn2 The second function to delegate to
+   * @return The composed filter function
+   */
+  public static <S> FilterFn<S> or(FilterFn<S> fn1, FilterFn<S> fn2) {
+    FilterFn<S> disjunction = new OrFn<S>(fn1, fn2);
+    return disjunction;
+  }
+
+  /**
+   * Accept an entry if at least one of the given filters accepts it, using
+   * short-circuit evaluation.
+   * @param fns The functions to delegate to (in the given order)
+   * @return The composed filter function
+   */
+  public static <S> FilterFn<S> or(FilterFn<S>... fns) {
+    FilterFn<S> disjunction = new OrFn<S>(fns);
+    return disjunction;
+  }
+
+  /**
+   * Accept an entry if the given filter <em>does not</em> accept it.
+   * @param fn The function to delegate to
+   * @return The composed filter function
+   */
+  public static <S> FilterFn<S> not(FilterFn<S> fn) {
+    FilterFn<S> negation = new NotFn<S>(fn);
+    return negation;
+  }
+
+  /**
+   * Accept everything.
+   * @return A filter function that accepts everything.
+   */
+  public static <S> FilterFn<S> ACCEPT_ALL() {
+    return new AcceptAllFn<S>();
+  }
+
+  /**
+   * Reject everything. Implemented as the negation of an accept-all filter.
+   * @return A filter function that rejects everything.
+   */
+  public static <S> FilterFn<S> REJECT_ALL() {
+    FilterFn<S> acceptAll = new AcceptAllFn<S>();
+    return new NotFn<S>(acceptAll);
+  }
+
+  /** Filter whose {@code accept} always returns true and whose scale factor is 1. */
+  private static class AcceptAllFn<S> extends FilterFn<S> {
+    @Override
+    public float scaleFactor() {
+      return 1.0f;
+    }
+
+    @Override
+    public boolean accept(S input) {
+      return true;
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/fn/IdentityFn.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/fn/IdentityFn.java b/crunch-core/src/main/java/org/apache/crunch/fn/IdentityFn.java
new file mode 100644
index 0000000..0eadb06
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/fn/IdentityFn.java
@@ -0,0 +1,39 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.fn;
+
+import org.apache.crunch.MapFn;
+
+public class IdentityFn<T> extends MapFn<T, T> {
+
+ private static final IdentityFn<Object> INSTANCE = new IdentityFn<Object>();
+
+ @SuppressWarnings("unchecked")
+ public static <T> IdentityFn<T> getInstance() {
+ return (IdentityFn<T>) INSTANCE;
+ }
+
+ // Non-instantiable
+ private IdentityFn() {
+ }
+
+ @Override
+ public T map(T input) {
+ return input;
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/fn/MapKeysFn.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/fn/MapKeysFn.java b/crunch-core/src/main/java/org/apache/crunch/fn/MapKeysFn.java
new file mode 100644
index 0000000..cbaf24d
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/fn/MapKeysFn.java
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.fn;
+
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.Pair;
+
+public abstract class MapKeysFn<K1, K2, V> extends DoFn<Pair<K1, V>, Pair<K2, V>> {
+
+ @Override
+ public void process(Pair<K1, V> input, Emitter<Pair<K2, V>> emitter) {
+ emitter.emit(Pair.of(map(input.first()), input.second()));
+ }
+
+ public abstract K2 map(K1 k1);
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/fn/MapValuesFn.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/fn/MapValuesFn.java b/crunch-core/src/main/java/org/apache/crunch/fn/MapValuesFn.java
new file mode 100644
index 0000000..b90f5ff
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/fn/MapValuesFn.java
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.fn;
+
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.Pair;
+
+public abstract class MapValuesFn<K, V1, V2> extends DoFn<Pair<K, V1>, Pair<K, V2>> {
+
+ @Override
+ public void process(Pair<K, V1> input, Emitter<Pair<K, V2>> emitter) {
+ emitter.emit(Pair.of(input.first(), map(input.second())));
+ }
+
+ public abstract V2 map(V1 v);
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/fn/PairMapFn.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/fn/PairMapFn.java b/crunch-core/src/main/java/org/apache/crunch/fn/PairMapFn.java
new file mode 100644
index 0000000..9ee4336
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/fn/PairMapFn.java
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.fn;
+
+import org.apache.crunch.Emitter;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.Pair;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.TaskInputOutputContext;
+
+/**
+ * A {@link MapFn} over pairs that maps the first element with one function and the
+ * second element with another, producing a new pair.
+ *
+ * <p>The {@code configure}, {@code setContext}, {@code initialize} and {@code cleanup}
+ * callbacks are forwarded to the key function and then to the value function.
+ */
+public class PairMapFn<K, V, S, T> extends MapFn<Pair<K, V>, Pair<S, T>> {
+
+  // Assigned once in the constructor and never replaced, hence final.
+  private final MapFn<K, S> keys;
+  private final MapFn<V, T> values;
+
+  /**
+   * @param keys the function applied to the first element of each pair
+   * @param values the function applied to the second element of each pair
+   */
+  public PairMapFn(MapFn<K, S> keys, MapFn<V, T> values) {
+    this.keys = keys;
+    this.values = values;
+  }
+
+  @Override
+  public void configure(Configuration conf) {
+    keys.configure(conf);
+    values.configure(conf);
+  }
+
+  @Override
+  public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
+    keys.setContext(context);
+    values.setContext(context);
+  }
+
+  @Override
+  public void initialize() {
+    keys.initialize();
+    values.initialize();
+  }
+
+  /**
+   * @return a pair of the mapped first element and the mapped second element
+   */
+  @Override
+  public Pair<S, T> map(Pair<K, V> input) {
+    return Pair.of(keys.map(input.first()), values.map(input.second()));
+  }
+
+  /**
+   * Runs the cleanup hook of both wrapped functions. The supplied emitter is not
+   * handed on; each wrapped function receives {@code null} instead.
+   */
+  @Override
+  public void cleanup(Emitter<Pair<S, T>> emitter) {
+    keys.cleanup(null);
+    values.cleanup(null);
+  }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/fn/package-info.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/fn/package-info.java b/crunch-core/src/main/java/org/apache/crunch/fn/package-info.java
new file mode 100644
index 0000000..acefdff
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/fn/package-info.java
@@ -0,0 +1,22 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Commonly used functions for manipulating collections.
+ */
+package org.apache.crunch.fn;
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/hadoop/mapreduce/TaskAttemptContextFactory.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/hadoop/mapreduce/TaskAttemptContextFactory.java b/crunch-core/src/main/java/org/apache/crunch/hadoop/mapreduce/TaskAttemptContextFactory.java
new file mode 100644
index 0000000..887c051
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/hadoop/mapreduce/TaskAttemptContextFactory.java
@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.hadoop.mapreduce;
+
+import java.lang.reflect.Constructor;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.TaskAttemptID;
+
+/**
+ * A factory class that allows us to hide the fact that {@code TaskAttemptContext} is a class in
+ * Hadoop 1.x.x and an interface in Hadoop 2.x.x.
+ */
+@SuppressWarnings("unchecked")
+public class TaskAttemptContextFactory {
+
+ private static final Log LOG = LogFactory.getLog(TaskAttemptContextFactory.class);
+
+ private static final TaskAttemptContextFactory INSTANCE = new TaskAttemptContextFactory();
+
+ public static TaskAttemptContext create(Configuration conf, TaskAttemptID taskAttemptId) {
+ return INSTANCE.createInternal(conf, taskAttemptId);
+ }
+
+ private Constructor<TaskAttemptContext> taskAttemptConstructor;
+
+ private TaskAttemptContextFactory() {
+ Class<TaskAttemptContext> implClass = TaskAttemptContext.class;
+ if (implClass.isInterface()) {
+ try {
+ implClass = (Class<TaskAttemptContext>) Class.forName(
+ "org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl");
+ } catch (ClassNotFoundException e) {
+ LOG.fatal("Could not find TaskAttemptContextImpl class, exiting", e);
+ }
+ }
+ try {
+ this.taskAttemptConstructor = implClass.getConstructor(Configuration.class, TaskAttemptID.class);
+ } catch (Exception e) {
+ LOG.fatal("Could not access TaskAttemptContext constructor, exiting", e);
+ }
+ }
+
+ private TaskAttemptContext createInternal(Configuration conf, TaskAttemptID taskAttemptId) {
+ try {
+ return (TaskAttemptContext) taskAttemptConstructor.newInstance(conf, taskAttemptId);
+ } catch (Exception e) {
+ LOG.error("Could not construct a TaskAttemptContext instance", e);
+ return null;
+ }
+ }
+}
[09/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/impl/SourceTargetImpl.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/impl/SourceTargetImpl.java b/crunch/src/main/java/org/apache/crunch/io/impl/SourceTargetImpl.java
deleted file mode 100644
index 4d2b88a..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/impl/SourceTargetImpl.java
+++ /dev/null
@@ -1,89 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.impl;
-
-import java.io.IOException;
-
-import org.apache.commons.lang.builder.HashCodeBuilder;
-import org.apache.crunch.Source;
-import org.apache.crunch.SourceTarget;
-import org.apache.crunch.Target;
-import org.apache.crunch.io.OutputHandler;
-import org.apache.crunch.types.PType;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.mapreduce.Job;
-
-class SourceTargetImpl<T> implements SourceTarget<T> {
-
- protected final Source<T> source;
- protected final Target target;
-
- public SourceTargetImpl(Source<T> source, Target target) {
- this.source = source;
- this.target = target;
- }
-
- @Override
- public PType<T> getType() {
- return source.getType();
- }
-
- @Override
- public void configureSource(Job job, int inputId) throws IOException {
- source.configureSource(job, inputId);
- }
-
- @Override
- public long getSize(Configuration configuration) {
- return source.getSize(configuration);
- }
-
- @Override
- public boolean accept(OutputHandler handler, PType<?> ptype) {
- return target.accept(handler, ptype);
- }
-
- @Override
- public <S> SourceTarget<S> asSourceTarget(PType<S> ptype) {
- return target.asSourceTarget(ptype);
- }
-
- @Override
- public boolean equals(Object other) {
- if (other == null || !(other.getClass().equals(getClass()))) {
- return false;
- }
- SourceTargetImpl sti = (SourceTargetImpl) other;
- return source.equals(sti.source) && target.equals(sti.target);
- }
-
- @Override
- public int hashCode() {
- return new HashCodeBuilder().append(source).append(target).toHashCode();
- }
-
- @Override
- public String toString() {
- return source.toString();
- }
-
- @Override
- public void handleExisting(WriteMode strategy, Configuration conf) {
- target.handleExisting(strategy, conf);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/impl/TableSourcePathTargetImpl.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/impl/TableSourcePathTargetImpl.java b/crunch/src/main/java/org/apache/crunch/io/impl/TableSourcePathTargetImpl.java
deleted file mode 100644
index a8ff639..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/impl/TableSourcePathTargetImpl.java
+++ /dev/null
@@ -1,41 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.impl;
-
-import org.apache.crunch.Pair;
-import org.apache.crunch.TableSource;
-import org.apache.crunch.io.FileNamingScheme;
-import org.apache.crunch.io.PathTarget;
-import org.apache.crunch.io.SequentialFileNamingScheme;
-import org.apache.crunch.types.PTableType;
-
-public class TableSourcePathTargetImpl<K, V> extends SourcePathTargetImpl<Pair<K, V>> implements TableSource<K, V> {
-
- public TableSourcePathTargetImpl(TableSource<K, V> source, PathTarget target) {
- this(source, target, new SequentialFileNamingScheme());
- }
-
- public TableSourcePathTargetImpl(TableSource<K, V> source, PathTarget target, FileNamingScheme fileNamingScheme) {
- super(source, target, fileNamingScheme);
- }
-
- @Override
- public PTableType<K, V> getTableType() {
- return ((TableSource<K, V>) source).getTableType();
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/impl/TableSourceTargetImpl.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/impl/TableSourceTargetImpl.java b/crunch/src/main/java/org/apache/crunch/io/impl/TableSourceTargetImpl.java
deleted file mode 100644
index 965b0f9..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/impl/TableSourceTargetImpl.java
+++ /dev/null
@@ -1,35 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.impl;
-
-import org.apache.crunch.Pair;
-import org.apache.crunch.TableSource;
-import org.apache.crunch.Target;
-import org.apache.crunch.types.PTableType;
-
-public class TableSourceTargetImpl<K, V> extends SourceTargetImpl<Pair<K, V>> implements TableSource<K, V> {
-
- public TableSourceTargetImpl(TableSource<K, V> source, Target target) {
- super(source, target);
- }
-
- @Override
- public PTableType<K, V> getTableType() {
- return ((TableSource<K, V>) source).getTableType();
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/package-info.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/package-info.java b/crunch/src/main/java/org/apache/crunch/io/package-info.java
deleted file mode 100644
index 022bc99..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Data input and output for Pipelines.
- */
-package org.apache.crunch.io;
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileHelper.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileHelper.java b/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileHelper.java
deleted file mode 100644
index ba07506..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileHelper.java
+++ /dev/null
@@ -1,35 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.seq;
-
-import org.apache.crunch.MapFn;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.writable.WritableType;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.util.ReflectionUtils;
-
-class SeqFileHelper {
- static <T> Writable newInstance(PType<T> ptype, Configuration conf) {
- return (Writable) ReflectionUtils.newInstance(((WritableType) ptype).getSerializationClass(), conf);
- }
-
- static <T> MapFn<Object, T> getInputMapFn(PType<T> ptype) {
- return ptype.getInputMapFn();
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileReaderFactory.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileReaderFactory.java b/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileReaderFactory.java
deleted file mode 100644
index 3f45644..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileReaderFactory.java
+++ /dev/null
@@ -1,112 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.seq;
-
-import java.io.IOException;
-import java.util.Iterator;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.crunch.MapFn;
-import org.apache.crunch.fn.IdentityFn;
-import org.apache.crunch.io.FileReaderFactory;
-import org.apache.crunch.io.impl.AutoClosingIterator;
-import org.apache.crunch.types.Converter;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.writable.Writables;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.util.ReflectionUtils;
-
-import com.google.common.collect.Iterators;
-import com.google.common.collect.UnmodifiableIterator;
-
-public class SeqFileReaderFactory<T> implements FileReaderFactory<T> {
-
- private static final Log LOG = LogFactory.getLog(SeqFileReaderFactory.class);
-
- private final Converter converter;
- private final MapFn<Object, T> mapFn;
- private final Writable key;
- private final Writable value;
-
- public SeqFileReaderFactory(PType<T> ptype) {
- this.converter = ptype.getConverter();
- this.mapFn = ptype.getInputMapFn();
- if (ptype instanceof PTableType) {
- PTableType ptt = (PTableType) ptype;
- this.key = SeqFileHelper.newInstance(ptt.getKeyType(), null);
- this.value = SeqFileHelper.newInstance(ptt.getValueType(), null);
- } else {
- this.key = NullWritable.get();
- this.value = SeqFileHelper.newInstance(ptype, null);
- }
- }
-
- public SeqFileReaderFactory(Class clazz) {
- PType<T> ptype = Writables.writables(clazz);
- this.converter = ptype.getConverter();
- this.mapFn = ptype.getInputMapFn();
- this.key = NullWritable.get();
- this.value = (Writable) ReflectionUtils.newInstance(clazz, null);
- }
-
- @Override
- public Iterator<T> read(FileSystem fs, final Path path) {
- mapFn.initialize();
- try {
- final SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());
- return new AutoClosingIterator<T>(reader, new UnmodifiableIterator<T>() {
- boolean nextChecked = false;
- boolean hasNext = false;
-
- @Override
- public boolean hasNext() {
- if (nextChecked == true) {
- return hasNext;
- }
- try {
- hasNext = reader.next(key, value);
- nextChecked = true;
- return hasNext;
- } catch (IOException e) {
- LOG.info("Error reading from path: " + path, e);
- return false;
- }
- }
-
- @Override
- public T next() {
- if (!nextChecked && !hasNext()) {
- return null;
- }
- nextChecked = false;
- return mapFn.map(converter.convertInput(key, value));
- }
- });
- } catch (IOException e) {
- LOG.info("Could not read seqfile at path: " + path, e);
- return Iterators.emptyIterator();
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileSource.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileSource.java b/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileSource.java
deleted file mode 100644
index 8fac4ae..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileSource.java
+++ /dev/null
@@ -1,47 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.seq;
-
-import java.io.IOException;
-
-import org.apache.crunch.io.CompositePathIterable;
-import org.apache.crunch.io.ReadableSource;
-import org.apache.crunch.io.impl.FileSourceImpl;
-import org.apache.crunch.types.PType;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
-
-public class SeqFileSource<T> extends FileSourceImpl<T> implements ReadableSource<T> {
-
- public SeqFileSource(Path path, PType<T> ptype) {
- super(path, ptype, SequenceFileInputFormat.class);
- }
-
- @Override
- public Iterable<T> read(Configuration conf) throws IOException {
- FileSystem fs = path.getFileSystem(conf);
- return CompositePathIterable.create(fs, path, new SeqFileReaderFactory<T>(ptype));
- }
-
- @Override
- public String toString() {
- return "SeqFile(" + path.toString() + ")";
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileSourceTarget.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileSourceTarget.java b/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileSourceTarget.java
deleted file mode 100644
index adc739f..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileSourceTarget.java
+++ /dev/null
@@ -1,44 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.seq;
-
-import org.apache.crunch.io.FileNamingScheme;
-import org.apache.crunch.io.SequentialFileNamingScheme;
-import org.apache.crunch.io.impl.ReadableSourcePathTargetImpl;
-import org.apache.crunch.types.PType;
-import org.apache.hadoop.fs.Path;
-
-public class SeqFileSourceTarget<T> extends ReadableSourcePathTargetImpl<T> {
-
- public SeqFileSourceTarget(String path, PType<T> ptype) {
- this(new Path(path), ptype);
- }
-
- public SeqFileSourceTarget(Path path, PType<T> ptype) {
- this(path, ptype, new SequentialFileNamingScheme());
- }
-
- public SeqFileSourceTarget(Path path, PType<T> ptype, FileNamingScheme fileNamingScheme) {
- super(new SeqFileSource<T>(path, ptype), new SeqFileTarget(path), fileNamingScheme);
- }
-
- @Override
- public String toString() {
- return target.toString();
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileTableSource.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileTableSource.java b/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileTableSource.java
deleted file mode 100644
index 7a63272..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileTableSource.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.seq;
-
-import java.io.IOException;
-
-import org.apache.crunch.Pair;
-import org.apache.crunch.io.CompositePathIterable;
-import org.apache.crunch.io.ReadableSource;
-import org.apache.crunch.io.impl.FileTableSourceImpl;
-import org.apache.crunch.types.PTableType;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
-
-/**
- * A {@code TableSource} that uses {@code SequenceFileInputFormat} to read the input
- * file.
- */
-public class SeqFileTableSource<K, V> extends FileTableSourceImpl<K, V> implements ReadableSource<Pair<K, V>> {
-
- public SeqFileTableSource(String path, PTableType<K, V> ptype) {
- this(new Path(path), ptype);
- }
-
- public SeqFileTableSource(Path path, PTableType<K, V> ptype) {
- super(path, ptype, SequenceFileInputFormat.class);
- }
-
- @Override
- public Iterable<Pair<K, V>> read(Configuration conf) throws IOException {
- FileSystem fs = path.getFileSystem(conf);
- return CompositePathIterable.create(fs, path,
- new SeqFileReaderFactory<Pair<K, V>>(getTableType()));
- }
-
- @Override
- public String toString() {
- return "SeqFile(" + path.toString() + ")";
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileTableSourceTarget.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileTableSourceTarget.java b/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileTableSourceTarget.java
deleted file mode 100644
index ebdf319..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileTableSourceTarget.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.seq;
-
-import org.apache.crunch.Pair;
-import org.apache.crunch.TableSourceTarget;
-import org.apache.crunch.io.FileNamingScheme;
-import org.apache.crunch.io.SequentialFileNamingScheme;
-import org.apache.crunch.io.impl.ReadableSourcePathTargetImpl;
-import org.apache.crunch.types.PTableType;
-import org.apache.hadoop.fs.Path;
-
-public class SeqFileTableSourceTarget<K, V> extends ReadableSourcePathTargetImpl<Pair<K, V>> implements
- TableSourceTarget<K, V> {
- private final PTableType<K, V> tableType;
-
- public SeqFileTableSourceTarget(String path, PTableType<K, V> tableType) {
- this(new Path(path), tableType);
- }
-
- public SeqFileTableSourceTarget(Path path, PTableType<K, V> tableType) {
- this(path, tableType, new SequentialFileNamingScheme());
- }
-
- public SeqFileTableSourceTarget(Path path, PTableType<K, V> tableType, FileNamingScheme fileNamingScheme) {
- super(new SeqFileTableSource<K, V>(path, tableType), new SeqFileTarget(path), fileNamingScheme);
- this.tableType = tableType;
- }
-
- @Override
- public PTableType<K, V> getTableType() {
- return tableType;
- }
-
- @Override
- public String toString() {
- return target.toString();
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileTarget.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileTarget.java b/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileTarget.java
deleted file mode 100644
index 60e4739..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/seq/SeqFileTarget.java
+++ /dev/null
@@ -1,55 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.seq;
-
-import org.apache.crunch.SourceTarget;
-import org.apache.crunch.io.FileNamingScheme;
-import org.apache.crunch.io.SequentialFileNamingScheme;
-import org.apache.crunch.io.impl.FileTargetImpl;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
-
-public class SeqFileTarget extends FileTargetImpl {
- public SeqFileTarget(String path) {
- this(new Path(path));
- }
-
- public SeqFileTarget(Path path) {
- this(path, new SequentialFileNamingScheme());
- }
-
- public SeqFileTarget(Path path, FileNamingScheme fileNamingScheme) {
- super(path, SequenceFileOutputFormat.class, fileNamingScheme);
- }
-
- @Override
- public String toString() {
- return "SeqFile(" + path.toString() + ")";
- }
-
- @Override
- public <T> SourceTarget<T> asSourceTarget(PType<T> ptype) {
- if (ptype instanceof PTableType) {
- return new SeqFileTableSourceTarget(path, (PTableType) ptype);
- } else {
- return new SeqFileSourceTarget(path, ptype);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/text/BZip2TextInputFormat.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/text/BZip2TextInputFormat.java b/crunch/src/main/java/org/apache/crunch/io/text/BZip2TextInputFormat.java
deleted file mode 100644
index 67a8870..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/text/BZip2TextInputFormat.java
+++ /dev/null
@@ -1,235 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.text;
-
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.InputSplit;
-import org.apache.hadoop.mapreduce.JobContext;
-import org.apache.hadoop.mapreduce.RecordReader;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-import org.apache.hadoop.mapreduce.lib.input.FileSplit;
-
-class BZip2TextInputFormat extends FileInputFormat<LongWritable, Text> {
- /**
- * Treats keys as offset in file and value as line. Since the input file is
- * compressed, the offset for a particular line is not well-defined. This
- * implementation returns the starting position of a compressed block as the
- * key for every line in that block.
- */
-
- private static class BZip2LineRecordReader extends RecordReader<LongWritable, Text> {
-
- private long start;
-
- private long end;
-
- private long pos;
-
- private CBZip2InputStream in;
-
- private ByteArrayOutputStream buffer = new ByteArrayOutputStream(256);
-
- // flag to indicate if previous character read was Carriage Return ('\r')
- // and the next character was not Line Feed ('\n')
- private boolean CRFollowedByNonLF = false;
-
- // in the case where a Carriage Return ('\r') was not followed by a
- // Line Feed ('\n'), this variable will hold that non Line Feed character
- // that was read from the underlying stream.
- private byte nonLFChar;
-
- /**
- * Provide a bridge to get the bytes from the ByteArrayOutputStream without
- * creating a new byte array.
- */
- private static class TextStuffer extends OutputStream {
- public Text target;
-
- @Override
- public void write(int b) {
- throw new UnsupportedOperationException("write(byte) not supported");
- }
-
- @Override
- public void write(byte[] data, int offset, int len) throws IOException {
- target.clear();
- target.set(data, offset, len);
- }
- }
-
- private TextStuffer bridge = new TextStuffer();
-
- private LongWritable key = new LongWritable();
- private Text value = new Text();
-
- public BZip2LineRecordReader(Configuration job, FileSplit split) throws IOException {
- start = split.getStart();
- end = start + split.getLength();
- final Path file = split.getPath();
-
- // open the file and seek to the start of the split
- FileSystem fs = file.getFileSystem(job);
- FSDataInputStream fileIn = fs.open(split.getPath());
- fileIn.seek(start);
-
- in = new CBZip2InputStream(fileIn, 9, end);
- if (start != 0) {
- // skip first line and re-establish "start".
- // LineRecordReader.readLine(this.in, null);
- readLine(this.in, null);
- start = in.getPos();
- }
- pos = in.getPos();
- }
-
- /*
- * LineRecordReader.readLine() is depricated in HAdoop 0.17. So it is added
- * here locally.
- */
- private long readLine(InputStream in, OutputStream out) throws IOException {
- long bytes = 0;
- while (true) {
- int b = -1;
- if (CRFollowedByNonLF) {
- // In the previous call, a Carriage Return ('\r') was followed
- // by a non Line Feed ('\n') character - in that call we would
- // have not returned the non Line Feed character but would have
- // read it from the stream - lets use that already read character
- // now
- b = nonLFChar;
- CRFollowedByNonLF = false;
- } else {
- b = in.read();
- }
- if (b == -1) {
- break;
- }
- bytes += 1;
-
- byte c = (byte) b;
- if (c == '\n') {
- break;
- }
-
- if (c == '\r') {
- byte nextC = (byte) in.read();
- if (nextC != '\n') {
- CRFollowedByNonLF = true;
- nonLFChar = nextC;
- } else {
- bytes += 1;
- }
- break;
- }
-
- if (out != null) {
- out.write(c);
- }
- }
- return bytes;
- }
-
- /** Read a line. */
- public boolean next(LongWritable key, Text value) throws IOException {
- if (pos > end)
- return false;
-
- key.set(pos); // key is position
- buffer.reset();
- // long bytesRead = LineRecordReader.readLine(in, buffer);
- long bytesRead = readLine(in, buffer);
- if (bytesRead == 0) {
- return false;
- }
- pos = in.getPos();
- // if we have read ahead because we encountered a carriage return
- // char followed by a non line feed char, decrement the pos
- if (CRFollowedByNonLF) {
- pos--;
- }
-
- bridge.target = value;
- buffer.writeTo(bridge);
- return true;
- }
-
- /**
- * Get the progress within the split
- */
- @Override
- public float getProgress() {
- if (start == end) {
- return 0.0f;
- } else {
- return Math.min(1.0f, (pos - start) / (float) (end - start));
- }
- }
-
- @Override
- public void close() throws IOException {
- in.close();
- }
-
- @Override
- public LongWritable getCurrentKey() throws IOException, InterruptedException {
- return key;
- }
-
- @Override
- public Text getCurrentValue() throws IOException, InterruptedException {
- return value;
- }
-
- @Override
- public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
- // no op
- }
-
- @Override
- public boolean nextKeyValue() throws IOException, InterruptedException {
- return next(key, value);
- }
-
- }
-
- @Override
- protected boolean isSplitable(JobContext context, Path file) {
- return true;
- }
-
- @Override
- public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
- try {
- return new BZip2LineRecordReader(context.getConfiguration(), (FileSplit) split);
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
-
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/text/CBZip2InputStream.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/text/CBZip2InputStream.java b/crunch/src/main/java/org/apache/crunch/io/text/CBZip2InputStream.java
deleted file mode 100644
index 92bb787..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/text/CBZip2InputStream.java
+++ /dev/null
@@ -1,980 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.text;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.io.compress.bzip2.BZip2Constants;
-import org.apache.hadoop.mapreduce.InputSplit;
-
-/**
- * An input stream that decompresses from the BZip2 format (without the file
- * header chars) to be read as any other stream.
- *
- * @author <a href="mailto:keiron@aftexsw.com">Keiron Liddle</a>
- */
-class CBZip2InputStream extends InputStream implements BZip2Constants {
- private static void cadvise(String reason) throws IOException {
- throw new IOException(reason);
- }
-
- private static void compressedStreamEOF() throws IOException {
- cadvise("compressedStream EOF");
- }
-
- private void makeMaps() {
- int i;
- nInUse = 0;
- for (i = 0; i < 256; i++) {
- if (inUse[i]) {
- seqToUnseq[nInUse] = (char) i;
- unseqToSeq[i] = (char) nInUse;
- nInUse++;
- }
- }
- }
-
- /*
- * index of the last char in the block, so the block size == last + 1.
- */
- private int last;
-
- /*
- * index in zptr[] of original string after sorting.
- */
- private int origPtr;
-
- /*
- * always: in the range 0 .. 9. The current block size is 100000 * this
- * number.
- */
- private int blockSize100k;
-
- private boolean blockRandomised;
-
- // a buffer to keep the read byte
- private int bsBuff;
-
- // since bzip is bit-aligned at block boundaries there can be a case wherein
- // only few bits out of a read byte are consumed and the remaining bits
- // need to be consumed while processing the next block.
- // indicate how many bits in bsBuff have not been processed yet
- private int bsLive;
- private CRC mCrc = new CRC();
-
- private boolean[] inUse = new boolean[256];
- private int nInUse;
-
- private char[] seqToUnseq = new char[256];
- private char[] unseqToSeq = new char[256];
-
- private char[] selector = new char[MAX_SELECTORS];
- private char[] selectorMtf = new char[MAX_SELECTORS];
-
- private int[] tt;
- private char[] ll8;
-
- /*
- * freq table collected to save a pass over the data during decompression.
- */
- private int[] unzftab = new int[256];
-
- private int[][] limit = new int[N_GROUPS][MAX_ALPHA_SIZE];
- private int[][] base = new int[N_GROUPS][MAX_ALPHA_SIZE];
- private int[][] perm = new int[N_GROUPS][MAX_ALPHA_SIZE];
- private int[] minLens = new int[N_GROUPS];
-
- private FSDataInputStream innerBsStream;
- long readLimit = Long.MAX_VALUE;
-
- public long getReadLimit() {
- return readLimit;
- }
-
- public void setReadLimit(long readLimit) {
- this.readLimit = readLimit;
- }
-
- long readCount;
-
- public long getReadCount() {
- return readCount;
- }
-
- private boolean streamEnd = false;
-
- private int currentChar = -1;
-
- private static final int START_BLOCK_STATE = 1;
- private static final int RAND_PART_A_STATE = 2;
- private static final int RAND_PART_B_STATE = 3;
- private static final int RAND_PART_C_STATE = 4;
- private static final int NO_RAND_PART_A_STATE = 5;
- private static final int NO_RAND_PART_B_STATE = 6;
- private static final int NO_RAND_PART_C_STATE = 7;
-
- private int currentState = START_BLOCK_STATE;
-
- private int storedBlockCRC, storedCombinedCRC;
- private int computedBlockCRC, computedCombinedCRC;
- private boolean checkComputedCombinedCRC = true;
-
- int i2, count, chPrev, ch2;
- int i, tPos;
- int rNToGo = 0;
- int rTPos = 0;
- int j2;
- char z;
-
- // see comment in getPos()
- private long retPos = -1;
- // the position offset which corresponds to the end of the InputSplit that
- // will be processed by this instance
- private long endOffsetOfSplit;
-
- private boolean signalToStopReading;
-
- public CBZip2InputStream(FSDataInputStream zStream, int blockSize, long end) throws IOException {
- endOffsetOfSplit = end;
- // initialize retPos to the beginning of the current InputSplit
- // see comments in getPos() to understand how this is used.
- retPos = zStream.getPos();
- ll8 = null;
- tt = null;
- checkComputedCombinedCRC = blockSize == -1;
- bsSetStream(zStream);
- initialize(blockSize);
- initBlock(blockSize != -1);
- setupBlock();
- }
-
- @Override
- public int read() throws IOException {
- if (streamEnd) {
- return -1;
- } else {
-
- // if we just started reading a bzip block which starts at a position
- // >= end of current split, then we should set up retpos such that
- // after a record is read, future getPos() calls will get a value
- // > end of current split - this way we will read only one record out
- // of this bzip block - the rest of the records from this bzip block
- // should be read by the next map task while processing the next split
- if (signalToStopReading) {
- retPos = endOffsetOfSplit + 1;
- }
-
- int retChar = currentChar;
- switch (currentState) {
- case START_BLOCK_STATE:
- break;
- case RAND_PART_A_STATE:
- break;
- case RAND_PART_B_STATE:
- setupRandPartB();
- break;
- case RAND_PART_C_STATE:
- setupRandPartC();
- break;
- case NO_RAND_PART_A_STATE:
- break;
- case NO_RAND_PART_B_STATE:
- setupNoRandPartB();
- break;
- case NO_RAND_PART_C_STATE:
- setupNoRandPartC();
- break;
- default:
- break;
- }
- return retChar;
- }
- }
-
- /**
- * getPos is used by the caller to know when the processing of the current
- * {@link InputSplit} is complete. In this method, as we read each bzip block,
- * we keep returning the beginning of the {@link InputSplit} as the return
- * value until we hit a block which starts at a position >= end of current
- * split. At that point we should set up retpos such that after a record is
- * read, future getPos() calls will get a value > end of current split - this
- * way we will read only one record out of that bzip block - the rest of the
- * records from that bzip block should be read by the next map task while
- * processing the next split
- *
- * @return
- * @throws IOException
- */
- public long getPos() throws IOException {
- return retPos;
- }
-
- private void initialize(int blockSize) throws IOException {
- if (blockSize == -1) {
- char magic1, magic2;
- char magic3, magic4;
- magic1 = bsGetUChar();
- magic2 = bsGetUChar();
- magic3 = bsGetUChar();
- magic4 = bsGetUChar();
- if (magic1 != 'B' || magic2 != 'Z' || magic3 != 'h' || magic4 < '1' || magic4 > '9') {
- bsFinishedWithStream();
- streamEnd = true;
- return;
- }
- blockSize = magic4 - '0';
- }
-
- setDecompressStructureSizes(blockSize);
- computedCombinedCRC = 0;
- }
-
- private final static long mask = 0xffffffffffffL;
- private final static long eob = 0x314159265359L & mask;
- private final static long eos = 0x177245385090L & mask;
-
- private void initBlock(boolean searchForMagic) throws IOException {
- if (readCount >= readLimit) {
- bsFinishedWithStream();
- streamEnd = true;
- return;
- }
-
- // position before beginning of bzip block header
- long pos = innerBsStream.getPos();
- if (!searchForMagic) {
- char magic1, magic2, magic3, magic4;
- char magic5, magic6;
- magic1 = bsGetUChar();
- magic2 = bsGetUChar();
- magic3 = bsGetUChar();
- magic4 = bsGetUChar();
- magic5 = bsGetUChar();
- magic6 = bsGetUChar();
- if (magic1 == 0x17 && magic2 == 0x72 && magic3 == 0x45 && magic4 == 0x38 && magic5 == 0x50 && magic6 == 0x90) {
- complete();
- return;
- }
-
- if (magic1 != 0x31 || magic2 != 0x41 || magic3 != 0x59 || magic4 != 0x26 || magic5 != 0x53 || magic6 != 0x59) {
- badBlockHeader();
- streamEnd = true;
- return;
- }
- } else {
- long magic = 0;
- for (int i = 0; i < 6; i++) {
- magic <<= 8;
- magic |= bsGetUChar();
- }
- while (magic != eos && magic != eob) {
- magic <<= 1;
- magic &= mask;
- magic |= bsR(1);
- // if we just found the block header, the beginning of the bzip
- // header would be 6 bytes before the current stream position
- // when we eventually break from this while(), if it is because
- // we found a block header then pos will have the correct start
- // of header position
- pos = innerBsStream.getPos() - 6;
- }
- if (magic == eos) {
- complete();
- return;
- }
-
- }
- // if the previous block finished a few bits into the previous byte,
- // then we will first be reading the remaining bits from the previous
- // byte - so logically pos needs to be one behind
- if (bsLive > 0) {
- pos--;
- }
-
- if (pos >= endOffsetOfSplit) {
- // we have reached a block which begins exactly at the next InputSplit
- // or >1 byte into the next InputSplit - lets record this fact
- signalToStopReading = true;
- }
- storedBlockCRC = bsGetInt32();
-
- if (bsR(1) == 1) {
- blockRandomised = true;
- } else {
- blockRandomised = false;
- }
-
- // currBlockNo++;
- getAndMoveToFrontDecode();
-
- mCrc.initialiseCRC();
- currentState = START_BLOCK_STATE;
- }
-
- private void endBlock() throws IOException {
- computedBlockCRC = mCrc.getFinalCRC();
- /* A bad CRC is considered a fatal error. */
- if (storedBlockCRC != computedBlockCRC) {
- crcError();
- }
-
- computedCombinedCRC = (computedCombinedCRC << 1) | (computedCombinedCRC >>> 31);
- computedCombinedCRC ^= computedBlockCRC;
- }
-
- private void complete() throws IOException {
- storedCombinedCRC = bsGetInt32();
- if (checkComputedCombinedCRC && storedCombinedCRC != computedCombinedCRC) {
- crcError();
- }
- if (innerBsStream.getPos() < endOffsetOfSplit) {
- throw new IOException("Encountered additional bytes in the filesplit past the crc block. "
- + "Loading of concatenated bz2 files is not supported");
- }
- bsFinishedWithStream();
- streamEnd = true;
- }
-
- private static void blockOverrun() throws IOException {
- cadvise("block overrun");
- }
-
- private static void badBlockHeader() throws IOException {
- cadvise("bad block header");
- }
-
- private static void crcError() throws IOException {
- cadvise("CRC error");
- }
-
- private void bsFinishedWithStream() {
- if (this.innerBsStream != null) {
- if (this.innerBsStream != System.in) {
- this.innerBsStream = null;
- }
- }
- }
-
- private void bsSetStream(FSDataInputStream f) {
- innerBsStream = f;
- bsLive = 0;
- bsBuff = 0;
- }
-
- final private int readBs() throws IOException {
- readCount++;
- return innerBsStream.read();
- }
-
- private int bsR(int n) throws IOException {
- int v;
- while (bsLive < n) {
- int zzi;
- zzi = readBs();
- if (zzi == -1) {
- compressedStreamEOF();
- }
- bsBuff = (bsBuff << 8) | (zzi & 0xff);
- bsLive += 8;
- }
-
- v = (bsBuff >> (bsLive - n)) & ((1 << n) - 1);
- bsLive -= n;
- return v;
- }
-
- private char bsGetUChar() throws IOException {
- return (char) bsR(8);
- }
-
- private int bsGetint() throws IOException {
- int u = 0;
- u = (u << 8) | bsR(8);
- u = (u << 8) | bsR(8);
- u = (u << 8) | bsR(8);
- u = (u << 8) | bsR(8);
- return u;
- }
-
- private int bsGetIntVS(int numBits) throws IOException {
- return bsR(numBits);
- }
-
- private int bsGetInt32() throws IOException {
- return bsGetint();
- }
-
- private void hbCreateDecodeTables(int[] limit, int[] base, int[] perm, char[] length, int minLen, int maxLen,
- int alphaSize) {
- int pp, i, j, vec;
-
- pp = 0;
- for (i = minLen; i <= maxLen; i++) {
- for (j = 0; j < alphaSize; j++) {
- if (length[j] == i) {
- perm[pp] = j;
- pp++;
- }
- }
- }
-
- for (i = 0; i < MAX_CODE_LEN; i++) {
- base[i] = 0;
- }
- for (i = 0; i < alphaSize; i++) {
- base[length[i] + 1]++;
- }
-
- for (i = 1; i < MAX_CODE_LEN; i++) {
- base[i] += base[i - 1];
- }
-
- for (i = 0; i < MAX_CODE_LEN; i++) {
- limit[i] = 0;
- }
- vec = 0;
-
- for (i = minLen; i <= maxLen; i++) {
- vec += (base[i + 1] - base[i]);
- limit[i] = vec - 1;
- vec <<= 1;
- }
- for (i = minLen + 1; i <= maxLen; i++) {
- base[i] = ((limit[i - 1] + 1) << 1) - base[i];
- }
- }
-
- private void recvDecodingTables() throws IOException {
- char len[][] = new char[N_GROUPS][MAX_ALPHA_SIZE];
- int i, j, t, nGroups, nSelectors, alphaSize;
- int minLen, maxLen;
- boolean[] inUse16 = new boolean[16];
-
- /* Receive the mapping table */
- for (i = 0; i < 16; i++) {
- if (bsR(1) == 1) {
- inUse16[i] = true;
- } else {
- inUse16[i] = false;
- }
- }
-
- for (i = 0; i < 256; i++) {
- inUse[i] = false;
- }
-
- for (i = 0; i < 16; i++) {
- if (inUse16[i]) {
- for (j = 0; j < 16; j++) {
- if (bsR(1) == 1) {
- inUse[i * 16 + j] = true;
- }
- }
- }
- }
-
- makeMaps();
- alphaSize = nInUse + 2;
-
- /* Now the selectors */
- nGroups = bsR(3);
- nSelectors = bsR(15);
- for (i = 0; i < nSelectors; i++) {
- j = 0;
- while (bsR(1) == 1) {
- j++;
- }
- selectorMtf[i] = (char) j;
- }
-
- /* Undo the MTF values for the selectors. */
- {
- char[] pos = new char[N_GROUPS];
- char tmp, v;
- for (v = 0; v < nGroups; v++) {
- pos[v] = v;
- }
-
- for (i = 0; i < nSelectors; i++) {
- v = selectorMtf[i];
- tmp = pos[v];
- while (v > 0) {
- pos[v] = pos[v - 1];
- v--;
- }
- pos[0] = tmp;
- selector[i] = tmp;
- }
- }
-
- /* Now the coding tables */
- for (t = 0; t < nGroups; t++) {
- int curr = bsR(5);
- for (i = 0; i < alphaSize; i++) {
- while (bsR(1) == 1) {
- if (bsR(1) == 0) {
- curr++;
- } else {
- curr--;
- }
- }
- len[t][i] = (char) curr;
- }
- }
-
- /* Create the Huffman decoding tables */
- for (t = 0; t < nGroups; t++) {
- minLen = 32;
- maxLen = 0;
- for (i = 0; i < alphaSize; i++) {
- if (len[t][i] > maxLen) {
- maxLen = len[t][i];
- }
- if (len[t][i] < minLen) {
- minLen = len[t][i];
- }
- }
- hbCreateDecodeTables(limit[t], base[t], perm[t], len[t], minLen, maxLen, alphaSize);
- minLens[t] = minLen;
- }
- }
-
- private void getAndMoveToFrontDecode() throws IOException {
- char[] yy = new char[256];
- int i, j, nextSym, limitLast;
- int EOB, groupNo, groupPos;
-
- limitLast = baseBlockSize * blockSize100k;
- origPtr = bsGetIntVS(24);
-
- recvDecodingTables();
- EOB = nInUse + 1;
- groupNo = -1;
- groupPos = 0;
-
- /*
- * Setting up the unzftab entries here is not strictly necessary, but it
- * does save having to do it later in a separate pass, and so saves a
- * block's worth of cache misses.
- */
- for (i = 0; i <= 255; i++) {
- unzftab[i] = 0;
- }
-
- for (i = 0; i <= 255; i++) {
- yy[i] = (char) i;
- }
-
- last = -1;
-
- {
- int zt, zn, zvec, zj;
- if (groupPos == 0) {
- groupNo++;
- groupPos = G_SIZE;
- }
- groupPos--;
- zt = selector[groupNo];
- zn = minLens[zt];
- zvec = bsR(zn);
- while (zvec > limit[zt][zn]) {
- zn++;
- {
- {
- while (bsLive < 1) {
- int zzi = 0;
- try {
- zzi = readBs();
- } catch (IOException e) {
- compressedStreamEOF();
- }
- if (zzi == -1) {
- compressedStreamEOF();
- }
- bsBuff = (bsBuff << 8) | (zzi & 0xff);
- bsLive += 8;
- }
- }
- zj = (bsBuff >> (bsLive - 1)) & 1;
- bsLive--;
- }
- zvec = (zvec << 1) | zj;
- }
- nextSym = perm[zt][zvec - base[zt][zn]];
- }
-
- while (true) {
-
- if (nextSym == EOB) {
- break;
- }
-
- if (nextSym == RUNA || nextSym == RUNB) {
- char ch;
- int s = -1;
- int N = 1;
- do {
- if (nextSym == RUNA) {
- s = s + (0 + 1) * N;
- } else if (nextSym == RUNB) {
- s = s + (1 + 1) * N;
- }
- N = N * 2;
- {
- int zt, zn, zvec, zj;
- if (groupPos == 0) {
- groupNo++;
- groupPos = G_SIZE;
- }
- groupPos--;
- zt = selector[groupNo];
- zn = minLens[zt];
- zvec = bsR(zn);
- while (zvec > limit[zt][zn]) {
- zn++;
- {
- {
- while (bsLive < 1) {
- int zzi = 0;
- try {
- zzi = readBs();
- } catch (IOException e) {
- compressedStreamEOF();
- }
- if (zzi == -1) {
- compressedStreamEOF();
- }
- bsBuff = (bsBuff << 8) | (zzi & 0xff);
- bsLive += 8;
- }
- }
- zj = (bsBuff >> (bsLive - 1)) & 1;
- bsLive--;
- }
- zvec = (zvec << 1) | zj;
- }
- nextSym = perm[zt][zvec - base[zt][zn]];
- }
- } while (nextSym == RUNA || nextSym == RUNB);
-
- s++;
- ch = seqToUnseq[yy[0]];
- unzftab[ch] += s;
-
- while (s > 0) {
- last++;
- ll8[last] = ch;
- s--;
- }
-
- if (last >= limitLast) {
- blockOverrun();
- }
- continue;
- } else {
- char tmp;
- last++;
- if (last >= limitLast) {
- blockOverrun();
- }
-
- tmp = yy[nextSym - 1];
- unzftab[seqToUnseq[tmp]]++;
- ll8[last] = seqToUnseq[tmp];
-
- /*
- * This loop is hammered during decompression, hence the unrolling.
- *
- * for (j = nextSym-1; j > 0; j--) yy[j] = yy[j-1];
- */
-
- j = nextSym - 1;
- for (; j > 3; j -= 4) {
- yy[j] = yy[j - 1];
- yy[j - 1] = yy[j - 2];
- yy[j - 2] = yy[j - 3];
- yy[j - 3] = yy[j - 4];
- }
- for (; j > 0; j--) {
- yy[j] = yy[j - 1];
- }
-
- yy[0] = tmp;
- {
- int zt, zn, zvec, zj;
- if (groupPos == 0) {
- groupNo++;
- groupPos = G_SIZE;
- }
- groupPos--;
- zt = selector[groupNo];
- zn = minLens[zt];
- zvec = bsR(zn);
- while (zvec > limit[zt][zn]) {
- zn++;
- {
- {
- while (bsLive < 1) {
- int zzi;
- char thech = 0;
- try {
- thech = (char) readBs();
- } catch (IOException e) {
- compressedStreamEOF();
- }
- zzi = thech;
- bsBuff = (bsBuff << 8) | (zzi & 0xff);
- bsLive += 8;
- }
- }
- zj = (bsBuff >> (bsLive - 1)) & 1;
- bsLive--;
- }
- zvec = (zvec << 1) | zj;
- }
- nextSym = perm[zt][zvec - base[zt][zn]];
- }
- continue;
- }
- }
- }
-
- private void setupBlock() throws IOException {
- int[] cftab = new int[257];
- char ch;
-
- cftab[0] = 0;
- for (i = 1; i <= 256; i++) {
- cftab[i] = unzftab[i - 1];
- }
- for (i = 1; i <= 256; i++) {
- cftab[i] += cftab[i - 1];
- }
-
- for (i = 0; i <= last; i++) {
- ch = ll8[i];
- tt[cftab[ch]] = i;
- cftab[ch]++;
- }
- cftab = null;
-
- tPos = tt[origPtr];
-
- count = 0;
- i2 = 0;
- ch2 = 256; /* not a char and not EOF */
-
- if (blockRandomised) {
- rNToGo = 0;
- rTPos = 0;
- setupRandPartA();
- } else {
- setupNoRandPartA();
- }
- }
-
- private void setupRandPartA() throws IOException {
- if (i2 <= last) {
- chPrev = ch2;
- ch2 = ll8[tPos];
- tPos = tt[tPos];
- if (rNToGo == 0) {
- rNToGo = rNums[rTPos];
- rTPos++;
- if (rTPos == 512) {
- rTPos = 0;
- }
- }
- rNToGo--;
- ch2 ^= ((rNToGo == 1) ? 1 : 0);
- i2++;
-
- currentChar = ch2;
- currentState = RAND_PART_B_STATE;
- mCrc.updateCRC(ch2);
- } else {
- endBlock();
- initBlock(false);
- setupBlock();
- }
- }
-
- private void setupNoRandPartA() throws IOException {
- if (i2 <= last) {
- chPrev = ch2;
- ch2 = ll8[tPos];
- tPos = tt[tPos];
- i2++;
-
- currentChar = ch2;
- currentState = NO_RAND_PART_B_STATE;
- mCrc.updateCRC(ch2);
- } else {
- endBlock();
- initBlock(false);
- setupBlock();
- }
- }
-
- private void setupRandPartB() throws IOException {
- if (ch2 != chPrev) {
- currentState = RAND_PART_A_STATE;
- count = 1;
- setupRandPartA();
- } else {
- count++;
- if (count >= 4) {
- z = ll8[tPos];
- tPos = tt[tPos];
- if (rNToGo == 0) {
- rNToGo = rNums[rTPos];
- rTPos++;
- if (rTPos == 512) {
- rTPos = 0;
- }
- }
- rNToGo--;
- z ^= ((rNToGo == 1) ? 1 : 0);
- j2 = 0;
- currentState = RAND_PART_C_STATE;
- setupRandPartC();
- } else {
- currentState = RAND_PART_A_STATE;
- setupRandPartA();
- }
- }
- }
-
- private void setupRandPartC() throws IOException {
- if (j2 < (int) z) {
- currentChar = ch2;
- mCrc.updateCRC(ch2);
- j2++;
- } else {
- currentState = RAND_PART_A_STATE;
- i2++;
- count = 0;
- setupRandPartA();
- }
- }
-
- private void setupNoRandPartB() throws IOException {
- if (ch2 != chPrev) {
- currentState = NO_RAND_PART_A_STATE;
- count = 1;
- setupNoRandPartA();
- } else {
- count++;
- if (count >= 4) {
- z = ll8[tPos];
- tPos = tt[tPos];
- currentState = NO_RAND_PART_C_STATE;
- j2 = 0;
- setupNoRandPartC();
- } else {
- currentState = NO_RAND_PART_A_STATE;
- setupNoRandPartA();
- }
- }
- }
-
- private void setupNoRandPartC() throws IOException {
- if (j2 < (int) z) {
- currentChar = ch2;
- mCrc.updateCRC(ch2);
- j2++;
- } else {
- currentState = NO_RAND_PART_A_STATE;
- i2++;
- count = 0;
- setupNoRandPartA();
- }
- }
-
- private void setDecompressStructureSizes(int newSize100k) {
- if (!(0 <= newSize100k && newSize100k <= 9 && 0 <= blockSize100k && blockSize100k <= 9)) {
- // throw new IOException("Invalid block size");
- }
-
- blockSize100k = newSize100k;
-
- if (newSize100k == 0) {
- return;
- }
-
- int n = baseBlockSize * newSize100k;
- ll8 = new char[n];
- tt = new int[n];
- }
-
- private static class CRC {
- public static int crc32Table[] = { 0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9, 0x130476dc, 0x17c56b6b,
- 0x1a864db2, 0x1e475005, 0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61, 0x350c9b64, 0x31cd86d3, 0x3c8ea00a,
- 0x384fbdbd, 0x4c11db70, 0x48d0c6c7, 0x4593e01e, 0x4152fda9, 0x5f15adac, 0x5bd4b01b, 0x569796c2, 0x52568b75,
- 0x6a1936c8, 0x6ed82b7f, 0x639b0da6, 0x675a1011, 0x791d4014, 0x7ddc5da3, 0x709f7b7a, 0x745e66cd, 0x9823b6e0,
- 0x9ce2ab57, 0x91a18d8e, 0x95609039, 0x8b27c03c, 0x8fe6dd8b, 0x82a5fb52, 0x8664e6e5, 0xbe2b5b58, 0xbaea46ef,
- 0xb7a96036, 0xb3687d81, 0xad2f2d84, 0xa9ee3033, 0xa4ad16ea, 0xa06c0b5d, 0xd4326d90, 0xd0f37027, 0xddb056fe,
- 0xd9714b49, 0xc7361b4c, 0xc3f706fb, 0xceb42022, 0xca753d95, 0xf23a8028, 0xf6fb9d9f, 0xfbb8bb46, 0xff79a6f1,
- 0xe13ef6f4, 0xe5ffeb43, 0xe8bccd9a, 0xec7dd02d, 0x34867077, 0x30476dc0, 0x3d044b19, 0x39c556ae, 0x278206ab,
- 0x23431b1c, 0x2e003dc5, 0x2ac12072, 0x128e9dcf, 0x164f8078, 0x1b0ca6a1, 0x1fcdbb16, 0x018aeb13, 0x054bf6a4,
- 0x0808d07d, 0x0cc9cdca, 0x7897ab07, 0x7c56b6b0, 0x71159069, 0x75d48dde, 0x6b93dddb, 0x6f52c06c, 0x6211e6b5,
- 0x66d0fb02, 0x5e9f46bf, 0x5a5e5b08, 0x571d7dd1, 0x53dc6066, 0x4d9b3063, 0x495a2dd4, 0x44190b0d, 0x40d816ba,
- 0xaca5c697, 0xa864db20, 0xa527fdf9, 0xa1e6e04e, 0xbfa1b04b, 0xbb60adfc, 0xb6238b25, 0xb2e29692, 0x8aad2b2f,
- 0x8e6c3698, 0x832f1041, 0x87ee0df6, 0x99a95df3, 0x9d684044, 0x902b669d, 0x94ea7b2a, 0xe0b41de7, 0xe4750050,
- 0xe9362689, 0xedf73b3e, 0xf3b06b3b, 0xf771768c, 0xfa325055, 0xfef34de2, 0xc6bcf05f, 0xc27dede8, 0xcf3ecb31,
- 0xcbffd686, 0xd5b88683, 0xd1799b34, 0xdc3abded, 0xd8fba05a, 0x690ce0ee, 0x6dcdfd59, 0x608edb80, 0x644fc637,
- 0x7a089632, 0x7ec98b85, 0x738aad5c, 0x774bb0eb, 0x4f040d56, 0x4bc510e1, 0x46863638, 0x42472b8f, 0x5c007b8a,
- 0x58c1663d, 0x558240e4, 0x51435d53, 0x251d3b9e, 0x21dc2629, 0x2c9f00f0, 0x285e1d47, 0x36194d42, 0x32d850f5,
- 0x3f9b762c, 0x3b5a6b9b, 0x0315d626, 0x07d4cb91, 0x0a97ed48, 0x0e56f0ff, 0x1011a0fa, 0x14d0bd4d, 0x19939b94,
- 0x1d528623, 0xf12f560e, 0xf5ee4bb9, 0xf8ad6d60, 0xfc6c70d7, 0xe22b20d2, 0xe6ea3d65, 0xeba91bbc, 0xef68060b,
- 0xd727bbb6, 0xd3e6a601, 0xdea580d8, 0xda649d6f, 0xc423cd6a, 0xc0e2d0dd, 0xcda1f604, 0xc960ebb3, 0xbd3e8d7e,
- 0xb9ff90c9, 0xb4bcb610, 0xb07daba7, 0xae3afba2, 0xaafbe615, 0xa7b8c0cc, 0xa379dd7b, 0x9b3660c6, 0x9ff77d71,
- 0x92b45ba8, 0x9675461f, 0x8832161a, 0x8cf30bad, 0x81b02d74, 0x857130c3, 0x5d8a9099, 0x594b8d2e, 0x5408abf7,
- 0x50c9b640, 0x4e8ee645, 0x4a4ffbf2, 0x470cdd2b, 0x43cdc09c, 0x7b827d21, 0x7f436096, 0x7200464f, 0x76c15bf8,
- 0x68860bfd, 0x6c47164a, 0x61043093, 0x65c52d24, 0x119b4be9, 0x155a565e, 0x18197087, 0x1cd86d30, 0x029f3d35,
- 0x065e2082, 0x0b1d065b, 0x0fdc1bec, 0x3793a651, 0x3352bbe6, 0x3e119d3f, 0x3ad08088, 0x2497d08d, 0x2056cd3a,
- 0x2d15ebe3, 0x29d4f654, 0xc5a92679, 0xc1683bce, 0xcc2b1d17, 0xc8ea00a0, 0xd6ad50a5, 0xd26c4d12, 0xdf2f6bcb,
- 0xdbee767c, 0xe3a1cbc1, 0xe760d676, 0xea23f0af, 0xeee2ed18, 0xf0a5bd1d, 0xf464a0aa, 0xf9278673, 0xfde69bc4,
- 0x89b8fd09, 0x8d79e0be, 0x803ac667, 0x84fbdbd0, 0x9abc8bd5, 0x9e7d9662, 0x933eb0bb, 0x97ffad0c, 0xafb010b1,
- 0xab710d06, 0xa6322bdf, 0xa2f33668, 0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4 };
-
- public CRC() {
- initialiseCRC();
- }
-
- void initialiseCRC() {
- globalCrc = 0xffffffff;
- }
-
- int getFinalCRC() {
- return ~globalCrc;
- }
-
- void updateCRC(int inCh) {
- int temp = (globalCrc >> 24) ^ inCh;
- if (temp < 0) {
- temp = 256 + temp;
- }
- globalCrc = (globalCrc << 8) ^ CRC.crc32Table[temp];
- }
-
- int globalCrc;
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/text/LineParser.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/text/LineParser.java b/crunch/src/main/java/org/apache/crunch/io/text/LineParser.java
deleted file mode 100644
index 9438014..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/text/LineParser.java
+++ /dev/null
@@ -1,125 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.text;
-
-import java.util.Iterator;
-import java.util.List;
-import java.util.StringTokenizer;
-
-import org.apache.crunch.MapFn;
-import org.apache.crunch.Pair;
-import org.apache.crunch.fn.CompositeMapFn;
-import org.apache.crunch.fn.IdentityFn;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-
-import com.google.common.base.Splitter;
-import com.google.common.collect.ImmutableList;
-
-/**
- * An abstraction for parsing the lines of a text file using a {@code PType<T>} to
- * convert the lines of text into a given data type.
- *
- * @param <T> The type returned by the text parsing
- */
-abstract class LineParser<T> {
-
- public static <S> LineParser<S> forType(PType<S> ptype) {
- return new SimpleLineParser<S>(ptype);
- }
-
- public static <K, V> LineParser<Pair<K, V>> forTableType(PTableType<K, V> ptt, String sep) {
- return new KeyValueLineParser<K, V>(ptt, sep);
- }
-
- private MapFn<String, T> mapFn;
-
- public void initialize() {
- mapFn = getMapFn();
- mapFn.initialize();
- }
-
- public T parse(String line) {
- return mapFn.map(line);
- }
-
- protected abstract MapFn<String, T> getMapFn();
-
- private static <T> MapFn<String, T> getMapFnForPType(PType<T> ptype) {
- MapFn ret = null;
- if (String.class.equals(ptype.getTypeClass())) {
- ret = (MapFn) IdentityFn.getInstance();
- } else {
- // Check for a composite MapFn for the PType.
- // Note that this won't work for Avro-- need to solve that.
- ret = ptype.getInputMapFn();
- if (ret instanceof CompositeMapFn) {
- ret = ((CompositeMapFn) ret).getSecond();
- }
- }
- return ret;
- }
-
- private static class SimpleLineParser<S> extends LineParser<S> {
-
- private final PType<S> ptype;
-
- public SimpleLineParser(PType<S> ptype) {
- this.ptype = ptype;
- }
-
- @Override
- protected MapFn<String, S> getMapFn() {
- return getMapFnForPType(ptype);
- }
- }
-
- private static class KeyValueLineParser<K, V> extends LineParser<Pair<K, V>> {
-
- private final PTableType<K, V> ptt;
- private final String sep;
-
- public KeyValueLineParser(PTableType<K, V> ptt, String sep) {
- this.ptt = ptt;
- this.sep = sep;
- }
-
- @Override
- protected MapFn<String, Pair<K, V>> getMapFn() {
- final MapFn<String, K> keyMapFn = getMapFnForPType(ptt.getKeyType());
- final MapFn<String, V> valueMapFn = getMapFnForPType(ptt.getValueType());
-
- return new MapFn<String, Pair<K, V>>() {
- @Override
- public void initialize() {
- keyMapFn.initialize();
- valueMapFn.initialize();
- }
-
- @Override
- public Pair<K, V> map(String input) {
- List<String> kv = ImmutableList.copyOf(Splitter.on(sep).limit(1).split(input));
- if (kv.size() != 2) {
- throw new RuntimeException("Invalid input string: " + input);
- }
- return Pair.of(keyMapFn.map(kv.get(0)), valueMapFn.map(kv.get(1)));
- }
- };
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/text/NLineFileSource.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/text/NLineFileSource.java b/crunch/src/main/java/org/apache/crunch/io/text/NLineFileSource.java
deleted file mode 100644
index 40e2dbd..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/text/NLineFileSource.java
+++ /dev/null
@@ -1,77 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.text;
-
-import java.io.IOException;
-
-import org.apache.crunch.io.CompositePathIterable;
-import org.apache.crunch.io.FormatBundle;
-import org.apache.crunch.io.ReadableSource;
-import org.apache.crunch.io.impl.FileSourceImpl;
-import org.apache.crunch.types.PType;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
-
-/**
- * A {@code Source} instance that uses the {@code NLineInputFormat}, which gives each map
- * task a fraction of the lines in a text file as input. Most useful when running simulations
- * on Hadoop, where each line represents configuration information about each simulation
- * run.
- */
-public class NLineFileSource<T> extends FileSourceImpl<T> implements ReadableSource<T> {
-
- private static FormatBundle getBundle(int linesPerTask) {
- FormatBundle bundle = FormatBundle.forInput(NLineInputFormat.class);
- bundle.set(NLineInputFormat.LINES_PER_MAP, String.valueOf(linesPerTask));
- return bundle;
- }
-
- /**
- * Create a new {@code NLineFileSource} instance.
- *
- * @param path The path to the input data, as a String
- * @param ptype The PType to use for processing the data
- * @param linesPerTask The number of lines from the input each map task will process
- */
- public NLineFileSource(String path, PType<T> ptype, int linesPerTask) {
- this(new Path(path), ptype, linesPerTask);
- }
-
- /**
- * Create a new {@code NLineFileSource} instance.
- *
- * @param path The {@code Path} to the input data
- * @param ptype The PType to use for processing the data
- * @param linesPerTask The number of lines from the input each map task will process
- */
- public NLineFileSource(Path path, PType<T> ptype, int linesPerTask) {
- super(path, ptype, getBundle(linesPerTask));
- }
-
- @Override
- public String toString() {
- return "NLine(" + path + ")";
- }
-
- @Override
- public Iterable<T> read(Configuration conf) throws IOException {
- return CompositePathIterable.create(path.getFileSystem(conf), path,
- new TextFileReaderFactory<T>(LineParser.forType(ptype)));
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/text/TextFileReaderFactory.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/text/TextFileReaderFactory.java b/crunch/src/main/java/org/apache/crunch/io/text/TextFileReaderFactory.java
deleted file mode 100644
index e1fea6e..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/text/TextFileReaderFactory.java
+++ /dev/null
@@ -1,83 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.text;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.util.Iterator;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.crunch.io.FileReaderFactory;
-import org.apache.crunch.io.impl.AutoClosingIterator;
-import org.apache.crunch.types.PType;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-
-import com.google.common.collect.Iterators;
-import com.google.common.collect.UnmodifiableIterator;
-
-public class TextFileReaderFactory<T> implements FileReaderFactory<T> {
-
- private static final Log LOG = LogFactory.getLog(TextFileReaderFactory.class);
-
- private final LineParser<T> parser;
-
- public TextFileReaderFactory(PType<T> ptype) {
- this(LineParser.forType(ptype));
- }
-
- public TextFileReaderFactory(LineParser<T> parser) {
- this.parser = parser;
- }
-
- @Override
- public Iterator<T> read(FileSystem fs, Path path) {
- parser.initialize();
-
- FSDataInputStream is;
- try {
- is = fs.open(path);
- } catch (IOException e) {
- LOG.info("Could not read path: " + path, e);
- return Iterators.emptyIterator();
- }
-
- final BufferedReader reader = new BufferedReader(new InputStreamReader(is));
- return new AutoClosingIterator<T>(reader, new UnmodifiableIterator<T>() {
- private String nextLine;
-
- @Override
- public boolean hasNext() {
- try {
- return (nextLine = reader.readLine()) != null;
- } catch (IOException e) {
- LOG.info("Exception reading text file stream", e);
- return false;
- }
- }
-
- @Override
- public T next() {
- return parser.parse(nextLine);
- }
- });
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/text/TextFileSource.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/text/TextFileSource.java b/crunch/src/main/java/org/apache/crunch/io/text/TextFileSource.java
deleted file mode 100644
index 026fca9..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/text/TextFileSource.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.text;
-
-import java.io.IOException;
-
-import org.apache.crunch.io.CompositePathIterable;
-import org.apache.crunch.io.ReadableSource;
-import org.apache.crunch.io.impl.FileSourceImpl;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.avro.AvroTypeFamily;
-import org.apache.crunch.types.avro.AvroUtf8InputFormat;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
-
-public class TextFileSource<T> extends FileSourceImpl<T> implements ReadableSource<T> {
-
- private static boolean isBZip2(Path path) {
- String strPath = path.toString();
- return strPath.endsWith(".bz") || strPath.endsWith(".bz2");
- }
-
- private static <S> Class<? extends FileInputFormat<?, ?>> getInputFormat(Path path, PType<S> ptype) {
- if (ptype.getFamily().equals(AvroTypeFamily.getInstance())) {
- return AvroUtf8InputFormat.class;
- } else if (isBZip2(path)) {
- return BZip2TextInputFormat.class;
- } else {
- return TextInputFormat.class;
- }
- }
-
- public TextFileSource(Path path, PType<T> ptype) {
- super(path, ptype, getInputFormat(path, ptype));
- }
-
- @Override
- public long getSize(Configuration conf) {
- long sz = super.getSize(conf);
- if (isBZip2(path)) {
- sz *= 10; // Arbitrary compression factor
- }
- return sz;
- }
-
- @Override
- public String toString() {
- return "Text(" + path + ")";
- }
-
- @Override
- public Iterable<T> read(Configuration conf) throws IOException {
- return CompositePathIterable.create(path.getFileSystem(conf), path,
- new TextFileReaderFactory<T>(LineParser.forType(ptype)));
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/text/TextFileSourceTarget.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/text/TextFileSourceTarget.java b/crunch/src/main/java/org/apache/crunch/io/text/TextFileSourceTarget.java
deleted file mode 100644
index 1d1211e..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/text/TextFileSourceTarget.java
+++ /dev/null
@@ -1,44 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.text;
-
-import org.apache.crunch.io.FileNamingScheme;
-import org.apache.crunch.io.SequentialFileNamingScheme;
-import org.apache.crunch.io.impl.ReadableSourcePathTargetImpl;
-import org.apache.crunch.types.PType;
-import org.apache.hadoop.fs.Path;
-
-public class TextFileSourceTarget<T> extends ReadableSourcePathTargetImpl<T> {
-
- public TextFileSourceTarget(String path, PType<T> ptype) {
- this(new Path(path), ptype);
- }
-
- public TextFileSourceTarget(Path path, PType<T> ptype) {
- this(path, ptype, new SequentialFileNamingScheme());
- }
-
- public TextFileSourceTarget(Path path, PType<T> ptype, FileNamingScheme fileNamingScheme) {
- super(new TextFileSource<T>(path, ptype), new TextFileTarget(path), fileNamingScheme);
- }
-
- @Override
- public String toString() {
- return target.toString();
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/text/TextFileTableSource.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/text/TextFileTableSource.java b/crunch/src/main/java/org/apache/crunch/io/text/TextFileTableSource.java
deleted file mode 100644
index 94fc5fd..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/text/TextFileTableSource.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.text;
-
-import java.io.IOException;
-
-import org.apache.crunch.Pair;
-import org.apache.crunch.io.CompositePathIterable;
-import org.apache.crunch.io.FormatBundle;
-import org.apache.crunch.io.ReadableSource;
-import org.apache.crunch.io.impl.FileTableSourceImpl;
-import org.apache.crunch.types.PTableType;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
-
-/**
- * A {@code Source} that uses the {@code KeyValueTextInputFormat} to process
- * input text. If a separator for the keys and values in the text file is not specified,
- * a tab character is used.
- *
- * <p>The separator is written into the input {@code FormatBundle} under two
- * configuration keys and is also kept on the instance for use by {@code read}.
- */
-public class TextFileTableSource<K, V> extends FileTableSourceImpl<K, V>
- implements ReadableSource<Pair<K, V>> {
-
- // CRUNCH-125: Maintain compatibility with both versions of the KeyValueTextInputFormat's
- // configuration field for specifying the separator character.
- private static final String OLD_KV_SEP = "key.value.separator.in.input.line";
- private static final String NEW_KV_SEP = "mapreduce.input.keyvaluelinerecordreader.key.value.separator";
- /**
- * Creates the input bundle for {@code KeyValueTextInputFormat} and stores
- * {@code sep} under both {@code OLD_KV_SEP} and {@code NEW_KV_SEP}.
- */
- private static FormatBundle getBundle(String sep) {
- FormatBundle bundle = FormatBundle.forInput(KeyValueTextInputFormat.class);
- bundle.set(OLD_KV_SEP, sep);
- bundle.set(NEW_KV_SEP, sep);
- return bundle;
- }
- /** Key/value separator given at construction; passed to the {@code LineParser} in {@code read}. */
- private final String separator;
- /** Wraps {@code path} in a {@code Path} and delegates to the two-argument {@code Path} overload. */
- public TextFileTableSource(String path, PTableType<K, V> tableType) {
- this(new Path(path), tableType);
- }
- /** Delegates to the three-argument constructor with {@code "\t"} as the separator. */
- public TextFileTableSource(Path path, PTableType<K, V> tableType) {
- this(path, tableType, "\t");
- }
- /** Wraps {@code path} in a {@code Path} and delegates with the caller's separator. */
- public TextFileTableSource(String path, PTableType<K, V> tableType, String separator) {
- this(new Path(path), tableType, separator);
- }
- /**
- * Passes the path, table type and a bundle configured with {@code separator}
- * to the superclass, then remembers the separator on this instance.
- */
- public TextFileTableSource(Path path, PTableType<K, V> tableType, String separator) {
- super(path, tableType, getBundle(separator));
- this.separator = separator;
- }
- /**
- * Returns {@code "KeyValueText(" + path + ")"}. {@code path} is not declared
- * in this class; presumably it is inherited from the superclass.
- */
- @Override
- public String toString() {
- return "KeyValueText(" + path + ")";
- }
- /**
- * Returns the result of {@code CompositePathIterable.create}, called with the
- * file system resolved from {@code conf}, the path, and a
- * {@code TextFileReaderFactory} whose {@code LineParser} is built from the
- * table type and the stored separator.
- *
- * @throws IOException declared on the signature; propagated from the calls made here
- */
- @Override
- public Iterable<Pair<K, V>> read(Configuration conf) throws IOException {
- return CompositePathIterable.create(path.getFileSystem(conf), path,
- new TextFileReaderFactory<Pair<K, V>>(LineParser.forTableType(getTableType(), separator)));
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/text/TextFileTableSourceTarget.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/text/TextFileTableSourceTarget.java b/crunch/src/main/java/org/apache/crunch/io/text/TextFileTableSourceTarget.java
deleted file mode 100644
index dec97e5..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/text/TextFileTableSourceTarget.java
+++ /dev/null
@@ -1,63 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.text;
-
-import org.apache.crunch.Pair;
-import org.apache.crunch.TableSourceTarget;
-import org.apache.crunch.io.FileNamingScheme;
-import org.apache.crunch.io.SequentialFileNamingScheme;
-import org.apache.crunch.io.impl.ReadableSourcePathTargetImpl;
-import org.apache.crunch.types.PTableType;
-import org.apache.hadoop.fs.Path;
-
-/**
- * A {@code TableSource} and {@code SourceTarget} implementation that uses the
- * {@code KeyValueTextInputFormat} and {@code TextOutputFormat} to support reading
- * and writing text files as {@code PTable} instances using a tab separator for
- * the keys and the values.
- *
- * <p>The source side is a {@code TextFileTableSource} created without an
- * explicit separator; the target side is a {@code TextFileTarget} on the same path.
- */
-public class TextFileTableSourceTarget<K, V> extends ReadableSourcePathTargetImpl<Pair<K, V>> implements
- TableSourceTarget<K, V> {
- /** Table type given at construction; returned unchanged by {@code getTableType}. */
- private final PTableType<K, V> tableType;
- /** Wraps {@code path} in a {@code Path} and delegates to the {@code Path} overload. */
- public TextFileTableSourceTarget(String path, PTableType<K, V> tableType) {
- this(new Path(path), tableType);
- }
- /** Delegates to the three-argument constructor with a new {@code SequentialFileNamingScheme}. */
- public TextFileTableSourceTarget(Path path, PTableType<K, V> tableType) {
- this(path, tableType, new SequentialFileNamingScheme());
- }
- /**
- * Creates the source and the target from {@code path}, forwards them with
- * {@code fileNamingScheme} to the superclass, and stores {@code tableType}.
- */
- public TextFileTableSourceTarget(Path path, PTableType<K, V> tableType,
- FileNamingScheme fileNamingScheme) {
- super(new TextFileTableSource<K, V>(path, tableType), new TextFileTarget(path),
- fileNamingScheme);
- this.tableType = tableType;
- }
- /** Returns the table type supplied to the constructor. */
- @Override
- public PTableType<K, V> getTableType() {
- return tableType;
- }
- /**
- * Returns {@code target.toString()}. {@code target} is not declared in this
- * class; presumably it is a field inherited from the superclass.
- */
- @Override
- public String toString() {
- return target.toString();
- }
-}
[23/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/lib/join/BrokenLeftAndOuterJoinTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/lib/join/BrokenLeftAndOuterJoinTest.java b/crunch-core/src/test/java/org/apache/crunch/lib/join/BrokenLeftAndOuterJoinTest.java
new file mode 100644
index 0000000..7e2e444
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/lib/join/BrokenLeftAndOuterJoinTest.java
@@ -0,0 +1,90 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib.join;
+
+import static org.apache.crunch.test.StringWrapper.wrap;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.verifyNoMoreInteractions;
+
+import java.util.List;
+
+import org.apache.crunch.Emitter;
+import org.apache.crunch.Pair;
+import org.apache.crunch.test.CrunchTestSupport;
+import org.apache.crunch.test.StringWrapper;
+import org.apache.crunch.types.avro.Avros;
+import org.apache.hadoop.conf.Configuration;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+/**
+ * Drives {@code LeftOuterJoinFn} and {@code FullOuterJoinFn} directly with one
+ * left-only and one right-only group and verifies what reaches a mocked
+ * {@code Emitter}.
+ *
+ * <p>NOTE(review): {@code createValuePairList} and the set-up steps repeat what
+ * {@code JoinFnTestBase} in this package provides, and {@code mock(Emitter.class)}
+ * is assigned to a generic {@code Emitter} without the unchecked-warning
+ * suppression that the base class uses for the same call.
+ */
+public class BrokenLeftAndOuterJoinTest {
+ /**
+ * Returns a new one-element list holding {@code Pair.of(leftValue, rightValue)}.
+ */
+ List<Pair<StringWrapper, String>> createValuePairList(StringWrapper leftValue, String rightValue) {
+ Pair<StringWrapper, String> valuePair = Pair.of(leftValue, rightValue);
+ List<Pair<StringWrapper, String>> valuePairList = Lists.newArrayList();
+ valuePairList.add(valuePair);
+ return valuePairList;
+ }
+ /**
+ * Feeds a left-only group and then a right-only group to a
+ * {@code LeftOuterJoinFn} and expects exactly one emit: the left-only key
+ * paired with its left value and a null right value.
+ */
+ @Test
+ public void testOuterJoin() {
+ JoinFn<StringWrapper, StringWrapper, String> joinFn = new LeftOuterJoinFn<StringWrapper, StringWrapper, String>(
+ Avros.reflects(StringWrapper.class),
+ Avros.reflects(StringWrapper.class));
+ joinFn.setContext(CrunchTestSupport.getTestContext(new Configuration()));
+ joinFn.initialize();
+ Emitter<Pair<StringWrapper, Pair<StringWrapper, String>>> emitter = mock(Emitter.class);
+ // NOTE(review): the int argument to join (0, then 1) presumably tags the left/right input - confirm against JoinFn.
+ StringWrapper key = new StringWrapper();
+ StringWrapper leftValue = new StringWrapper();
+ key.setValue("left-only");
+ leftValue.setValue("left-only-left");
+ joinFn.join(key, 0, createValuePairList(leftValue, null), emitter);
+
+ key.setValue("right-only");
+ joinFn.join(key, 1, createValuePairList(null, "right-only-right"), emitter);
+ // Only the left-only pair may have been emitted; nothing for the right-only key.
+ verify(emitter).emit(Pair.of(wrap("left-only"), Pair.of(wrap("left-only-left"), (String) null)));
+ verifyNoMoreInteractions(emitter);
+ }
+ /**
+ * Same input as {@code testOuterJoin}, but against a {@code FullOuterJoinFn};
+ * expects two emits: the left-only pair with a null right value and the
+ * right-only pair with a null left value.
+ */
+ @Test
+ public void testFullJoin() {
+ JoinFn<StringWrapper, StringWrapper, String> joinFn = new FullOuterJoinFn<StringWrapper, StringWrapper, String>(
+ Avros.reflects(StringWrapper.class),
+ Avros.reflects(StringWrapper.class));
+ joinFn.setContext(CrunchTestSupport.getTestContext(new Configuration()));
+ joinFn.initialize();
+ Emitter<Pair<StringWrapper, Pair<StringWrapper, String>>> emitter = mock(Emitter.class);
+ // The key instance is mutated and reused for the second join call below.
+ StringWrapper key = new StringWrapper();
+ StringWrapper leftValue = new StringWrapper();
+ key.setValue("left-only");
+ leftValue.setValue("left-only-left");
+ joinFn.join(key, 0, createValuePairList(leftValue, null), emitter);
+
+ key.setValue("right-only");
+ joinFn.join(key, 1, createValuePairList(null, "right-only-right"), emitter);
+ // Both unmatched sides must be emitted, and nothing else.
+ verify(emitter).emit(Pair.of(wrap("left-only"), Pair.of(wrap("left-only-left"), (String) null)));
+ verify(emitter).emit(Pair.of(wrap("right-only"), Pair.of((StringWrapper)null, "right-only-right")));
+ verifyNoMoreInteractions(emitter);
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/lib/join/FullOuterJoinFnTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/lib/join/FullOuterJoinFnTest.java b/crunch-core/src/test/java/org/apache/crunch/lib/join/FullOuterJoinFnTest.java
new file mode 100644
index 0000000..5cf4f51
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/lib/join/FullOuterJoinFnTest.java
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib.join;
+
+import static org.apache.crunch.test.StringWrapper.wrap;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.verifyNoMoreInteractions;
+
+import org.apache.crunch.Emitter;
+import org.apache.crunch.Pair;
+import org.apache.crunch.test.StringWrapper;
+import org.apache.crunch.types.avro.Avros;
+/**
+ * Runs the scenario defined in {@code JoinFnTestBase} against a
+ * {@code FullOuterJoinFn}.
+ */
+public class FullOuterJoinFnTest extends JoinFnTestBase {
+ /**
+ * Expects exactly three emits: the left-only pair with a null right value,
+ * the matched "both" pair, and the right-only pair with a null left value.
+ */
+ @Override
+ protected void checkOutput(Emitter<Pair<StringWrapper, Pair<StringWrapper, String>>> emitter) {
+ verify(emitter)
+ .emit(Pair.of(wrap("left-only"), Pair.of(wrap("left-only-left"), (String) null)));
+ verify(emitter).emit(Pair.of(wrap("both"), Pair.of(wrap("both-left"), "both-right")));
+ verify(emitter).emit(
+ Pair.of(wrap("right-only"), Pair.of((StringWrapper) null, "right-only-right")));
+ verifyNoMoreInteractions(emitter);
+ }
+ /**
+ * Supplies a new {@code FullOuterJoinFn}, passing
+ * {@code Avros.reflects(StringWrapper.class)} for both constructor arguments.
+ */
+ @Override
+ protected JoinFn<StringWrapper, StringWrapper, String> getJoinFn() {
+ return new FullOuterJoinFn<StringWrapper, StringWrapper, String>(
+ Avros.reflects(StringWrapper.class),
+ Avros.reflects(StringWrapper.class));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/lib/join/InnerJoinFnTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/lib/join/InnerJoinFnTest.java b/crunch-core/src/test/java/org/apache/crunch/lib/join/InnerJoinFnTest.java
new file mode 100644
index 0000000..d2347de
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/lib/join/InnerJoinFnTest.java
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib.join;
+
+import static org.apache.crunch.test.StringWrapper.wrap;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.verifyNoMoreInteractions;
+
+import org.apache.crunch.Emitter;
+import org.apache.crunch.Pair;
+import org.apache.crunch.test.StringWrapper;
+import org.apache.crunch.types.avro.Avros;
+/**
+ * Runs the scenario defined in {@code JoinFnTestBase} against an
+ * {@code InnerJoinFn}.
+ */
+public class InnerJoinFnTest extends JoinFnTestBase {
+ /**
+ * Expects exactly one emit: the matched "both" pair.
+ *
+ * <p>NOTE(review): this implements the abstract {@code checkOutput} of
+ * {@code JoinFnTestBase} but, unlike the sibling join tests, carries no
+ * {@code @Override} annotation.
+ */
+ protected void checkOutput(Emitter<Pair<StringWrapper, Pair<StringWrapper, String>>> joinEmitter) {
+ verify(joinEmitter).emit(Pair.of(wrap("both"), Pair.of(wrap("both-left"), "both-right")));
+ verifyNoMoreInteractions(joinEmitter);
+ }
+ /**
+ * Supplies a new {@code InnerJoinFn}, passing
+ * {@code Avros.reflects(StringWrapper.class)} for both constructor arguments.
+ */
+ @Override
+ protected JoinFn<StringWrapper, StringWrapper, String> getJoinFn() {
+ return new InnerJoinFn<StringWrapper, StringWrapper, String>(
+ Avros.reflects(StringWrapper.class),
+ Avros.reflects(StringWrapper.class));
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/lib/join/JoinFnTestBase.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/lib/join/JoinFnTestBase.java b/crunch-core/src/test/java/org/apache/crunch/lib/join/JoinFnTestBase.java
new file mode 100644
index 0000000..9e4337f
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/lib/join/JoinFnTestBase.java
@@ -0,0 +1,82 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib.join;
+
+import static org.mockito.Mockito.mock;
+
+import java.util.List;
+
+import org.apache.crunch.Emitter;
+import org.apache.crunch.Pair;
+import org.apache.crunch.test.CrunchTestSupport;
+import org.apache.crunch.test.StringWrapper;
+import org.apache.hadoop.conf.Configuration;
+import org.junit.Before;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+/**
+ * Shared scenario for {@code JoinFn} tests. Subclasses supply the join function
+ * through {@code getJoinFn} and assert on the mocked emitter in
+ * {@code checkOutput}.
+ */
+public abstract class JoinFnTestBase {
+ /** Join function under test; assigned from {@code getJoinFn()} in {@code setUp}. */
+ private JoinFn<StringWrapper, StringWrapper, String> joinFn;
+ /** Mockito mock that receives whatever the join function emits. */
+ private Emitter<Pair<StringWrapper, Pair<StringWrapper, String>>> emitter;
+ /**
+ * Obtains the join function from the subclass, gives it a test context built
+ * from a new {@code Configuration}, initializes it, and creates the emitter mock.
+ */
+ // Avoid warnings on generic Emitter mock
+ @SuppressWarnings("unchecked")
+ @Before
+ public void setUp() {
+ joinFn = getJoinFn();
+ joinFn.setContext(CrunchTestSupport.getTestContext(new Configuration()));
+ joinFn.initialize();
+ emitter = mock(Emitter.class);
+ }
+ /**
+ * Calls {@code join} for three keys - "left-only", "both" and "right-only" -
+ * and then hands the emitter to {@code checkOutput}. The same {@code key} and
+ * {@code leftValue} instances are mutated and reused between calls.
+ */
+ @Test
+ public void testJoin() {
+ // NOTE(review): the int argument to join (0 or 1) presumably tags the left/right input - confirm against JoinFn.
+ StringWrapper key = new StringWrapper();
+ StringWrapper leftValue = new StringWrapper();
+ key.setValue("left-only");
+ leftValue.setValue("left-only-left");
+ joinFn.join(key, 0, createValuePairList(leftValue, null), emitter);
+ // "both": the same key is joined once with a left value and once with a right value.
+ key.setValue("both");
+ leftValue.setValue("both-left");
+ joinFn.join(key, 0, createValuePairList(leftValue, null), emitter);
+ joinFn.join(key, 1, createValuePairList(null, "both-right"), emitter);
+
+ key.setValue("right-only");
+ joinFn.join(key, 1, createValuePairList(null, "right-only-right"), emitter);
+
+ checkOutput(emitter);
+
+ }
+ /** Asserts on the emits recorded by {@code emitter} after {@code testJoin} has driven the join function. */
+ protected abstract void checkOutput(Emitter<Pair<StringWrapper, Pair<StringWrapper, String>>> emitter);
+ /** Returns the join function that {@code setUp} configures and {@code testJoin} exercises. */
+ protected abstract JoinFn<StringWrapper, StringWrapper, String> getJoinFn();
+ /**
+ * Returns a new one-element list holding {@code Pair.of(leftValue, rightValue)}.
+ */
+ protected List<Pair<StringWrapper, String>> createValuePairList(StringWrapper leftValue, String rightValue) {
+ Pair<StringWrapper, String> valuePair = Pair.of(leftValue, rightValue);
+ List<Pair<StringWrapper, String>> valuePairList = Lists.newArrayList();
+ valuePairList.add(valuePair);
+ return valuePairList;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/lib/join/LeftOuterJoinTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/lib/join/LeftOuterJoinTest.java b/crunch-core/src/test/java/org/apache/crunch/lib/join/LeftOuterJoinTest.java
new file mode 100644
index 0000000..a90457e
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/lib/join/LeftOuterJoinTest.java
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib.join;
+
+import static org.apache.crunch.test.StringWrapper.wrap;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.verifyNoMoreInteractions;
+
+import org.apache.crunch.Emitter;
+import org.apache.crunch.Pair;
+import org.apache.crunch.test.StringWrapper;
+import org.apache.crunch.types.avro.Avros;
+/**
+ * Runs the scenario defined in {@code JoinFnTestBase} against a
+ * {@code LeftOuterJoinFn}.
+ */
+public class LeftOuterJoinTest extends JoinFnTestBase {
+ /**
+ * Expects exactly two emits: the left-only pair with a null right value and
+ * the matched "both" pair.
+ */
+ @Override
+ protected void checkOutput(Emitter<Pair<StringWrapper, Pair<StringWrapper, String>>> emitter) {
+ verify(emitter)
+ .emit(Pair.of(wrap("left-only"), Pair.of(wrap("left-only-left"), (String) null)));
+ verify(emitter).emit(Pair.of(wrap("both"), Pair.of(wrap("both-left"), "both-right")));
+ verifyNoMoreInteractions(emitter);
+ }
+ /**
+ * Supplies a new {@code LeftOuterJoinFn}, passing
+ * {@code Avros.reflects(StringWrapper.class)} for both constructor arguments.
+ */
+ @Override
+ protected JoinFn<StringWrapper, StringWrapper, String> getJoinFn() {
+ return new LeftOuterJoinFn<StringWrapper, StringWrapper, String>(
+ Avros.reflects(StringWrapper.class),
+ Avros.reflects(StringWrapper.class));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/lib/join/RightOuterJoinFnTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/lib/join/RightOuterJoinFnTest.java b/crunch-core/src/test/java/org/apache/crunch/lib/join/RightOuterJoinFnTest.java
new file mode 100644
index 0000000..7e41284
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/lib/join/RightOuterJoinFnTest.java
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib.join;
+
+import static org.apache.crunch.test.StringWrapper.wrap;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.verifyNoMoreInteractions;
+
+import org.apache.crunch.Emitter;
+import org.apache.crunch.Pair;
+import org.apache.crunch.test.StringWrapper;
+import org.apache.crunch.types.avro.Avros;
+/**
+ * Runs the scenario defined in {@code JoinFnTestBase} against a
+ * {@code RightOuterJoinFn}.
+ */
+public class RightOuterJoinFnTest extends JoinFnTestBase {
+ /**
+ * Expects exactly two emits: the matched "both" pair and the right-only pair
+ * with a null left value.
+ */
+ @Override
+ protected void checkOutput(Emitter<Pair<StringWrapper, Pair<StringWrapper, String>>> emitter) {
+ verify(emitter).emit(Pair.of(wrap("both"), Pair.of(wrap("both-left"), "both-right")));
+ verify(emitter).emit(
+ Pair.of(wrap("right-only"), Pair.of((StringWrapper) null, "right-only-right")));
+ verifyNoMoreInteractions(emitter);
+ }
+ /**
+ * Supplies a new {@code RightOuterJoinFn}, passing
+ * {@code Avros.reflects(StringWrapper.class)} for both constructor arguments.
+ */
+ @Override
+ protected JoinFn<StringWrapper, StringWrapper, String> getJoinFn() {
+ return new RightOuterJoinFn<StringWrapper, StringWrapper, String>(
+ Avros.reflects(StringWrapper.class),
+ Avros.reflects(StringWrapper.class));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/test/CountersTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/test/CountersTest.java b/crunch-core/src/test/java/org/apache/crunch/test/CountersTest.java
new file mode 100644
index 0000000..66f854e
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/test/CountersTest.java
@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.test;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.hadoop.conf.Configuration;
+import org.junit.Test;
+
+/**
+ * A test to verify using counters inside of a unit test works. :)
+ *
+ * <p>NOTE(review): {@code test} and {@code secondTest} are identical and expect
+ * the same totals, which only holds if the counters start from zero in each
+ * test - presumably {@code CrunchTestSupport.getTestContext} resets them; confirm.
+ */
+public class CountersTest {
+ /**
+ * Counter names incremented by {@code CTFn}.
+ * NOTE(review): the semicolon after the closing brace is redundant.
+ */
+ public enum CT {
+ ONE,
+ TWO,
+ THREE
+ };
+ /**
+ * {@code DoFn} whose constructor installs a test context built from a new
+ * {@code Configuration}.
+ */
+ public static class CTFn extends DoFn<String, String> {
+ CTFn() {
+ setContext(CrunchTestSupport.getTestContext(new Configuration()));
+ }
+ /**
+ * Ignores {@code input} and {@code emitter}; increments counters ONE, TWO
+ * and THREE by 1, 4 and 7 respectively.
+ */
+ @Override
+ public void process(String input, Emitter<String> emitter) {
+ getCounter(CT.ONE).increment(1);
+ getCounter(CT.TWO).increment(4);
+ getCounter(CT.THREE).increment(7);
+ }
+ }
+ /**
+ * Calls {@code process} twice on a new {@code CTFn} and expects the counters
+ * read through {@code TestCounters} to be 2, 8 and 14.
+ */
+ @Test
+ public void test() {
+ CTFn fn = new CTFn();
+ fn.process("foo", null);
+ fn.process("bar", null);
+ assertEquals(2L, TestCounters.getCounter(CT.ONE).getValue());
+ assertEquals(8L, TestCounters.getCounter(CT.TWO).getValue());
+ assertEquals(14L, TestCounters.getCounter(CT.THREE).getValue());
+ }
+ /**
+ * Repeats {@code test} with another new {@code CTFn}; the expected totals
+ * are again 2, 8 and 14, not accumulated values.
+ */
+ @Test
+ public void secondTest() {
+ CTFn fn = new CTFn();
+ fn.process("foo", null);
+ fn.process("bar", null);
+ assertEquals(2L, TestCounters.getCounter(CT.ONE).getValue());
+ assertEquals(8L, TestCounters.getCounter(CT.TWO).getValue());
+ assertEquals(14L, TestCounters.getCounter(CT.THREE).getValue());
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/test/StringWrapper.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/test/StringWrapper.java b/crunch-core/src/test/java/org/apache/crunch/test/StringWrapper.java
new file mode 100644
index 0000000..34302b5
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/test/StringWrapper.java
@@ -0,0 +1,102 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.test;
+
+import org.apache.crunch.MapFn;
+
+/**
+ * Simple String wrapper for testing with Avro reflection.
+ *
+ * <p>Holds one mutable {@code String}; equality, hash code and ordering are all
+ * derived from that value.
+ */
+public class StringWrapper implements Comparable<StringWrapper> {
+ /** {@code MapFn} that turns a {@code String} into a {@code StringWrapper} via {@code wrap}. */
+ public static class StringToStringWrapperMapFn extends MapFn<String, StringWrapper> {
+
+ @Override
+ public StringWrapper map(String input) {
+ return wrap(input);
+ }
+
+ }
+ /** {@code MapFn} that returns the wrapped value via {@code getValue()}. */
+ public static class StringWrapperToStringMapFn extends MapFn<StringWrapper, String> {
+
+ @Override
+ public String map(StringWrapper input) {
+ return input.getValue();
+ }
+
+ }
+ /** The wrapped string; replaceable through {@code setValue}. */
+ private String value;
+ /** Creates a wrapper around the empty string. */
+ public StringWrapper() {
+ this("");
+ }
+ /** Creates a wrapper around {@code value}; no null check is performed. */
+ public StringWrapper(String value) {
+ this.value = value;
+ }
+ /**
+ * Orders wrappers by {@code String.compareTo} on the wrapped values.
+ *
+ * <p>NOTE(review): {@code value} is dereferenced on both sides without a null
+ * check, whereas {@code equals} and {@code hashCode} accept a null value.
+ */
+ @Override
+ public int compareTo(StringWrapper o) {
+ return this.value.compareTo(o.value);
+ }
+ /** Returns the wrapped string. */
+ public String getValue() {
+ return value;
+ }
+ /** Replaces the wrapped string. */
+ public void setValue(String value) {
+ this.value = value;
+ }
+ /** Returns {@code 31 + value.hashCode()}, using 0 in place of the hash when {@code value} is null. */
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + ((value == null) ? 0 : value.hashCode());
+ return result;
+ }
+ /**
+ * Two wrappers are equal when they have the same runtime class and their
+ * values are equal; two null values count as equal.
+ */
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj)
+ return true;
+ if (obj == null)
+ return false;
+ if (getClass() != obj.getClass())
+ return false;
+ StringWrapper other = (StringWrapper) obj;
+ if (value == null) {
+ if (other.value != null)
+ return false;
+ } else if (!value.equals(other.value))
+ return false;
+ return true;
+ }
+ /** Returns {@code "StringWrapper [value=" + value + "]"}. */
+ @Override
+ public String toString() {
+ return "StringWrapper [value=" + value + "]";
+ }
+ /** Static factory: returns a new wrapper around {@code value}. */
+ public static StringWrapper wrap(String value) {
+ return new StringWrapper(value);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/types/CollectionDeepCopierTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/types/CollectionDeepCopierTest.java b/crunch-core/src/test/java/org/apache/crunch/types/CollectionDeepCopierTest.java
new file mode 100644
index 0000000..bd7fcd7
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/types/CollectionDeepCopierTest.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotSame;
+import static org.junit.Assert.assertNull;
+
+import java.util.Collection;
+
+import org.apache.crunch.test.Person;
+import org.apache.crunch.types.avro.Avros;
+import org.apache.hadoop.conf.Configuration;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+/**
+ * Tests {@code CollectionDeepCopier} with the {@code Person} record type from
+ * {@code org.apache.crunch.test}.
+ */
+public class CollectionDeepCopierTest {
+ /**
+ * Copies a one-element collection and checks that the copy is equal to the
+ * original while its element is a different instance.
+ */
+ @Test
+ public void testDeepCopy() {
+ Person person = new Person();
+ person.age = 42;
+ person.name = "John Smith";
+ person.siblingnames = Lists.<CharSequence> newArrayList();
+
+ Collection<Person> personCollection = Lists.newArrayList(person);
+ CollectionDeepCopier<Person> collectionDeepCopier = new CollectionDeepCopier<Person>(
+ Avros.records(Person.class));
+ collectionDeepCopier.initialize(new Configuration());
+
+ Collection<Person> deepCopyCollection = collectionDeepCopier.deepCopy(personCollection);
+ // Equal by value, but the contained element must not be the same object.
+ assertEquals(personCollection, deepCopyCollection);
+ assertNotSame(personCollection.iterator().next(), deepCopyCollection.iterator().next());
+ }
+ /**
+ * Checks that {@code deepCopy} returns null when given a null collection.
+ */
+ @Test
+ public void testNullDeepCopy() {
+ CollectionDeepCopier<Person> collectionDeepCopier = new CollectionDeepCopier<Person>(
+ Avros.records(Person.class));
+ collectionDeepCopier.initialize(new Configuration());
+ Collection<Person> nullCollection = null;
+ assertNull(collectionDeepCopier.deepCopy(nullCollection));
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/types/MapDeepCopierTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/types/MapDeepCopierTest.java b/crunch-core/src/test/java/org/apache/crunch/types/MapDeepCopierTest.java
new file mode 100644
index 0000000..c13e4a2
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/types/MapDeepCopierTest.java
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotSame;
+import static org.junit.Assert.assertNull;
+
+import java.util.Map;
+
+import org.apache.crunch.test.StringWrapper;
+import org.apache.crunch.types.avro.Avros;
+import org.apache.hadoop.conf.Configuration;
+import org.junit.Test;
+
+import com.google.common.collect.Maps;
+
+/**
+ * Unit tests for {@link MapDeepCopier}, using Avro-reflect {@link StringWrapper} map values.
+ */
+public class MapDeepCopierTest {
+
+  /** Creates a copier for {@link StringWrapper} values and initializes it with a fresh configuration. */
+  private static MapDeepCopier<StringWrapper> newInitializedCopier() {
+    MapDeepCopier<StringWrapper> copier = new MapDeepCopier<StringWrapper>(Avros.reflects(StringWrapper.class));
+    copier.initialize(new Configuration());
+    return copier;
+  }
+
+  /** The copied map must equal the source map while its value is a distinct instance. */
+  @Test
+  public void testDeepCopy() {
+    String key = "key";
+    StringWrapper value = new StringWrapper("value");
+    Map<String, StringWrapper> source = Maps.newHashMap();
+    source.put(key, value);
+
+    Map<String, StringWrapper> copy = newInitializedCopier().deepCopy(source);
+
+    assertEquals(source, copy);
+    assertNotSame(source.get(key), copy.get(key));
+  }
+
+  /** Copying a {@code null} map must yield {@code null}. */
+  @Test
+  public void testDeepCopy_Null() {
+    Map<String, StringWrapper> absent = null;
+
+    Map<String, StringWrapper> copy = newInitializedCopier().deepCopy(absent);
+
+    assertNull(copy);
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/types/PTypeUtilsTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/types/PTypeUtilsTest.java b/crunch-core/src/test/java/org/apache/crunch/types/PTypeUtilsTest.java
new file mode 100644
index 0000000..e6fd90c
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/types/PTypeUtilsTest.java
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+import java.util.Collection;
+
+import org.apache.avro.Schema;
+import org.apache.avro.util.Utf8;
+import org.apache.crunch.Tuple3;
+import org.apache.crunch.TupleN;
+import org.apache.crunch.types.avro.AvroType;
+import org.apache.crunch.types.avro.AvroTypeFamily;
+import org.apache.crunch.types.avro.Avros;
+import org.apache.crunch.types.writable.WritableTypeFamily;
+import org.apache.crunch.types.writable.Writables;
+import org.apache.hadoop.io.Text;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Tests converting {@link PType}s between the Avro and Writable type families, and looking up
+ * registered and built-in record types.
+ */
+public class PTypeUtilsTest {
+
+  /** Primitive types convert to the matching primitive type of the target family. */
+  @Test
+  public void testPrimitives() {
+    PType<String> avroStrings = AvroTypeFamily.getInstance().as(Writables.strings());
+    assertEquals(Avros.strings(), avroStrings);
+
+    PType<Double> writableDoubles = WritableTypeFamily.getInstance().as(Avros.doubles());
+    assertEquals(Writables.doubles(), writableDoubles);
+  }
+
+  /** A Writable triple converts to Avro with each sub-type converted in position. */
+  @Test
+  public void testTuple3() {
+    PType<Tuple3<String, Float, Integer>> writableTriple = Writables.triples(
+        Writables.strings(), Writables.floats(), Writables.ints());
+
+    PType<Tuple3<String, Float, Integer>> avroTriple = AvroTypeFamily.getInstance().as(writableTriple);
+
+    assertEquals(Avros.strings(), avroTriple.getSubTypes().get(0));
+    assertEquals(Avros.floats(), avroTriple.getSubTypes().get(1));
+    assertEquals(Avros.ints(), avroTriple.getSubTypes().get(2));
+  }
+
+  /** An Avro n-tuple converts to Writables with each sub-type converted in position. */
+  @Test
+  public void testTupleN() {
+    PType<TupleN> avroTuple = Avros.tuples(Avros.strings(), Avros.floats(), Avros.ints());
+
+    PType<TupleN> writableTuple = WritableTypeFamily.getInstance().as(avroTuple);
+
+    assertEquals(Writables.strings(), writableTuple.getSubTypes().get(0));
+    assertEquals(Writables.floats(), writableTuple.getSubTypes().get(1));
+    assertEquals(Writables.ints(), writableTuple.getSubTypes().get(2));
+  }
+
+  /** An Avro collection type converts to a Writable collection with a Writable element type. */
+  @Test
+  public void testWritableCollections() {
+    PType<Collection<String>> avroCollection = Avros.collections(Avros.strings());
+
+    PType<Collection<String>> writableCollection = WritableTypeFamily.getInstance().as(avroCollection);
+
+    assertEquals(Writables.strings(), writableCollection.getSubTypes().get(0));
+  }
+
+  /** A Writable collection type converts to an Avro collection with an Avro element type. */
+  @Test
+  public void testAvroCollections() {
+    PType<Collection<Double>> writableCollection = Writables.collections(Writables.doubles());
+
+    PType<Collection<Double>> avroCollection = AvroTypeFamily.getInstance().as(writableCollection);
+
+    assertEquals(Avros.doubles(), avroCollection.getSubTypes().get(0));
+  }
+
+  /** A type registered through {@code Avros.register} is the one returned by {@code Avros.records}. */
+  @Test
+  public void testAvroRegistered() {
+    Schema stringSchema = Schema.create(Schema.Type.STRING);
+    AvroType<Utf8> utf8Type = new AvroType<Utf8>(Utf8.class, stringSchema, new DeepCopier.NoOpDeepCopier<Utf8>());
+
+    Avros.register(Utf8.class, utf8Type);
+
+    assertEquals(utf8Type, Avros.records(Utf8.class));
+  }
+
+  /** {@code Writables.records} returns a non-null type for the Hadoop {@link Text} class. */
+  @Test
+  public void testWritableBuiltin() {
+    assertNotNull(Writables.records(Text.class));
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/types/PTypesTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/types/PTypesTest.java b/crunch-core/src/test/java/org/apache/crunch/types/PTypesTest.java
new file mode 100644
index 0000000..d7c8811
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/types/PTypesTest.java
@@ -0,0 +1,34 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.UUID;
+
+import org.apache.crunch.types.avro.AvroTypeFamily;
+import org.junit.Test;
+
+/**
+ * Tests for the derived types provided by {@link PTypes}.
+ */
+public class PTypesTest {
+
+  /** Mapping a UUID through the output function and back through the input function returns an equal UUID. */
+  @Test
+  public void testUUID() throws Exception {
+    UUID original = UUID.randomUUID();
+    PType<UUID> uuidType = PTypes.uuid(AvroTypeFamily.getInstance());
+
+    UUID roundTripped = uuidType.getInputMapFn().map(uuidType.getOutputMapFn().map(original));
+
+    assertEquals(original, roundTripped);
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/types/TupleDeepCopierTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/types/TupleDeepCopierTest.java b/crunch-core/src/test/java/org/apache/crunch/types/TupleDeepCopierTest.java
new file mode 100644
index 0000000..e46a680
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/types/TupleDeepCopierTest.java
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotSame;
+import static org.junit.Assert.assertNull;
+
+import org.apache.crunch.Pair;
+import org.apache.crunch.test.Person;
+import org.apache.crunch.types.avro.Avros;
+import org.apache.hadoop.conf.Configuration;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Unit tests for {@link TupleDeepCopier} applied to {@link Pair} tuples of an int and an
+ * Avro {@link Person} record.
+ */
+public class TupleDeepCopierTest {
+
+  /** Creates a (int, Person) pair copier and initializes it with a fresh configuration. */
+  private static DeepCopier<Pair> newInitializedPairCopier() {
+    DeepCopier<Pair> copier = new TupleDeepCopier<Pair>(Pair.class, Avros.ints(), Avros.records(Person.class));
+    copier.initialize(new Configuration());
+    return copier;
+  }
+
+  /** The copied pair must equal the source pair while its record component is a distinct instance. */
+  @Test
+  public void testDeepCopy_Pair() {
+    Person original = new Person();
+    original.age = 42;
+    original.name = "John Doe";
+    original.siblingnames = Lists.<CharSequence> newArrayList();
+    Pair<Integer, Person> source = Pair.of(1, original);
+
+    DeepCopier<Pair> copier = newInitializedPairCopier();
+    Pair<Integer, Person> copy = copier.deepCopy(source);
+
+    assertEquals(source, copy);
+    assertNotSame(source.second(), copy.second());
+  }
+
+  /** A pair whose second component is {@code null} must be copied to an equal pair. */
+  @Test
+  public void testDeepCopy_PairContainingNull() {
+    Pair<Integer, Person> source = Pair.of(1, null);
+
+    DeepCopier<Pair> copier = newInitializedPairCopier();
+    Pair<Integer, Person> copy = copier.deepCopy(source);
+
+    assertEquals(source, copy);
+  }
+
+  /** Copying a {@code null} pair must yield {@code null}. */
+  @Test
+  public void testDeepCopy_NullPair() {
+    Pair<Integer, Person> source = null;
+
+    DeepCopier<Pair> copier = newInitializedPairCopier();
+    Pair<Integer, Person> copy = copier.deepCopy(source);
+
+    assertNull(copy);
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/types/TupleFactoryTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/types/TupleFactoryTest.java b/crunch-core/src/test/java/org/apache/crunch/types/TupleFactoryTest.java
new file mode 100644
index 0000000..0726be2
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/types/TupleFactoryTest.java
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.crunch.Pair;
+import org.apache.crunch.Tuple;
+import org.apache.crunch.Tuple3;
+import org.apache.crunch.Tuple4;
+import org.apache.crunch.TupleN;
+import org.junit.Test;
+
+/**
+ * Tests that {@link TupleFactory#getTupleFactory(Class)} returns the expected factory for the
+ * built-in tuple classes and for a custom {@link Tuple} implementation.
+ */
+public class TupleFactoryTest {
+
+  /** {@link Pair} resolves to {@code TupleFactory.PAIR}. */
+  @Test
+  public void testGetTupleFactory_Pair() {
+    assertEquals(TupleFactory.PAIR, TupleFactory.getTupleFactory(Pair.class));
+  }
+
+  /** {@link Tuple3} resolves to {@code TupleFactory.TUPLE3}. */
+  @Test
+  public void testGetTupleFactory_Tuple3() {
+    assertEquals(TupleFactory.TUPLE3, TupleFactory.getTupleFactory(Tuple3.class));
+  }
+
+  /** {@link Tuple4} resolves to {@code TupleFactory.TUPLE4}. */
+  @Test
+  public void testGetTupleFactory_Tuple4() {
+    assertEquals(TupleFactory.TUPLE4, TupleFactory.getTupleFactory(Tuple4.class));
+  }
+
+  /** {@link TupleN} resolves to {@code TupleFactory.TUPLEN}. */
+  @Test
+  public void testGetTupleFactory_TupleN() {
+    assertEquals(TupleFactory.TUPLEN, TupleFactory.getTupleFactory(TupleN.class));
+  }
+
+  /**
+   * A factory obtained from {@code TupleFactory.create} for a custom tuple class must be the one
+   * that {@code getTupleFactory} returns for that class afterwards.
+   */
+  // This method previously lacked @Test, so the JUnit 4 runner silently skipped it.
+  @Test
+  public void testGetTupleFactory_CustomTupleClass() {
+    TupleFactory<CustomTupleImplementation> customTupleFactory = TupleFactory.create(CustomTupleImplementation.class);
+    assertEquals(customTupleFactory, TupleFactory.getTupleFactory(CustomTupleImplementation.class));
+  }
+
+  /** Minimal {@link Tuple} stub: it has no elements and {@code get} always returns {@code null}. */
+  private static class CustomTupleImplementation implements Tuple {
+
+    @Override
+    public Object get(int index) {
+      return null;
+    }
+
+    @Override
+    public int size() {
+      return 0;
+    }
+
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/types/avro/AvroDeepCopierTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/types/avro/AvroDeepCopierTest.java b/crunch-core/src/test/java/org/apache/crunch/types/avro/AvroDeepCopierTest.java
new file mode 100644
index 0000000..37c13c0
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/types/avro/AvroDeepCopierTest.java
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.avro;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotSame;
+import static org.junit.Assert.assertNull;
+
+import java.util.List;
+
+import org.apache.avro.generic.GenericData.Record;
+import org.apache.crunch.test.Person;
+import org.apache.crunch.types.avro.AvroDeepCopier.AvroSpecificDeepCopier;
+import org.apache.hadoop.conf.Configuration;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+
+public class AvroDeepCopierTest {
+
+ @Test
+ public void testDeepCopySpecific() {
+ Person person = new Person();
+ person.name = "John Doe";
+ person.age = 42;
+ person.siblingnames = Lists.<CharSequence> newArrayList();
+
+ Person deepCopyPerson = new AvroSpecificDeepCopier<Person>(Person.class, Person.SCHEMA$)
+ .deepCopy(person);
+
+ assertEquals(person, deepCopyPerson);
+ assertNotSame(person, deepCopyPerson);
+ }
+
+ @Test
+ public void testDeepCopyGeneric() {
+ Record record = new Record(Person.SCHEMA$);
+ record.put("name", "John Doe");
+ record.put("age", 42);
+ record.put("siblingnames", Lists.newArrayList());
+
+ Record deepCopyRecord = new AvroDeepCopier.AvroGenericDeepCopier(Person.SCHEMA$)
+ .deepCopy(record);
+
+ assertEquals(record, deepCopyRecord);
+ assertNotSame(record, deepCopyRecord);
+ }
+
+ static class ReflectedPerson {
+ String name;
+ int age;
+ List<String> siblingnames;
+
+ @Override
+ public boolean equals(Object other) {
+ if (other == null || !(other instanceof ReflectedPerson)) {
+ return false;
+ }
+ ReflectedPerson that = (ReflectedPerson) other;
+ return name.equals(that.name) && age == that.age && siblingnames.equals(that.siblingnames);
+ }
+ }
+
+ @Test
+ public void testDeepCopyReflect() {
+ ReflectedPerson person = new ReflectedPerson();
+ person.name = "John Doe";
+ person.age = 42;
+ person.siblingnames = Lists.newArrayList();
+
+ AvroDeepCopier<ReflectedPerson> avroDeepCopier = new AvroDeepCopier.AvroReflectDeepCopier<ReflectedPerson>(
+ ReflectedPerson.class, Avros.reflects(ReflectedPerson.class).getSchema());
+ avroDeepCopier.initialize(new Configuration());
+
+ ReflectedPerson deepCopyPerson = avroDeepCopier.deepCopy(person);
+
+ assertEquals(person, deepCopyPerson);
+ assertNotSame(person, deepCopyPerson);
+
+ }
+
+ @Test
+ public void testDeepCopy_Null() {
+ Person person = null;
+
+ Person deepCopyPerson = new AvroSpecificDeepCopier<Person>(Person.class, Person.SCHEMA$)
+ .deepCopy(person);
+
+ assertNull(deepCopyPerson);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/types/avro/AvroGroupedTableTypeTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/types/avro/AvroGroupedTableTypeTest.java b/crunch-core/src/test/java/org/apache/crunch/types/avro/AvroGroupedTableTypeTest.java
new file mode 100644
index 0000000..db9ebdc
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/types/avro/AvroGroupedTableTypeTest.java
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.avro;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotSame;
+import static org.junit.Assert.assertSame;
+
+import java.util.List;
+
+import org.apache.crunch.Pair;
+import org.apache.crunch.test.Person;
+import org.apache.crunch.types.PGroupedTableType;
+import org.apache.hadoop.conf.Configuration;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Tests detaching values through the grouped table type derived from an Avro table type.
+ */
+public class AvroGroupedTableTypeTest {
+
+  /**
+   * Detaching a (key, values) pair must keep the very same key instance and produce values that
+   * are equal to, but not the same instances as, the input records.
+   */
+  @Test
+  public void testGetDetachedValue() {
+    Integer key = 42;
+    Person original = new Person();
+    original.age = 42;
+    original.name = "John Doe";
+    original.siblingnames = Lists.<CharSequence> newArrayList();
+
+    Iterable<Person> people = Lists.newArrayList(original);
+    Pair<Integer, Iterable<Person>> grouped = Pair.of(key, people);
+
+    AvroTableType<Integer, Person> tableType = Avros.tableOf(Avros.ints(), Avros.specifics(Person.class));
+    PGroupedTableType<Integer, Person> groupedType = tableType.getGroupedTableType();
+    groupedType.initialize(new Configuration());
+
+    Pair<Integer, Iterable<Person>> detached = groupedType.getDetachedValue(grouped);
+
+    assertSame(key, detached.first());
+    List<Person> detachedPeople = Lists.newArrayList(detached.second());
+    assertEquals(people, detachedPeople);
+    assertNotSame(original, detachedPeople.get(0));
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/types/avro/AvroTableTypeTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/types/avro/AvroTableTypeTest.java b/crunch-core/src/test/java/org/apache/crunch/types/avro/AvroTableTypeTest.java
new file mode 100644
index 0000000..35d4e5b
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/types/avro/AvroTableTypeTest.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.avro;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotSame;
+import static org.junit.Assert.assertSame;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.crunch.Pair;
+import org.apache.crunch.test.Person;
+import org.apache.crunch.test.StringWrapper;
+import org.apache.hadoop.conf.Configuration;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Unit tests for {@link AvroTableType}: value detaching and reflect-type detection.
+ */
+public class AvroTableTypeTest {
+
+  /**
+   * Detaching a (key, record) pair must keep the very same key instance and produce a record
+   * that is equal to, but not the same instance as, the input record.
+   */
+  @Test
+  public void testGetDetachedValue() {
+    Integer key = 42;
+    Person original = new Person();
+    original.age = 42;
+    original.name = "John Doe";
+    original.siblingnames = Lists.<CharSequence> newArrayList();
+    Pair<Integer, Person> source = Pair.of(key, original);
+
+    AvroTableType<Integer, Person> tableType = Avros.tableOf(Avros.ints(), Avros.specifics(Person.class));
+    tableType.initialize(new Configuration());
+    Pair<Integer, Person> detached = tableType.getDetachedValue(source);
+
+    assertSame(key, detached.first());
+    Person detachedPerson = detached.second();
+    assertEquals(original, detachedPerson);
+    assertNotSame(original, detachedPerson);
+  }
+
+  /** A table whose key type is a reflect type reports {@code hasReflect()}. */
+  @Test
+  public void testIsReflect_ContainsReflectKey() {
+    AvroTableType<StringWrapper, Integer> tableType = Avros.tableOf(Avros.reflects(StringWrapper.class), Avros.ints());
+    assertTrue(tableType.hasReflect());
+  }
+
+  /** A table whose value type is a reflect type reports {@code hasReflect()}. */
+  @Test
+  public void testIsReflect_ContainsReflectValue() {
+    AvroTableType<Integer, StringWrapper> tableType = Avros.tableOf(Avros.ints(), Avros.reflects(StringWrapper.class));
+    assertTrue(tableType.hasReflect());
+  }
+
+  /** A table of two int types does not report {@code hasReflect()}. */
+  @Test
+  public void testReflect_NoReflectKeyOrValue() {
+    AvroTableType<Integer, Integer> tableType = Avros.tableOf(Avros.ints(), Avros.ints());
+    assertFalse(tableType.hasReflect());
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/types/avro/AvroTypeTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/types/avro/AvroTypeTest.java b/crunch-core/src/test/java/org/apache/crunch/types/avro/AvroTypeTest.java
new file mode 100644
index 0000000..a874c63
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/types/avro/AvroTypeTest.java
@@ -0,0 +1,279 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.avro;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotSame;
+import static org.junit.Assert.assertSame;
+import static org.junit.Assert.assertTrue;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericData.Record;
+import org.apache.crunch.Pair;
+import org.apache.crunch.TupleN;
+import org.apache.crunch.test.Person;
+import org.apache.crunch.test.StringWrapper;
+import org.apache.hadoop.conf.Configuration;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
+/**
+ * Unit tests for {@link AvroType}: the {@code hasSpecific()}, {@code isGeneric()} and
+ * {@code hasReflect()} predicates, and {@code getDetachedValue} for several kinds of Avro types.
+ */
+public class AvroTypeTest {
+
+  // ---- hasSpecific() / isGeneric() ----
+
+  @Test
+  public void testIsSpecific_SpecificData() {
+    assertTrue(Avros.records(Person.class).hasSpecific());
+  }
+
+  @Test
+  public void testIsGeneric_SpecificData() {
+    assertFalse(Avros.records(Person.class).isGeneric());
+  }
+
+  @Test
+  public void testIsSpecific_GenericData() {
+    assertFalse(Avros.generics(Person.SCHEMA$).hasSpecific());
+  }
+
+  @Test
+  public void testIsGeneric_GenericData() {
+    assertTrue(Avros.generics(Person.SCHEMA$).isGeneric());
+  }
+
+  @Test
+  public void testIsSpecific_NonAvroClass() {
+    assertFalse(Avros.ints().hasSpecific());
+  }
+
+  @Test
+  public void testIsGeneric_NonAvroClass() {
+    assertFalse(Avros.ints().isGeneric());
+  }
+
+  @Test
+  public void testIsSpecific_SpecificAvroTable() {
+    assertTrue(Avros.tableOf(Avros.strings(), Avros.records(Person.class)).hasSpecific());
+  }
+
+  @Test
+  public void testIsGeneric_SpecificAvroTable() {
+    assertFalse(Avros.tableOf(Avros.strings(), Avros.records(Person.class)).isGeneric());
+  }
+
+  @Test
+  public void testIsSpecific_GenericAvroTable() {
+    assertFalse(Avros.tableOf(Avros.strings(), Avros.generics(Person.SCHEMA$)).hasSpecific());
+  }
+
+  // The table type itself is expected not to be generic even though its value type is.
+  @Test
+  public void testIsGeneric_GenericAvroTable() {
+    assertFalse(Avros.tableOf(Avros.strings(), Avros.generics(Person.SCHEMA$)).isGeneric());
+  }
+
+  // ---- hasReflect() ----
+
+  @Test
+  public void testIsReflect_GenericType() {
+    assertFalse(Avros.generics(Person.SCHEMA$).hasReflect());
+  }
+
+  @Test
+  public void testIsReflect_SpecificType() {
+    assertFalse(Avros.records(Person.class).hasReflect());
+  }
+
+  @Test
+  public void testIsReflect_ReflectSimpleType() {
+    assertTrue(Avros.reflects(StringWrapper.class).hasReflect());
+  }
+
+  @Test
+  public void testIsReflect_NonReflectSubType() {
+    assertFalse(Avros.pairs(Avros.ints(), Avros.ints()).hasReflect());
+  }
+
+  @Test
+  public void testIsReflect_ReflectSubType() {
+    assertTrue(Avros.pairs(Avros.ints(), Avros.reflects(StringWrapper.class)).hasReflect());
+  }
+
+  @Test
+  public void testIsReflect_TableOfNonReflectTypes() {
+    assertFalse(Avros.tableOf(Avros.ints(), Avros.strings()).hasReflect());
+  }
+
+  @Test
+  public void testIsReflect_TableWithReflectKey() {
+    assertTrue(Avros.tableOf(Avros.reflects(StringWrapper.class), Avros.ints()).hasReflect());
+  }
+
+  @Test
+  public void testIsReflect_TableWithReflectValue() {
+    assertTrue(Avros.tableOf(Avros.ints(), Avros.reflects(StringWrapper.class)).hasReflect());
+  }
+
+  @Test
+  public void testReflect_CollectionContainingReflectValue() {
+    assertTrue(Avros.collections(Avros.reflects(StringWrapper.class)).hasReflect());
+  }
+
+  @Test
+  public void testReflect_CollectionNotContainingReflectValue() {
+    assertFalse(Avros.collections(Avros.generics(Person.SCHEMA$)).hasReflect());
+  }
+
+  // ---- getDetachedValue() ----
+
+  /** Detaching an {@code Integer} through the int type returns the very same instance. */
+  @Test
+  public void testGetDetachedValue_AlreadyMappedAvroType() {
+    Integer value = 42;
+    AvroType<Integer> intType = Avros.ints();
+    intType.initialize(new Configuration());
+    Integer detachedValue = intType.getDetachedValue(value);
+    assertSame(value, detachedValue);
+  }
+
+  /** Detaching a generic record yields an equal but distinct record. */
+  @Test
+  public void testGetDetachedValue_GenericAvroType() {
+    AvroType<Record> genericType = Avros.generics(Person.SCHEMA$);
+    genericType.initialize(new Configuration());
+    GenericData.Record record = new GenericData.Record(Person.SCHEMA$);
+    record.put("name", "name value");
+    record.put("age", 42);
+    record.put("siblingnames", Lists.newArrayList());
+
+    Record detachedRecord = genericType.getDetachedValue(record);
+    assertEquals(record, detachedRecord);
+    assertNotSame(record, detachedRecord);
+  }
+
+  /** Builds the {@link Person} fixture shared by the detach tests below. */
+  private Person createPerson() {
+    Person person = new Person();
+    person.name = "name value";
+    person.age = 42;
+    person.siblingnames = Lists.<CharSequence> newArrayList();
+    return person;
+  }
+
+  /** Detaching a specific record yields an equal but distinct record. */
+  @Test
+  public void testGetDetachedValue_SpecificAvroType() {
+    AvroType<Person> specificType = Avros.specifics(Person.class);
+    specificType.initialize(new Configuration());
+    Person person = createPerson();
+    Person detachedPerson = specificType.getDetachedValue(person);
+    assertEquals(person, detachedPerson);
+    assertNotSame(person, detachedPerson);
+  }
+
+  /** Detaching through a type that was never initialized is expected to throw. */
+  @Test(expected = IllegalStateException.class)
+  public void testGetDetachedValue_NotInitialized() {
+    AvroType<Person> specificType = Avros.specifics(Person.class);
+    Person person = createPerson();
+    specificType.getDetachedValue(person);
+  }
+
+  /** Plain class used with the Avro reflect type; equality is by name, age and sibling names. */
+  static class ReflectedPerson {
+    String name;
+    int age;
+    List<String> siblingnames;
+
+    @Override
+    public boolean equals(Object other) {
+      // instanceof is false for null, so no separate null check is needed.
+      if (!(other instanceof ReflectedPerson)) {
+        return false;
+      }
+      ReflectedPerson that = (ReflectedPerson) other;
+      return name.equals(that.name) && age == that.age && siblingnames.equals(that.siblingnames);
+    }
+
+    /** Hashes the same fields that {@link #equals(Object)} compares, so equal objects hash alike. */
+    @Override
+    public int hashCode() {
+      int result = (name == null) ? 0 : name.hashCode();
+      result = 31 * result + age;
+      result = 31 * result + ((siblingnames == null) ? 0 : siblingnames.hashCode());
+      return result;
+    }
+  }
+
+  /** Detaching a reflect-typed object yields an equal but distinct object. */
+  @Test
+  public void testGetDetachedValue_ReflectAvroType() {
+    AvroType<ReflectedPerson> reflectType = Avros.reflects(ReflectedPerson.class);
+    reflectType.initialize(new Configuration());
+    ReflectedPerson rp = new ReflectedPerson();
+    rp.name = "josh";
+    rp.age = 32;
+    rp.siblingnames = Lists.newArrayList();
+    ReflectedPerson detached = reflectType.getDetachedValue(rp);
+    assertEquals(rp, detached);
+    assertNotSame(rp, detached);
+  }
+
+  /** Detaching a pair yields an equal pair whose record component is a distinct instance. */
+  @Test
+  public void testGetDetachedValue_Pair() {
+    Person person = createPerson();
+    AvroType<Pair<Integer, Person>> pairType = Avros.pairs(Avros.ints(),
+        Avros.records(Person.class));
+    pairType.initialize(new Configuration());
+
+    Pair<Integer, Person> inputPair = Pair.of(1, person);
+    Pair<Integer, Person> detachedPair = pairType.getDetachedValue(inputPair);
+
+    assertEquals(inputPair, detachedPair);
+    assertNotSame(inputPair.second(), detachedPair.second());
+  }
+
+  /** Detaching a collection yields an equal collection whose element is a distinct instance. */
+  @Test
+  public void testGetDetachedValue_Collection() {
+    Person person = createPerson();
+    List<Person> personList = Lists.newArrayList(person);
+
+    AvroType<Collection<Person>> collectionType = Avros.collections(Avros.records(Person.class));
+    collectionType.initialize(new Configuration());
+
+    Collection<Person> detachedCollection = collectionType.getDetachedValue(personList);
+
+    assertEquals(personList, detachedCollection);
+    Person detachedPerson = detachedCollection.iterator().next();
+
+    assertNotSame(person, detachedPerson);
+  }
+
+  /** Detaching a map yields an equal map whose value is a distinct instance. */
+  @Test
+  public void testGetDetachedValue_Map() {
+    String key = "key";
+    Person value = createPerson();
+
+    Map<String, Person> stringPersonMap = Maps.newHashMap();
+    stringPersonMap.put(key, value);
+
+    AvroType<Map<String, Person>> mapType = Avros.maps(Avros.records(Person.class));
+    mapType.initialize(new Configuration());
+
+    Map<String, Person> detachedMap = mapType.getDetachedValue(stringPersonMap);
+
+    assertEquals(stringPersonMap, detachedMap);
+    assertNotSame(value, detachedMap.get(key));
+  }
+
+  /** Detaching an n-tuple yields an equal tuple whose record element is a distinct instance. */
+  @Test
+  public void testGetDetachedValue_TupleN() {
+    Person person = createPerson();
+    AvroType<TupleN> ptype = Avros.tuples(Avros.records(Person.class));
+    ptype.initialize(new Configuration());
+    TupleN tuple = new TupleN(person);
+    TupleN detachedTuple = ptype.getDetachedValue(tuple);
+
+    assertEquals(tuple, detachedTuple);
+    assertNotSame(person, detachedTuple.get(0));
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/types/avro/AvrosTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/types/avro/AvrosTest.java b/crunch-core/src/test/java/org/apache/crunch/types/avro/AvrosTest.java
new file mode 100644
index 0000000..5622a56
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/types/avro/AvrosTest.java
@@ -0,0 +1,325 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.avro;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNotSame;
+import static org.junit.Assert.assertTrue;
+
+import java.nio.ByteBuffer;
+import java.util.Collection;
+import java.util.Collections;
+
+import org.apache.avro.Schema;
+import org.apache.avro.Schema.Type;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericData.Record;
+import org.apache.avro.reflect.ReflectData;
+import org.apache.avro.util.Utf8;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Tuple3;
+import org.apache.crunch.Tuple4;
+import org.apache.crunch.TupleN;
+import org.apache.crunch.test.CrunchTestSupport;
+import org.apache.crunch.test.Person;
+import org.apache.crunch.test.StringWrapper;
+import org.apache.crunch.types.DeepCopier;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.mapreduce.TaskInputOutputContext;
+import org.junit.Test;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+
+/**
+ * TODO test Avros.register and Avros.containers
+ */
+public class AvrosTest {
+
+ @Test
+ public void testNulls() throws Exception {
+ Void n = null;
+ testInputOutputFn(Avros.nulls(), n, n);
+ }
+
+ @Test
+ public void testStrings() throws Exception {
+ String s = "abc";
+ Utf8 w = new Utf8(s);
+ testInputOutputFn(Avros.strings(), s, w);
+ }
+
+ @Test
+ public void testInts() throws Exception {
+ int j = 55;
+ testInputOutputFn(Avros.ints(), j, j);
+ }
+
+ @Test
+ public void testLongs() throws Exception {
+ long j = Long.MAX_VALUE;
+ testInputOutputFn(Avros.longs(), j, j);
+ }
+
+ @Test
+ public void testFloats() throws Exception {
+ float j = Float.MIN_VALUE;
+ testInputOutputFn(Avros.floats(), j, j);
+ }
+
+ @Test
+ public void testDoubles() throws Exception {
+ double j = Double.MIN_VALUE;
+ testInputOutputFn(Avros.doubles(), j, j);
+ }
+
+ @Test
+ public void testBooleans() throws Exception {
+ boolean j = true;
+ testInputOutputFn(Avros.booleans(), j, j);
+ }
+
+ @Test
+ public void testBytes() throws Exception {
+ byte[] bytes = new byte[] { 17, 26, -98 };
+ ByteBuffer bb = ByteBuffer.wrap(bytes);
+ testInputOutputFn(Avros.bytes(), bb, bb);
+ }
+
+ @Test
+ public void testCollections() throws Exception {
+ Collection<String> j = Lists.newArrayList();
+ j.add("a");
+ j.add("b");
+ Schema collectionSchema = Schema.createArray(Schema.createUnion(ImmutableList.of(Avros.strings().getSchema(),
+ Schema.create(Type.NULL))));
+ GenericData.Array<Utf8> w = new GenericData.Array<Utf8>(2, collectionSchema);
+ w.add(new Utf8("a"));
+ w.add(new Utf8("b"));
+ testInputOutputFn(Avros.collections(Avros.strings()), j, w);
+ }
+
+ @Test
+ public void testNestedTables() throws Exception {
+ PTableType<Long, Long> pll = Avros.tableOf(Avros.longs(), Avros.longs());
+ String schema = Avros.tableOf(pll, Avros.strings()).getSchema().toString();
+ assertNotNull(schema);
+ }
+
+ @Test
+ public void testPairs() throws Exception {
+ AvroType<Pair<String, String>> at = Avros.pairs(Avros.strings(), Avros.strings());
+ Pair<String, String> j = Pair.of("a", "b");
+ GenericData.Record w = new GenericData.Record(at.getSchema());
+ w.put(0, new Utf8("a"));
+ w.put(1, new Utf8("b"));
+ testInputOutputFn(at, j, w);
+ }
+
+ @Test
+ public void testPairEquals() throws Exception {
+ AvroType<Pair<Long, ByteBuffer>> at1 = Avros.pairs(Avros.longs(), Avros.bytes());
+ AvroType<Pair<Long, ByteBuffer>> at2 = Avros.pairs(Avros.longs(), Avros.bytes());
+ assertEquals(at1, at2);
+ assertEquals(at1.hashCode(), at2.hashCode());
+ }
+
+ @Test
+ @SuppressWarnings("rawtypes")
+ public void testTriples() throws Exception {
+ AvroType at = Avros.triples(Avros.strings(), Avros.strings(), Avros.strings());
+ Tuple3 j = Tuple3.of("a", "b", "c");
+ GenericData.Record w = new GenericData.Record(at.getSchema());
+ w.put(0, new Utf8("a"));
+ w.put(1, new Utf8("b"));
+ w.put(2, new Utf8("c"));
+ testInputOutputFn(at, j, w);
+ }
+
+ @Test
+ @SuppressWarnings("rawtypes")
+ public void testQuads() throws Exception {
+ AvroType at = Avros.quads(Avros.strings(), Avros.strings(), Avros.strings(), Avros.strings());
+ Tuple4 j = Tuple4.of("a", "b", "c", "d");
+ GenericData.Record w = new GenericData.Record(at.getSchema());
+ w.put(0, new Utf8("a"));
+ w.put(1, new Utf8("b"));
+ w.put(2, new Utf8("c"));
+ w.put(3, new Utf8("d"));
+ testInputOutputFn(at, j, w);
+ }
+
+ @Test
+ @SuppressWarnings("rawtypes")
+ public void testTupleN() throws Exception {
+ AvroType at = Avros.tuples(Avros.strings(), Avros.strings(), Avros.strings(), Avros.strings(), Avros.strings());
+ TupleN j = new TupleN("a", "b", "c", "d", "e");
+ GenericData.Record w = new GenericData.Record(at.getSchema());
+ w.put(0, new Utf8("a"));
+ w.put(1, new Utf8("b"));
+ w.put(2, new Utf8("c"));
+ w.put(3, new Utf8("d"));
+ w.put(4, new Utf8("e"));
+ testInputOutputFn(at, j, w);
+
+ }
+
+ @Test
+ @SuppressWarnings("rawtypes")
+ public void testWritables() throws Exception {
+ AvroType at = Avros.writables(LongWritable.class);
+
+ TaskInputOutputContext<?, ?, ?, ?> testContext = CrunchTestSupport.getTestContext(new Configuration());
+ at.getInputMapFn().setContext(testContext);
+ at.getInputMapFn().initialize();
+ at.getOutputMapFn().setContext(testContext);
+ at.getOutputMapFn().initialize();
+
+ LongWritable lw = new LongWritable(1729L);
+ assertEquals(lw, at.getInputMapFn().map(at.getOutputMapFn().map(lw)));
+ }
+
+ @Test
+ @SuppressWarnings("rawtypes")
+ public void testTableOf() throws Exception {
+ AvroType at = Avros.tableOf(Avros.strings(), Avros.strings());
+ Pair<String, String> j = Pair.of("a", "b");
+ org.apache.avro.mapred.Pair w = new org.apache.avro.mapred.Pair(at.getSchema());
+ w.put(0, new Utf8("a"));
+ w.put(1, new Utf8("b"));
+ // TODO update this after resolving the o.a.a.m.Pair.equals issue
+ initialize(at);
+ assertEquals(j, at.getInputMapFn().map(w));
+ org.apache.avro.mapred.Pair converted = (org.apache.avro.mapred.Pair) at.getOutputMapFn().map(j);
+ assertEquals(w.key(), converted.key());
+ assertEquals(w.value(), converted.value());
+ }
+
+ private static void initialize(PType ptype) {
+ ptype.getInputMapFn().initialize();
+ ptype.getOutputMapFn().initialize();
+ }
+
+ @SuppressWarnings({ "unchecked", "rawtypes" })
+ protected static void testInputOutputFn(PType ptype, Object java, Object avro) {
+ initialize(ptype);
+ assertEquals(java, ptype.getInputMapFn().map(avro));
+ assertEquals(avro, ptype.getOutputMapFn().map(java));
+ }
+
+ @Test
+ public void testIsPrimitive_PrimitiveMappedType() {
+ assertTrue(Avros.isPrimitive(Avros.ints()));
+ }
+
+ @Test
+ public void testIsPrimitive_TruePrimitiveValue() {
+ AvroType truePrimitiveAvroType = new AvroType(int.class, Schema.create(Type.INT), new DeepCopier.NoOpDeepCopier());
+ assertTrue(Avros.isPrimitive(truePrimitiveAvroType));
+ }
+
+ @Test
+ public void testIsPrimitive_False() {
+ assertFalse(Avros.isPrimitive(Avros.reflects(Person.class)));
+ }
+
+ @Test
+ public void testPairs_Generic() {
+ Schema schema = ReflectData.get().getSchema(IntWritable.class);
+
+ GenericData.Record recordA = new GenericData.Record(schema);
+ GenericData.Record recordB = new GenericData.Record(schema);
+
+ AvroType<Pair<Record, Record>> pairType = Avros.pairs(Avros.generics(schema), Avros.generics(schema));
+ Pair<Record, Record> pair = Pair.of(recordA, recordB);
+ pairType.getOutputMapFn().initialize();
+ pairType.getInputMapFn().initialize();
+ Object mapped = pairType.getOutputMapFn().map(pair);
+ Pair<Record, Record> doubleMappedPair = pairType.getInputMapFn().map(mapped);
+
+ assertEquals(pair, doubleMappedPair);
+ mapped.hashCode();
+ }
+
+ @Test
+ public void testPairs_Reflect() {
+ IntWritable intWritableA = new IntWritable(1);
+ IntWritable intWritableB = new IntWritable(2);
+
+ AvroType<Pair<IntWritable, IntWritable>> pairType = Avros.pairs(Avros.reflects(IntWritable.class),
+ Avros.reflects(IntWritable.class));
+ Pair<IntWritable, IntWritable> pair = Pair.of(intWritableA, intWritableB);
+ pairType.getOutputMapFn().initialize();
+ pairType.getInputMapFn().initialize();
+ Object mapped = pairType.getOutputMapFn().map(pair);
+
+ Pair<IntWritable, IntWritable> doubleMappedPair = pairType.getInputMapFn().map(mapped);
+
+ assertEquals(pair, doubleMappedPair);
+ }
+
+ @Test
+ public void testPairs_Specific() {
+ Person personA = new Person();
+ Person personB = new Person();
+
+ personA.age = 1;
+ personA.name = "A";
+ personA.siblingnames = Collections.<CharSequence> emptyList();
+
+ personB.age = 2;
+ personB.name = "B";
+ personB.siblingnames = Collections.<CharSequence> emptyList();
+
+ AvroType<Pair<Person, Person>> pairType = Avros.pairs(Avros.records(Person.class), Avros.records(Person.class));
+
+ Pair<Person, Person> pair = Pair.of(personA, personB);
+ pairType.getOutputMapFn().initialize();
+ pairType.getInputMapFn().initialize();
+
+ Object mapped = pairType.getOutputMapFn().map(pair);
+ Pair<Person, Person> doubleMappedPair = pairType.getInputMapFn().map(mapped);
+
+ assertEquals(pair, doubleMappedPair);
+
+ }
+
+ @Test
+ public void testPairOutputMapFn_VerifyNoObjectReuse() {
+ StringWrapper stringWrapper = new StringWrapper("Test");
+
+ Pair<Integer, StringWrapper> pair = Pair.of(1, stringWrapper);
+
+ AvroType<Pair<Integer, StringWrapper>> pairType = Avros.pairs(Avros.ints(), Avros.reflects(StringWrapper.class));
+
+ pairType.getOutputMapFn().initialize();
+
+ Object outputMappedValueA = pairType.getOutputMapFn().map(pair);
+ Object outputMappedValueB = pairType.getOutputMapFn().map(pair);
+
+ assertEquals(outputMappedValueA, outputMappedValueB);
+ assertNotSame(outputMappedValueA, outputMappedValueB);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/types/writable/GenericArrayWritableTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/types/writable/GenericArrayWritableTest.java b/crunch-core/src/test/java/org/apache/crunch/types/writable/GenericArrayWritableTest.java
new file mode 100644
index 0000000..c807a90
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/types/writable/GenericArrayWritableTest.java
@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.writable;
+
+import static org.hamcrest.Matchers.hasItems;
+import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.not;
+import static org.hamcrest.Matchers.sameInstance;
+import static org.junit.Assert.assertThat;
+
+import java.util.Arrays;
+
+import org.apache.crunch.test.Tests;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.junit.Test;
+
+
+public class GenericArrayWritableTest {
+
+ /** An empty array passed through {@code Tests.roundtrip} comes back with length zero. */
+ @Test
+ public void testEmpty() {
+ GenericArrayWritable<Text> original = new GenericArrayWritable<Text>(Text.class);
+ original.set(new Text[0]);
+
+ GenericArrayWritable<Text> copy = Tests.roundtrip(original, new GenericArrayWritable<Text>());
+
+ assertThat(copy.get().length, is(0));
+ }
+
+ /** Two elements come back from {@code Tests.roundtrip} in a different array instance. */
+ @Test
+ public void testNonEmpty() {
+ GenericArrayWritable<Text> original = new GenericArrayWritable<Text>(Text.class);
+ original.set(new Text[] { new Text("foo"), new Text("bar") });
+
+ GenericArrayWritable<Text> copy = Tests.roundtrip(original, new GenericArrayWritable<Text>());
+
+ // The backing arrays must differ, while length and contents are retained.
+ assertThat(original.get(), not(sameInstance(copy.get())));
+ assertThat(copy.get().length, is(2));
+ assertThat(Arrays.asList(copy.get()), hasItems((Writable) new Text("foo"), new Text("bar")));
+ }
+
+ /** A null element in the middle of the array is retained by {@code Tests.roundtrip}. */
+ @Test
+ public void testNulls() {
+ GenericArrayWritable<Text> original = new GenericArrayWritable<Text>(Text.class);
+ original.set(new Text[] { new Text("a"), null, new Text("b") });
+
+ GenericArrayWritable<Text> copy = Tests.roundtrip(original, new GenericArrayWritable<Text>());
+
+ assertThat(original.get(), not(sameInstance(copy.get())));
+ assertThat(copy.get().length, is(3));
+ assertThat(Arrays.asList(copy.get()), hasItems((Writable) new Text("a"), new Text("b"), null));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/types/writable/WritableDeepCopierTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/types/writable/WritableDeepCopierTest.java b/crunch-core/src/test/java/org/apache/crunch/types/writable/WritableDeepCopierTest.java
new file mode 100644
index 0000000..c49491b
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/types/writable/WritableDeepCopierTest.java
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.writable;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotSame;
+import static org.junit.Assert.assertNull;
+
+import org.apache.hadoop.io.Text;
+import org.junit.Before;
+import org.junit.Test;
+
+public class WritableDeepCopierTest {
+
+ /** Copier under test; re-created before every test method. */
+ private WritableDeepCopier<Text> copier;
+
+ @Before
+ public void setUp() {
+ copier = new WritableDeepCopier<Text>(Text.class);
+ }
+
+ /** A copied Text is equal to the source but is a different object. */
+ @Test
+ public void testDeepCopy() {
+ Text source = new Text("value");
+ Text copy = copier.deepCopy(source);
+
+ assertEquals(source, copy);
+ assertNotSame(source, copy);
+ }
+
+ /** Copying a null reference yields null. */
+ @Test
+ public void testDeepCopy_Null() {
+ Text source = null;
+ Text copy = copier.deepCopy(source);
+
+ assertNull(copy);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/types/writable/WritableGroupedTableTypeTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/types/writable/WritableGroupedTableTypeTest.java b/crunch-core/src/test/java/org/apache/crunch/types/writable/WritableGroupedTableTypeTest.java
new file mode 100644
index 0000000..f6c201b
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/types/writable/WritableGroupedTableTypeTest.java
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.writable;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotSame;
+import static org.junit.Assert.assertSame;
+
+import java.util.List;
+
+import org.apache.crunch.Pair;
+import org.apache.crunch.types.PGroupedTableType;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+
+public class WritableGroupedTableTypeTest {
+
+ /**
+  * Detaching a grouped (key, values) pair keeps the same Integer key and
+  * yields equal Text values that are different objects.
+  */
+ @Test
+ public void testGetDetachedValue() {
+ Integer key = 42;
+ Text originalText = new Text("forty-two");
+ Iterable<Text> originalValues = Lists.newArrayList(originalText);
+ Pair<Integer, Iterable<Text>> grouped = Pair.of(key, originalValues);
+
+ PGroupedTableType<Integer, Text> groupedType = Writables.tableOf(Writables.ints(),
+ Writables.writables(Text.class)).getGroupedTableType();
+ groupedType.initialize(new Configuration());
+
+ Pair<Integer, Iterable<Text>> detached = groupedType.getDetachedValue(grouped);
+
+ assertSame(key, detached.first());
+
+ // Materialize the detached iterable so it can be compared and indexed.
+ List<Text> detachedValues = Lists.newArrayList(detached.second());
+ assertEquals(originalValues, detachedValues);
+ assertNotSame(originalText, detachedValues.get(0));
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/types/writable/WritableTableTypeTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/types/writable/WritableTableTypeTest.java b/crunch-core/src/test/java/org/apache/crunch/types/writable/WritableTableTypeTest.java
new file mode 100644
index 0000000..697a28c
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/types/writable/WritableTableTypeTest.java
@@ -0,0 +1,47 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.writable;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotSame;
+import static org.junit.Assert.assertSame;
+
+import org.apache.crunch.Pair;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.junit.Test;
+
+public class WritableTableTypeTest {
+
+ /**
+  * Detaching a (key, value) pair keeps the same Integer key and yields an
+  * equal Text value that is a different object.
+  */
+ @Test
+ public void testGetDetachedValue() {
+ Integer key = 42;
+ Text originalText = new Text("forty-two");
+ Pair<Integer, Text> source = Pair.of(key, originalText);
+
+ WritableTableType<Integer, Text> ptype = Writables.tableOf(Writables.ints(),
+ Writables.writables(Text.class));
+ ptype.initialize(new Configuration());
+
+ Pair<Integer, Text> detached = ptype.getDetachedValue(source);
+ Text detachedText = detached.second();
+
+ assertSame(key, detached.first());
+ assertEquals(originalText, detachedText);
+ assertNotSame(originalText, detachedText);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/types/writable/WritableTypeTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/types/writable/WritableTypeTest.java b/crunch-core/src/test/java/org/apache/crunch/types/writable/WritableTypeTest.java
new file mode 100644
index 0000000..65e946b
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/types/writable/WritableTypeTest.java
@@ -0,0 +1,97 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.writable;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotSame;
+
+import java.util.Collection;
+import java.util.Map;
+
+import org.apache.crunch.Pair;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.MapWritable;
+import org.apache.hadoop.io.Text;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
+public class WritableTypeTest {
+
+ /** Detaching a value from a type that was never initialized must throw IllegalStateException. */
+ @Test(expected = IllegalStateException.class)
+ public void testGetDetachedValue_NotInitialized() {
+ WritableType<Text, Text> uninitializedType = Writables.writables(Text.class);
+ Text input = new Text("test");
+
+ // initialize(...) is deliberately not called, so this call is the one
+ // expected to raise the exception
+ uninitializedType.getDetachedValue(input);
+ }
+
+ /** A detached Text is equal to the source but is a different object. */
+ @Test
+ public void testGetDetachedValue_CustomWritable() {
+ WritableType<Text, Text> textType = Writables.writables(Text.class);
+ textType.initialize(new Configuration());
+ Text input = new Text("test");
+
+ Text detached = textType.getDetachedValue(input);
+
+ assertEquals(input, detached);
+ assertNotSame(input, detached);
+ }
+
+ /** A detached collection equals the source while its first element is a different object. */
+ @Test
+ public void testGetDetachedValue_Collection() {
+ Collection<Text> source = Lists.newArrayList(new Text("value"));
+ WritableType<Collection<Text>, GenericArrayWritable<Text>> collectionType = Writables
+ .collections(Writables.writables(Text.class));
+ collectionType.initialize(new Configuration());
+
+ Collection<Text> detached = collectionType.getDetachedValue(source);
+
+ assertEquals(source, detached);
+ assertNotSame(source.iterator().next(), detached.iterator().next());
+ }
+
+ /** A detached pair equals the source while both components are different objects. */
+ @Test
+ public void testGetDetachedValue_Tuple() {
+ Pair<Text, Text> source = Pair.of(new Text("one"), new Text("two"));
+ WritableType<Pair<Text, Text>, TupleWritable> pairType = Writables.pairs(
+ Writables.writables(Text.class), Writables.writables(Text.class));
+ pairType.initialize(new Configuration());
+
+ Pair<Text, Text> detached = pairType.getDetachedValue(source);
+
+ assertEquals(source, detached);
+ assertNotSame(source.first(), detached.first());
+ assertNotSame(source.second(), detached.second());
+ }
+
+ /** A detached map equals the source while the value under "key" is a different object. */
+ @Test
+ public void testGetDetachedValue_Map() {
+ Map<String, Text> source = Maps.newHashMap();
+ source.put("key", new Text("value"));
+
+ WritableType<Map<String, Text>, MapWritable> mapType = Writables.maps(Writables.writables(Text.class));
+ mapType.initialize(new Configuration());
+
+ Map<String, Text> detached = mapType.getDetachedValue(source);
+
+ assertEquals(source, detached);
+ assertNotSame(source.get("key"), detached.get("key"));
+ }
+
+}
[24/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/fn/AggregatorsTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/fn/AggregatorsTest.java b/crunch-core/src/test/java/org/apache/crunch/fn/AggregatorsTest.java
new file mode 100644
index 0000000..6ee1972
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/fn/AggregatorsTest.java
@@ -0,0 +1,239 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.fn;
+
+import static org.apache.crunch.fn.Aggregators.MAX_BIGINTS;
+import static org.apache.crunch.fn.Aggregators.MAX_DOUBLES;
+import static org.apache.crunch.fn.Aggregators.MAX_FLOATS;
+import static org.apache.crunch.fn.Aggregators.MAX_INTS;
+import static org.apache.crunch.fn.Aggregators.MAX_LONGS;
+import static org.apache.crunch.fn.Aggregators.MAX_N;
+import static org.apache.crunch.fn.Aggregators.MIN_BIGINTS;
+import static org.apache.crunch.fn.Aggregators.MIN_DOUBLES;
+import static org.apache.crunch.fn.Aggregators.MIN_FLOATS;
+import static org.apache.crunch.fn.Aggregators.MIN_INTS;
+import static org.apache.crunch.fn.Aggregators.MIN_LONGS;
+import static org.apache.crunch.fn.Aggregators.MIN_N;
+import static org.apache.crunch.fn.Aggregators.STRING_CONCAT;
+import static org.apache.crunch.fn.Aggregators.SUM_BIGINTS;
+import static org.apache.crunch.fn.Aggregators.SUM_DOUBLES;
+import static org.apache.crunch.fn.Aggregators.SUM_FLOATS;
+import static org.apache.crunch.fn.Aggregators.SUM_INTS;
+import static org.apache.crunch.fn.Aggregators.SUM_LONGS;
+import static org.hamcrest.Matchers.closeTo;
+import static org.hamcrest.Matchers.is;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertThat;
+
+import java.math.BigInteger;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.crunch.Aggregator;
+import org.apache.crunch.CombineFn;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Tuple3;
+import org.apache.crunch.Tuple4;
+import org.apache.crunch.TupleN;
+import org.apache.crunch.impl.mem.emit.InMemoryEmitter;
+import org.junit.Test;
+
+import com.google.common.base.Function;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Iterables;
+
+
+public class AggregatorsTest {
+
+ @Test
+ public void testSums2() { // one SUM_* check per numeric type, several with a negative input
+ assertThat(sapply(SUM_INTS(), 1, 2, 3, -4), is(2));
+ assertThat(sapply(SUM_LONGS(), 1L, 2L, 3L, -4L, 5000000000L), is(5000000002L)); // expected total does not fit in an int
+ assertThat(sapply(SUM_FLOATS(), 1f, 2f, 3f, -4f), is(2f));
+ assertThat(sapply(SUM_DOUBLES(), 0.1, 0.2, 0.3), is(closeTo(0.6, 0.00001))); // compared with a tolerance: these decimals are inexact as doubles
+ assertThat(sapply(SUM_BIGINTS(), bigInt("7"), bigInt("3")), is(bigInt("10")));
+ }
+
+ @Test
+ public void testSums() { // the same three inputs (29, 17, 1729) summed with each SUM_* aggregator
+ assertThat(sapply(SUM_LONGS(), 29L, 17L, 1729L), is(1775L));
+ assertThat(sapply(SUM_LONGS(), 29L, 7L, 1729L), is(1765L)); // variant: middle input is 7 instead of 17
+ assertThat(sapply(SUM_INTS(), 29, 17, 1729), is(1775));
+ assertThat(sapply(SUM_FLOATS(), 29f, 17f, 1729f), is(1775.0f));
+ assertThat(sapply(SUM_DOUBLES(), 29.0, 17.0, 1729.0), is(1775.0));
+ assertThat(sapply(SUM_BIGINTS(), bigInt("29"), bigInt("17"), bigInt("1729")), is(bigInt("1775")));
+ }
+
+ @Test
+ public void testMax() { // each MAX_* aggregator must return the largest input
+ assertThat(sapply(MAX_LONGS(), 29L, 17L, 1729L), is(1729L));
+ assertThat(sapply(MAX_INTS(), 29, 17, 1729), is(1729));
+ assertThat(sapply(MAX_FLOATS(), 29f, 17f, 1729f), is(1729.0f));
+ assertThat(sapply(MAX_DOUBLES(), 29.0, 17.0, 1729.0), is(1729.0));
+ assertThat(sapply(MAX_FLOATS(), 29f, 1745f, 17f, 1729f), is(1745.0f)); // variant: the maximum is not the last input
+ assertThat(sapply(MAX_BIGINTS(), bigInt("29"), bigInt("17"), bigInt("1729")), is(bigInt("1729")));
+ }
+
+ @Test
+ public void testMin() { // each MIN_* aggregator must return the smallest input
+ assertThat(sapply(MIN_LONGS(), 29L, 17L, 1729L), is(17L));
+ assertThat(sapply(MIN_INTS(), 29, 17, 1729), is(17));
+ assertThat(sapply(MIN_FLOATS(), 29f, 17f, 1729f), is(17.0f));
+ assertThat(sapply(MIN_DOUBLES(), 29.0, 17.0, 1729.0), is(17.0));
+ assertThat(sapply(MIN_INTS(), 29, 170, 1729), is(29)); // variant: the minimum is the first input
+ assertThat(sapply(MIN_BIGINTS(), bigInt("29"), bigInt("17"), bigInt("1729")), is(bigInt("17")));
+ }
+
+ @Test
+ public void testMaxN() { // N largest values; the expected lists are written in ascending order
+ assertThat(apply(MAX_INTS(2), 17, 34, 98, 29, 1009), is(ImmutableList.of(98, 1009)));
+ assertThat(apply(MAX_N(1, String.class), "b", "a"), is(ImmutableList.of("b")));
+ assertThat(apply(MAX_N(3, String.class), "b", "a", "d", "c"), is(ImmutableList.of("b", "c", "d"))); // "a" is the one value dropped
+ }
+
+ @Test
+ public void testMinN() { // N smallest values; the expected lists are written in ascending order
+ assertThat(apply(MIN_INTS(2), 17, 34, 98, 29, 1009), is(ImmutableList.of(17, 29)));
+ assertThat(apply(MIN_N(1, String.class), "b", "a"), is(ImmutableList.of("a")));
+ assertThat(apply(MIN_N(3, String.class), "b", "a", "d", "c"), is(ImmutableList.of("a", "b", "c"))); // "d" is the one value dropped
+ }
+
+ @Test
+ public void testFirstN() { // FIRST_N(2) must yield the first two inputs, in input order
+ assertThat(apply(Aggregators.<Integer>FIRST_N(2), 17, 34, 98, 29, 1009), is(ImmutableList.of(17, 34)));
+ }
+
+ @Test
+ public void testLastN() { // LAST_N(2) must yield the last two inputs, in input order
+ assertThat(apply(Aggregators.<Integer>LAST_N(2), 17, 34, 98, 29, 1009), is(ImmutableList.of(29, 1009)));
+ }
+
+ /**
+  * UNIQUE_ELEMENTS must yield each distinct input; SAMPLE_UNIQUE_ELEMENTS(2)
+  * must yield exactly two values that differ from each other.
+  */
+ @Test
+ public void testUniqueElements() {
+ // Compared as sets, so the order of the aggregator output is irrelevant.
+ assertThat(ImmutableSet.copyOf(apply(Aggregators.<Integer>UNIQUE_ELEMENTS(), 17, 29, 29, 16, 17)),
+ is(ImmutableSet.of(17, 29, 16)));
+
+ Iterable<Integer> sampled = apply(Aggregators.<Integer>SAMPLE_UNIQUE_ELEMENTS(2), 17, 29, 16, 17, 29, 16);
+ assertThat(Iterables.size(sampled), is(2));
+ // Collapsing into a set must not shrink the sample, i.e. both values are distinct.
+ assertThat(ImmutableSet.copyOf(sampled).size(), is(2));
+ }
+
+ /** A pair aggregator sums the Long components and takes the minimum of the Double components. */
+ @Test
+ public void testPairs() {
+ Aggregator<Pair<Long, Double>> sumAndMin = Aggregators.pairAggregator(SUM_LONGS(), MIN_DOUBLES());
+ List<Pair<Long, Double>> rows = ImmutableList.of(Pair.of(1720L, 17.29), Pair.of(9L, -3.14));
+
+ assertThat(sapply(sumAndMin, rows), is(Pair.of(1729L, -3.14)));
+ }
+
+ /** A pair aggregator built from two SUM_LONGS() calls sums each component separately. */
+ @Test
+ public void testPairsTwoLongs() {
+ Aggregator<Pair<Long, Long>> sumBoth = Aggregators.pairAggregator(SUM_LONGS(), SUM_LONGS());
+ List<Pair<Long, Long>> rows = ImmutableList.of(Pair.of(1720L, 1L), Pair.of(9L, 19L));
+
+ assertThat(sapply(sumBoth, rows), is(Pair.of(1729L, 20L)));
+ }
+
+  /** A triple aggregator should apply max-float, max-double and min-double to its three components. */
+  @Test
+  public void testTrips() {
+    Aggregator<Tuple3<Float, Double, Double>> perComponent = Aggregators.tripAggregator(
+        MAX_FLOATS(), MAX_DOUBLES(), MIN_DOUBLES());
+    List<Tuple3<Float, Double, Double>> rows = ImmutableList.of(
+        Tuple3.of(17.29f, 12.2, 0.1),
+        Tuple3.of(3.0f, 1.2, 3.14),
+        Tuple3.of(-1.0f, 14.5, -0.98));
+
+    Tuple3<Float, Double, Double> combined = sapply(perComponent, rows);
+
+    assertThat(combined, is(Tuple3.of(17.29f, 14.5, -0.98)));
+  }
+
+  /** A quad aggregator should apply max-float, max-double, min-double and int-sum to its four components. */
+  @Test
+  public void testQuads() {
+    Aggregator<Tuple4<Float, Double, Double, Integer>> perComponent = Aggregators.quadAggregator(
+        MAX_FLOATS(), MAX_DOUBLES(), MIN_DOUBLES(), SUM_INTS());
+    List<Tuple4<Float, Double, Double, Integer>> rows = ImmutableList.of(
+        Tuple4.of(17.29f, 12.2, 0.1, 1),
+        Tuple4.of(3.0f, 1.2, 3.14, 2),
+        Tuple4.of(-1.0f, 14.5, -0.98, 3));
+
+    Tuple4<Float, Double, Double, Integer> combined = sapply(perComponent, rows);
+
+    assertThat(combined, is(Tuple4.of(17.29f, 14.5, -0.98, 6)));
+  }
+
+ @Test
+ public void testTupleN() {
+ List<TupleN> input = ImmutableList.of(new TupleN(1, 3.0, 1, 2.0, 4L), new TupleN(4, 17.0, 1, 9.7, 12L));
+ Aggregator<TupleN> a = Aggregators.tupleAggregator(
+ MIN_INTS(), SUM_DOUBLES(), MAX_INTS(), MIN_DOUBLES(), MAX_LONGS());
+
+ assertThat(sapply(a, input), is(new TupleN(1, 20.0, 1, 2.0, 12L)));
+ }
+
+  /**
+   * Exercises STRING_CONCAT with different separators, with null/empty inputs,
+   * and with the two extra numeric limits of the four-argument overload.
+   */
+  @Test
+  public void testConcatenation() {
+    ImmutableList<String> words = ImmutableList.of("foo", "foobar", "bar");
+
+    // Plain joining with an empty and a non-empty separator.
+    assertThat(sapply(STRING_CONCAT("", true), words), is("foofoobarbar"));
+    assertThat(sapply(STRING_CONCAT("/", false), words), is("foo/foobar/bar"));
+
+    // Blank, empty and null inputs when the boolean flag is true.
+    assertThat(sapply(STRING_CONCAT(" ", true), " ", ""), is(" "));
+    assertThat(sapply(STRING_CONCAT(" ", true), Arrays.asList(null, "")), is(""));
+
+    // NOTE(review): judging by the expected outputs, the two trailing ints cap the
+    // total output length and the length of each accepted input - confirm against
+    // the STRING_CONCAT documentation.
+    assertThat(sapply(STRING_CONCAT(" ", true, 20, 3), words), is("foo bar"));
+    assertThat(sapply(STRING_CONCAT(" ", true, 10, 6), words), is("foo foobar"));
+    assertThat(sapply(STRING_CONCAT(" ", true, 9, 6), words), is("foo bar"));
+  }
+
+  /** With the boolean flag set to false, a null input is expected to raise a NullPointerException. */
+  @Test(expected = NullPointerException.class)
+  public void testConcatenationNullException() {
+    Aggregator<String> concat = STRING_CONCAT(" ", false);
+
+    sapply(concat, Arrays.asList(null, ""));
+  }
+
+
+  /**
+   * Varargs convenience form: aggregates the given values and returns the
+   * single value the aggregator emitted.
+   */
+  private static <T> T sapply(Aggregator<T> a, T... values) {
+    ImmutableList<T> asList = ImmutableList.copyOf(values);
+    return sapply(a, asList);
+  }
+
+  /**
+   * Aggregates the given values and returns the single emitted value;
+   * {@code Iterables.getOnlyElement} fails if the aggregator emitted zero or
+   * several values.
+   */
+  private static <T> T sapply(Aggregator<T> a, Iterable<T> values) {
+    ImmutableList<T> emitted = apply(a, values);
+    return Iterables.getOnlyElement(emitted);
+  }
+
+  /** Varargs convenience form: aggregates the given values and returns everything emitted. */
+  private static <T> ImmutableList<T> apply(Aggregator<T> a, T... values) {
+    ImmutableList<T> asList = ImmutableList.copyOf(values);
+    return apply(a, asList);
+  }
+
+  /**
+   * Runs the aggregator, wrapped as a CombineFn, over the values twice under an
+   * empty-string key and returns the values emitted by the first run. Both runs
+   * must emit the same values, which guards against state leaking between groups.
+   */
+  private static <T> ImmutableList<T> apply(Aggregator<T> a, Iterable<T> values) {
+    CombineFn<String, T> combiner = Aggregators.toCombineFn(a);
+
+    InMemoryEmitter<Pair<String, T>> firstPass = new InMemoryEmitter<Pair<String, T>>();
+    combiner.process(Pair.of("", values), firstPass);
+
+    // Same function, same input, fresh emitter: checks that Aggregator.reset() works.
+    InMemoryEmitter<Pair<String, T>> secondPass = new InMemoryEmitter<Pair<String, T>>();
+    combiner.process(Pair.of("", values), secondPass);
+
+    ImmutableList<T> firstResult = getValues(firstPass);
+    ImmutableList<T> secondResult = getValues(secondPass);
+    assertEquals(firstResult, secondResult);
+
+    return firstResult;
+  }
+
+  /** Returns the second element of every pair captured by the emitter, in emission order. */
+  private static <K, V> ImmutableList<V> getValues(InMemoryEmitter<Pair<K, V>> emitter) {
+    Function<Pair<K, V>, V> takeSecond = new Function<Pair<K, V>, V>() {
+      @Override
+      public V apply(Pair<K, V> input) {
+        return input.second();
+      }
+    };
+    return ImmutableList.copyOf(Iterables.transform(emitter.getOutput(), takeSecond));
+  }
+
+  /** Shorthand for building a BigInteger from its decimal string form. */
+  private static BigInteger bigInt(String value) {
+    BigInteger parsed = new BigInteger(value);
+    return parsed;
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/fn/ExtractKeyFnTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/fn/ExtractKeyFnTest.java b/crunch-core/src/test/java/org/apache/crunch/fn/ExtractKeyFnTest.java
new file mode 100644
index 0000000..b5b2a1b
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/fn/ExtractKeyFnTest.java
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.fn;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.crunch.MapFn;
+import org.apache.crunch.Pair;
+import org.junit.Test;
+
+/**
+ * Checks that ExtractKeyFn pairs each input with the key computed by the
+ * supplied MapFn.
+ */
+@SuppressWarnings("serial")
+public class ExtractKeyFnTest {
+
+  /** Key function used by the test: the key is the input string's hash code. */
+  protected static final MapFn<String, Integer> mapFn = new MapFn<String, Integer>() {
+    @Override
+    public Integer map(String input) {
+      return Integer.valueOf(input.hashCode());
+    }
+  };
+
+  /** The function under test, keyed by {@link #mapFn}. */
+  protected static final ExtractKeyFn<Integer, String> one = new ExtractKeyFn<Integer, String>(mapFn);
+
+  @Test
+  public void test() {
+    final String value = "boza";
+    StoreLastEmitter<Pair<Integer, String>> emitter = StoreLastEmitter.create();
+
+    one.process(value, emitter);
+
+    // The emitted pair carries the computed key first and the untouched input second.
+    Pair<Integer, String> expected = Pair.of(value.hashCode(), value);
+    assertEquals(expected, emitter.getLast());
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/fn/FilterFnTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/fn/FilterFnTest.java b/crunch-core/src/test/java/org/apache/crunch/fn/FilterFnTest.java
new file mode 100644
index 0000000..a649f99
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/fn/FilterFnTest.java
@@ -0,0 +1,85 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.fn;
+
+import static org.hamcrest.Matchers.is;
+import static org.junit.Assert.assertThat;
+
+import org.apache.crunch.FilterFn;
+import org.junit.Test;
+
+import com.google.common.base.Predicates;
+
+
+/**
+ * Truth-table style tests for the constant filters and the boolean combinators
+ * (and / or / not) offered by FilterFns.
+ */
+public class FilterFnTest {
+
+  /** Filter that accepts every input. */
+  private static final FilterFn<String> TRUE = FilterFns.<String>ACCEPT_ALL();
+  /** Filter that rejects every input. */
+  private static final FilterFn<String> FALSE = FilterFns.<String>REJECT_ALL();
+
+  @Test
+  public void testAcceptAll() {
+    assertThat(TRUE.accept(""), is(true));
+    assertThat(TRUE.accept("foo"), is(true));
+  }
+
+  @Test
+  public void testRejectAll() {
+    assertThat(FALSE.accept(""), is(false));
+    assertThat(FALSE.accept("foo"), is(false));
+  }
+
+  /** Two-argument conjunction: accepts only when both operands accept. */
+  @Test
+  public void testAnd() {
+    assertThat(FilterFns.and(TRUE, TRUE).accept("foo"), is(true));
+    assertThat(FilterFns.and(TRUE, FALSE).accept("foo"), is(false));
+    assertThat(FilterFns.and(FALSE, TRUE).accept("foo"), is(false));
+    assertThat(FilterFns.and(FALSE, FALSE).accept("foo"), is(false));
+  }
+
+  /** Conjunction over one, three and four operands (the varargs counterpart of testAnd). */
+  @Test
+  @SuppressWarnings("unchecked")
+  public void testGeneric() {
+    assertThat(FilterFns.and(TRUE).accept("foo"), is(true));
+    assertThat(FilterFns.and(FALSE).accept("foo"), is(false));
+    // Without an all-true case, an implementation that always rejected would pass.
+    assertThat(FilterFns.and(TRUE, TRUE, TRUE).accept("foo"), is(true));
+    assertThat(FilterFns.and(FALSE, FALSE, FALSE).accept("foo"), is(false));
+    assertThat(FilterFns.and(TRUE, TRUE, FALSE).accept("foo"), is(false));
+    assertThat(FilterFns.and(FALSE, FALSE, FALSE, FALSE).accept("foo"), is(false));
+  }
+
+  /** Two-argument disjunction: rejects only when both operands reject. */
+  @Test
+  public void testOr() {
+    assertThat(FilterFns.or(TRUE, TRUE).accept("foo"), is(true));
+    assertThat(FilterFns.or(FALSE, TRUE).accept("foo"), is(true));
+    assertThat(FilterFns.or(TRUE, FALSE).accept("foo"), is(true));
+    assertThat(FilterFns.or(FALSE, FALSE).accept("foo"), is(false));
+  }
+
+  /** Disjunction over one and three operands. */
+  @Test
+  @SuppressWarnings("unchecked")
+  public void testOrGeneric() {
+    assertThat(FilterFns.or(TRUE).accept("foo"), is(true));
+    assertThat(FilterFns.or(FALSE).accept("foo"), is(false));
+    assertThat(FilterFns.or(TRUE, FALSE, TRUE).accept("foo"), is(true));
+    assertThat(FilterFns.or(FALSE, FALSE, TRUE).accept("foo"), is(true));
+    assertThat(FilterFns.or(FALSE, FALSE, FALSE).accept("foo"), is(false));
+  }
+
+  /** Negation flips the decision of the wrapped filter. */
+  @Test
+  public void testNot() {
+    assertThat(FilterFns.not(TRUE).accept("foo"), is(false));
+    assertThat(FilterFns.not(FALSE).accept("foo"), is(true));
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/fn/MapKeysTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/fn/MapKeysTest.java b/crunch-core/src/test/java/org/apache/crunch/fn/MapKeysTest.java
new file mode 100644
index 0000000..6b73700
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/fn/MapKeysTest.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.fn;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.crunch.Pair;
+import org.junit.Test;
+
+/**
+ * Checks that a MapKeysFn replaces the key of each pair with the mapped key
+ * and passes the value through unchanged.
+ */
+@SuppressWarnings("serial")
+public class MapKeysTest {
+
+  /** Maps every key to the constant 1. */
+  protected static final MapKeysFn<String, Integer, Integer> one = new MapKeysFn<String, Integer, Integer>() {
+    @Override
+    public Integer map(String input) {
+      return Integer.valueOf(1);
+    }
+  };
+
+  /** Maps every key to the constant 2. */
+  protected static final MapKeysFn<String, Integer, Integer> two = new MapKeysFn<String, Integer, Integer>() {
+    @Override
+    public Integer map(String input) {
+      return Integer.valueOf(2);
+    }
+  };
+
+  @Test
+  public void test() {
+    Pair<String, Integer> entry = Pair.of("k", Integer.MAX_VALUE);
+    StoreLastEmitter<Pair<Integer, Integer>> emitter = StoreLastEmitter.create();
+
+    one.process(entry, emitter);
+    Pair<Integer, Integer> afterOne = emitter.getLast();
+    assertEquals(Pair.of(1, Integer.MAX_VALUE), afterOne);
+
+    // The same emitter is reused, so the second emission replaces the first.
+    two.process(entry, emitter);
+    Pair<Integer, Integer> afterTwo = emitter.getLast();
+    assertEquals(Pair.of(2, Integer.MAX_VALUE), afterTwo);
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/fn/MapValuesTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/fn/MapValuesTest.java b/crunch-core/src/test/java/org/apache/crunch/fn/MapValuesTest.java
new file mode 100644
index 0000000..097b008
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/fn/MapValuesTest.java
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.fn;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.crunch.Pair;
+import org.junit.Test;
+
+/**
+ * Checks that a MapValuesFn replaces the value of each pair with the mapped
+ * value and passes the key through unchanged.
+ */
+@SuppressWarnings("serial")
+public class MapValuesTest {
+
+  /** Maps every value to the constant 1. */
+  static final MapValuesFn<String, String, Integer> one = new MapValuesFn<String, String, Integer>() {
+    @Override
+    public Integer map(String input) {
+      return Integer.valueOf(1);
+    }
+  };
+
+  /** Maps every value to the constant 2. */
+  static final MapValuesFn<String, String, Integer> two = new MapValuesFn<String, String, Integer>() {
+    @Override
+    public Integer map(String input) {
+      return Integer.valueOf(2);
+    }
+  };
+
+  @Test
+  public void test() {
+    Pair<String, String> entry = Pair.of("k", "v");
+    StoreLastEmitter<Pair<String, Integer>> emitter = StoreLastEmitter.create();
+
+    one.process(entry, emitter);
+    Pair<String, Integer> afterOne = emitter.getLast();
+    assertEquals(Pair.of("k", 1), afterOne);
+
+    // The same emitter is reused, so the second emission replaces the first.
+    two.process(entry, emitter);
+    Pair<String, Integer> afterTwo = emitter.getLast();
+    assertEquals(Pair.of("k", 2), afterTwo);
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/fn/PairMapTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/fn/PairMapTest.java b/crunch-core/src/test/java/org/apache/crunch/fn/PairMapTest.java
new file mode 100644
index 0000000..bef6c85
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/fn/PairMapTest.java
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.fn;
+
+import static org.junit.Assert.assertTrue;
+
+import org.apache.crunch.MapFn;
+import org.apache.crunch.Pair;
+import org.junit.Test;
+
+/**
+ * Checks that PairMapFn applies its first function to the first element of a
+ * pair and its second function to the second element.
+ */
+@SuppressWarnings("serial")
+public class PairMapTest {
+
+  /** Maps every input to the constant 1. */
+  static final MapFn<String, Integer> one = new MapFn<String, Integer>() {
+    @Override
+    public Integer map(String input) {
+      return 1;
+    }
+  };
+
+  /** Maps every input to the constant 2. */
+  static final MapFn<String, Integer> two = new MapFn<String, Integer>() {
+    @Override
+    public Integer map(String input) {
+      return 2;
+    }
+  };
+
+  @Test
+  public void testPairMap() {
+    StoreLastEmitter<Pair<Integer, Integer>> emitter = StoreLastEmitter.create();
+    PairMapFn<String, String, Integer, Integer> fn = new PairMapFn<String, String, Integer, Integer>(one, two);
+
+    fn.process(Pair.of("a", "b"), emitter);
+
+    Pair<Integer, Integer> pair = emitter.getLast();
+    // Null-safe comparisons with messages: a missing emission or a null element
+    // is reported as an assertion failure showing the actual value, rather than
+    // as a NullPointerException from unboxing.
+    assertTrue("PairMapFn did not emit anything", pair != null);
+    assertTrue("expected first element 1 but was " + pair.first(),
+        Integer.valueOf(1).equals(pair.first()));
+    assertTrue("expected second element 2 but was " + pair.second(),
+        Integer.valueOf(2).equals(pair.second()));
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/fn/StoreLastEmitter.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/fn/StoreLastEmitter.java b/crunch-core/src/test/java/org/apache/crunch/fn/StoreLastEmitter.java
new file mode 100644
index 0000000..cdd8754
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/fn/StoreLastEmitter.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.fn;
+
+import org.apache.crunch.Emitter;
+
+/**
+ * Test-only Emitter that remembers just the most recently emitted value;
+ * each new emission overwrites the previous one.
+ */
+class StoreLastEmitter<T> implements Emitter<T> {
+  /** Most recent value passed to {@link #emit}; null until the first emission. */
+  private T lastEmitted;
+
+  /** Factory method that lets callers rely on type inference instead of spelling out T. */
+  public static <T> StoreLastEmitter<T> create() {
+    return new StoreLastEmitter<T>();
+  }
+
+  @Override
+  public void emit(T emitted) {
+    this.lastEmitted = emitted;
+  }
+
+  @Override
+  public void flush() {
+    // Nothing is buffered, so there is nothing to flush.
+  }
+
+  /** Returns the value from the latest {@link #emit} call, or null if none happened. */
+  public T getLast() {
+    return this.lastEmitted;
+  }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/impl/SingleUseIterableTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/impl/SingleUseIterableTest.java b/crunch-core/src/test/java/org/apache/crunch/impl/SingleUseIterableTest.java
new file mode 100644
index 0000000..811a0a3
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/impl/SingleUseIterableTest.java
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.List;
+
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Tests that a SingleUseIterable can be traversed once and refuses a second
+ * traversal.
+ */
+public class SingleUseIterableTest {
+
+  /** A single traversal must yield exactly the wrapped values, in order. */
+  @Test
+  public void testIterator() {
+    List<Integer> values = Lists.newArrayList(1, 2, 3);
+
+    SingleUseIterable<Integer> iterable = new SingleUseIterable<Integer>(values);
+
+    List<Integer> retrievedValues = Lists.newArrayList(iterable);
+
+    assertEquals(values, retrievedValues);
+  }
+
+  /**
+   * The first traversal must succeed; the second must fail with an
+   * IllegalStateException. The exception is caught explicitly rather than via
+   * {@code @Test(expected = ...)} so that a failure during the FIRST traversal
+   * cannot make this test pass by accident.
+   */
+  @Test
+  public void testIterator_MultipleCalls() {
+    List<Integer> values = Lists.newArrayList(1, 2, 3);
+
+    SingleUseIterable<Integer> iterable = new SingleUseIterable<Integer>(values);
+
+    // First (legal) traversal.
+    assertEquals(values, Lists.newArrayList(iterable));
+
+    try {
+      // Second traversal: requesting and draining another iterator must be rejected.
+      Lists.newArrayList(iterable);
+    } catch (IllegalStateException expected) {
+      return;
+    }
+    throw new AssertionError("Expected IllegalStateException when iterating a second time");
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/impl/mr/MRPipelineTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/impl/mr/MRPipelineTest.java b/crunch-core/src/test/java/org/apache/crunch/impl/mr/MRPipelineTest.java
new file mode 100644
index 0000000..9ed7a46
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/impl/mr/MRPipelineTest.java
@@ -0,0 +1,86 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr;
+
+import static org.junit.Assert.assertEquals;
+import static org.mockito.Mockito.doReturn;
+import static org.mockito.Mockito.spy;
+import static org.mockito.Mockito.when;
+
+import java.io.IOException;
+
+import org.apache.crunch.SourceTarget;
+import org.apache.crunch.impl.mr.collect.PCollectionImpl;
+import org.apache.crunch.impl.mr.run.RuntimeParameters;
+import org.apache.crunch.io.ReadableSourceTarget;
+import org.apache.crunch.types.avro.Avros;
+import org.apache.hadoop.conf.Configuration;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.junit.runner.RunWith;
+import org.mockito.Mock;
+import org.mockito.runners.MockitoJUnitRunner;
+
+
+/**
+ * Tests MRPipeline.getMaterializeSourceTarget against a spied pipeline and a
+ * mocked PCollection, covering the already-materialized case and the two
+ * outcomes of creating a new intermediate output.
+ */
+@RunWith(MockitoJUnitRunner.class)
+public class MRPipelineTest {
+  /** Scratch directory handed to the pipeline as its temporary directory. */
+  @Rule
+  public TemporaryFolder tempDir = new TemporaryFolder();
+
+  @Mock
+  private PCollectionImpl<String> pcollection;
+  @Mock
+  private ReadableSourceTarget<String> readableSourceTarget;
+  @Mock
+  private SourceTarget<String> nonReadableSourceTarget;
+
+  /** Spy around a real MRPipeline so createIntermediateOutput can be stubbed. */
+  private MRPipeline pipeline;
+
+  @Before
+  public void setUp() throws IOException {
+    String tmpPath = tempDir.getRoot().getAbsolutePath();
+
+    Configuration conf = new Configuration();
+    conf.set(RuntimeParameters.TMP_DIR, tmpPath);
+
+    MRPipeline realPipeline = new MRPipeline(MRPipelineTest.class, conf);
+    pipeline = spy(realPipeline);
+  }
+
+  /** When the collection already reports a materialization target, that target is returned. */
+  @Test
+  public void testGetMaterializeSourceTarget_AlreadyMaterialized() {
+    when(pcollection.getMaterializedAt()).thenReturn(readableSourceTarget);
+
+    SourceTarget<String> actual = pipeline.getMaterializeSourceTarget(pcollection);
+
+    assertEquals(readableSourceTarget, actual);
+  }
+
+  /** With no existing target, the readable intermediate output created by the pipeline is returned. */
+  @Test
+  public void testGetMaterializeSourceTarget_NotMaterialized_HasOutput() {
+    when(pcollection.getMaterializedAt()).thenReturn(null);
+    when(pcollection.getPType()).thenReturn(Avros.strings());
+    doReturn(readableSourceTarget).when(pipeline).createIntermediateOutput(Avros.strings());
+
+    SourceTarget<String> actual = pipeline.getMaterializeSourceTarget(pcollection);
+
+    assertEquals(readableSourceTarget, actual);
+  }
+
+  /** With no existing target, an intermediate output that is not readable must be rejected. */
+  @Test(expected = IllegalArgumentException.class)
+  public void testGetMaterializeSourceTarget_NotMaterialized_NotReadableSourceTarget() {
+    when(pcollection.getMaterializedAt()).thenReturn(null);
+    when(pcollection.getPType()).thenReturn(Avros.strings());
+    doReturn(nonReadableSourceTarget).when(pipeline).createIntermediateOutput(Avros.strings());
+
+    pipeline.getMaterializeSourceTarget(pcollection);
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/impl/mr/collect/DoCollectionImplTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/impl/mr/collect/DoCollectionImplTest.java b/crunch-core/src/test/java/org/apache/crunch/impl/mr/collect/DoCollectionImplTest.java
new file mode 100644
index 0000000..fd582bc
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/impl/mr/collect/DoCollectionImplTest.java
@@ -0,0 +1,112 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.collect;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.List;
+
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.impl.mr.plan.DoNode;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.writable.Writables;
+import org.junit.Test;
+
+/**
+ * Tests that DoCollectionImpl.getSizeInternal() reports the parent's size
+ * multiplied by the DoFn's scale factor.
+ */
+public class DoCollectionImplTest {
+
+  @Test
+  public void testGetSizeInternal_NoScaleFactor() {
+    runScaleTest(100L, 1.0f, 100L);
+  }
+
+  // Despite the "BelowZero" in the name, the factor exercised here is 0.5,
+  // i.e. below one: the expected size shrinks.
+  @Test
+  public void testGetSizeInternal_ScaleFactorBelowZero() {
+    runScaleTest(100L, 0.5f, 50L);
+  }
+
+  // Despite the "AboveZero" in the name, the factor exercised here is 1.5,
+  // i.e. above one: the expected size grows.
+  @Test
+  public void testGetSizeInternal_ScaleFactorAboveZero() {
+    runScaleTest(100L, 1.5f, 150L);
+  }
+
+  /**
+   * Builds a DoCollectionImpl on top of a parent of known size and checks the
+   * size it reports.
+   *
+   * @param inputSize size reported by the parent collection
+   * @param scaleFactor scale factor returned by the DoFn
+   * @param expectedScaledSize size the DoCollectionImpl is expected to report
+   */
+  private void runScaleTest(long inputSize, float scaleFactor, long expectedScaledSize) {
+    PCollectionImpl<String> parentCollection = new SizedPCollectionImpl("Sized collection", inputSize);
+
+    DoCollectionImpl<String> doCollectionImpl = new DoCollectionImpl<String>("Scaled collection", parentCollection,
+        new ScaledFunction(scaleFactor), Writables.strings());
+
+    assertEquals(expectedScaledSize, doCollectionImpl.getSizeInternal());
+  }
+
+  /** Identity DoFn whose only interesting property is its configurable scale factor. */
+  static class ScaledFunction extends DoFn<String, String> {
+
+    // Set once in the constructor and never reassigned.
+    private final float scaleFactor;
+
+    public ScaledFunction(float scaleFactor) {
+      this.scaleFactor = scaleFactor;
+    }
+
+    @Override
+    public void process(String input, Emitter<String> emitter) {
+      emitter.emit(input);
+    }
+
+    @Override
+    public float scaleFactor() {
+      return scaleFactor;
+    }
+
+  }
+
+  /**
+   * Minimal PCollectionImpl stub that only knows its own size; every other
+   * overridden method returns null or does nothing.
+   */
+  static class SizedPCollectionImpl extends PCollectionImpl<String> {
+
+    // Set once in the constructor and never reassigned.
+    private final long internalSize;
+
+    public SizedPCollectionImpl(String name, long internalSize) {
+      super(name);
+      this.internalSize = internalSize;
+    }
+
+    @Override
+    public PType getPType() {
+      return null;
+    }
+
+    @Override
+    public DoNode createDoNode() {
+      return null;
+    }
+
+    @Override
+    public List getParents() {
+      return null;
+    }
+
+    @Override
+    protected void acceptInternal(Visitor visitor) {
+    }
+
+    @Override
+    protected long getSizeInternal() {
+      return internalSize;
+    }
+
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/impl/mr/collect/DoTableImplTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/impl/mr/collect/DoTableImplTest.java b/crunch-core/src/test/java/org/apache/crunch/impl/mr/collect/DoTableImplTest.java
new file mode 100644
index 0000000..89b9944
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/impl/mr/collect/DoTableImplTest.java
@@ -0,0 +1,86 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.collect;
+
+import static org.apache.crunch.types.writable.Writables.strings;
+import static org.apache.crunch.types.writable.Writables.tableOf;
+import static org.junit.Assert.assertEquals;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.verifyNoMoreInteractions;
+import static org.mockito.Mockito.when;
+
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.Pair;
+import org.junit.Test;
+
+/**
+ * Tests that DoTableImpl.getSizeInternal() reports the parent's size
+ * multiplied by the DoFn's scale factor, and that it reads nothing from the
+ * parent other than its size.
+ */
+public class DoTableImplTest {
+
+  @Test
+  public void testGetSizeInternal_NoScaleFactor() {
+    runScaleTest(100L, 1.0f, 100L);
+  }
+
+  // Despite the "BelowZero" in the name, the factor exercised here is 0.5,
+  // i.e. below one: the expected size shrinks.
+  @Test
+  public void testGetSizeInternal_ScaleFactorBelowZero() {
+    runScaleTest(100L, 0.5f, 50L);
+  }
+
+  // Despite the "AboveZero" in the name, the factor exercised here is 1.5,
+  // i.e. above one: the expected size grows.
+  @Test
+  public void testGetSizeInternal_ScaleFactorAboveZero() {
+    runScaleTest(100L, 1.5f, 150L);
+  }
+
+  /**
+   * Builds a DoTableImpl on top of a mocked parent of known size and checks
+   * the size it reports.
+   *
+   * @param inputSize size reported by the mocked parent collection
+   * @param scaleFactor scale factor returned by the DoFn
+   * @param expectedScaledSize size the DoTableImpl is expected to report
+   */
+  private void runScaleTest(long inputSize, float scaleFactor, long expectedScaledSize) {
+
+    @SuppressWarnings("unchecked")
+    PCollectionImpl<String> parentCollection = (PCollectionImpl<String>) mock(PCollectionImpl.class);
+
+    when(parentCollection.getSize()).thenReturn(inputSize);
+
+    DoTableImpl<String, String> doTableImpl = new DoTableImpl<String, String>("Scalled table collection",
+        parentCollection, new TableScaledFunction(scaleFactor), tableOf(strings(), strings()));
+
+    assertEquals(expectedScaledSize, doTableImpl.getSizeInternal());
+
+    // getSize() must have been the only call made on the parent.
+    verify(parentCollection).getSize();
+
+    verifyNoMoreInteractions(parentCollection);
+  }
+
+  /** DoFn that emits (input, input) pairs and reports a configurable scale factor. */
+  static class TableScaledFunction extends DoFn<String, Pair<String, String>> {
+
+    // Set once in the constructor and never reassigned.
+    private final float scaleFactor;
+
+    public TableScaledFunction(float scaleFactor) {
+      this.scaleFactor = scaleFactor;
+    }
+
+    @Override
+    public float scaleFactor() {
+      return scaleFactor;
+    }
+
+    @Override
+    public void process(String input, Emitter<Pair<String, String>> emitter) {
+      emitter.emit(Pair.of(input, input));
+
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/impl/mr/emit/IntermediateEmitterTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/impl/mr/emit/IntermediateEmitterTest.java b/crunch-core/src/test/java/org/apache/crunch/impl/mr/emit/IntermediateEmitterTest.java
new file mode 100644
index 0000000..dd72364
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/impl/mr/emit/IntermediateEmitterTest.java
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.emit;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotSame;
+import static org.junit.Assert.assertSame;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.spy;
+import static org.mockito.Mockito.verify;
+
+import org.apache.crunch.impl.mr.run.RTNode;
+import org.apache.crunch.test.StringWrapper;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.avro.Avros;
+import org.apache.hadoop.conf.Configuration;
+import org.junit.Before;
+import org.junit.Test;
+import org.mockito.ArgumentCaptor;
+
+import com.google.common.collect.Lists;
+
+public class IntermediateEmitterTest {
+
+ private StringWrapper payload;
+ private PType outputType;
+
+ @Before
+ public void setUp() {
+ outputType = spy(Avros.reflects(StringWrapper.class));
+ payload = new StringWrapper("test");
+ }
+
+ @Test
+ public void testEmit_SingleChild() {
+ RTNode onlyChild = mock(RTNode.class);
+ ArgumentCaptor<StringWrapper> received = ArgumentCaptor.forClass(StringWrapper.class);
+
+ new IntermediateEmitter(outputType, Lists.newArrayList(onlyChild), new Configuration()).emit(payload);
+
+ // With a single consumer the very same instance is expected to be handed over.
+ verify(onlyChild).process(received.capture());
+ assertSame(payload, received.getValue());
+ }
+
+ @Test
+ public void testEmit_MultipleChildren() {
+ RTNode first = mock(RTNode.class);
+ RTNode second = mock(RTNode.class);
+ ArgumentCaptor<StringWrapper> firstCaptor = ArgumentCaptor.forClass(StringWrapper.class);
+ ArgumentCaptor<StringWrapper> secondCaptor = ArgumentCaptor.forClass(StringWrapper.class);
+
+ new IntermediateEmitter(outputType, Lists.newArrayList(first, second), new Configuration()).emit(payload);
+
+ verify(first).process(firstCaptor.capture());
+ verify(second).process(secondCaptor.capture());
+ StringWrapper firstValue = firstCaptor.getValue();
+ StringWrapper secondValue = secondCaptor.getValue();
+
+ // Every child must receive a value equal to the emitted one ...
+ assertEquals(payload, firstValue);
+ assertEquals(payload, secondValue);
+ // ... but as a separate copy rather than the emitted instance itself.
+ assertNotSame(payload, firstValue);
+ assertNotSame(payload, secondValue);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/impl/mr/exec/CappedExponentialCounterTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/impl/mr/exec/CappedExponentialCounterTest.java b/crunch-core/src/test/java/org/apache/crunch/impl/mr/exec/CappedExponentialCounterTest.java
new file mode 100644
index 0000000..958df12
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/impl/mr/exec/CappedExponentialCounterTest.java
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.exec;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.Test;
+
+public class CappedExponentialCounterTest {
+ @Test
+ public void testGet() {
+ // With Long.MAX_VALUE as the limit, successive reads are expected to double.
+ CappedExponentialCounter counter = new CappedExponentialCounter(1L, Long.MAX_VALUE);
+ for (long expected : new long[] { 1L, 2L, 4L, 8L }) {
+ assertEquals(expected, counter.get());
+ }
+ }
+
+ @Test
+ public void testCap() {
+ // Once the limit of 2 is reached, further reads are expected to stay there.
+ CappedExponentialCounter counter = new CappedExponentialCounter(1L, 2);
+ for (long expected : new long[] { 1L, 2L, 2L }) {
+ assertEquals(expected, counter.get());
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/impl/mr/exec/CrunchJobHooksTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/impl/mr/exec/CrunchJobHooksTest.java b/crunch-core/src/test/java/org/apache/crunch/impl/mr/exec/CrunchJobHooksTest.java
new file mode 100644
index 0000000..f03c3e2
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/impl/mr/exec/CrunchJobHooksTest.java
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.exec;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.Test;
+
+public class CrunchJobHooksTest {
+ // Exercises CrunchJobHooks.extractPartitionNumber() with "-r-" names, a suffixed name and an "-m-" name.
+ @Test
+ public void testExtractPartitionNumber() {
+ assertEquals(0, CrunchJobHooks.extractPartitionNumber("out1-r-00000"));
+ assertEquals(10, CrunchJobHooks.extractPartitionNumber("out2-r-00010"));
+ assertEquals(99999, CrunchJobHooks.extractPartitionNumber("out3-r-99999"));
+ }
+
+ @Test
+ public void testExtractPartitionNumber_WithSuffix() {
+ assertEquals(10, CrunchJobHooks.extractPartitionNumber("out2-r-00010.avro")); // extension is not part of the number
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testExtractPartitionNumber_MapOutputFile() {
+ CrunchJobHooks.extractPartitionNumber("out1-m-00000"); // "-m-" name must be rejected, not parsed
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/impl/mr/plan/DotfileWriterTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/impl/mr/plan/DotfileWriterTest.java b/crunch-core/src/test/java/org/apache/crunch/impl/mr/plan/DotfileWriterTest.java
new file mode 100644
index 0000000..562238d
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/impl/mr/plan/DotfileWriterTest.java
@@ -0,0 +1,132 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.plan;
+
+import static org.junit.Assert.assertEquals;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+import java.util.List;
+
+import org.apache.crunch.Source;
+import org.apache.crunch.Target;
+import org.apache.crunch.impl.mr.collect.InputCollection;
+import org.apache.crunch.impl.mr.collect.PCollectionImpl;
+import org.apache.crunch.impl.mr.plan.DotfileWriter.MRTaskType;
+import org.junit.Before;
+import org.junit.Test;
+import org.mockito.Mockito;
+
+import com.google.common.collect.Lists;
+
+public class DotfileWriterTest {
+ // Checks the strings produced by DotfileWriter's formatting methods; collaborators are Mockito mocks.
+ private DotfileWriter dotfileWriter;
+
+ @Before
+ public void setUp() {
+ dotfileWriter = new DotfileWriter();
+ }
+
+ @Test
+ public void testFormatPCollectionNodeDeclaration() {
+ PCollectionImpl<?> pcollectionImpl = mock(PCollectionImpl.class);
+ JobPrototype jobPrototype = mock(JobPrototype.class);
+ when(pcollectionImpl.getName()).thenReturn("collection");
+ // Expected id is "name@<collection hashCode>@<job hashCode>", labelled with the name and drawn as a box.
+ assertEquals("\"collection@" + pcollectionImpl.hashCode() + "@" + jobPrototype.hashCode()
+ + "\" [label=\"collection\" shape=box];",
+ dotfileWriter.formatPCollectionNodeDeclaration(pcollectionImpl, jobPrototype));
+ }
+
+ @Test
+ public void testFormatPCollectionNodeDeclaration_InputPCollection() {
+ InputCollection<?> inputCollection = mock(InputCollection.class, Mockito.RETURNS_DEEP_STUBS);
+ JobPrototype jobPrototype = mock(JobPrototype.class);
+ when(inputCollection.getName()).thenReturn("input");
+ when(inputCollection.getSource().toString()).thenReturn("source");
+ // Input collections are expected to use the source's toString() as id and a folder shape.
+ assertEquals("\"source\" [label=\"input\" shape=folder];",
+ dotfileWriter.formatPCollectionNodeDeclaration(inputCollection, jobPrototype));
+ }
+
+ @Test
+ public void testFormatTargetNodeDeclaration() {
+ Target target = mock(Target.class);
+ when(target.toString()).thenReturn("target/path");
+ // The target's toString() is expected to serve as both node id and label.
+ assertEquals("\"target/path\" [label=\"target/path\" shape=folder];",
+ dotfileWriter.formatTargetNodeDeclaration(target));
+ }
+
+ @Test
+ public void testFormatPCollection() {
+ PCollectionImpl<?> pcollectionImpl = mock(PCollectionImpl.class);
+ JobPrototype jobPrototype = mock(JobPrototype.class);
+ when(pcollectionImpl.getName()).thenReturn("collection");
+ // Same quoted id as in the node declaration test, without the attribute list.
+ assertEquals("\"collection@" + pcollectionImpl.hashCode() + "@" + jobPrototype.hashCode() + "\"",
+ dotfileWriter.formatPCollection(pcollectionImpl, jobPrototype));
+ }
+
+ @Test
+ public void testFormatPCollection_InputCollection() {
+ InputCollection<Object> inputCollection = mock(InputCollection.class);
+ Source<Object> source = mock(Source.class);
+ JobPrototype jobPrototype = mock(JobPrototype.class);
+ when(source.toString()).thenReturn("mocksource");
+ when(inputCollection.getSource()).thenReturn(source);
+ // An input collection is expected to be referenced by its quoted source string alone.
+ assertEquals("\"mocksource\"", dotfileWriter.formatPCollection(inputCollection, jobPrototype));
+ }
+
+ @Test
+ public void testFormatNodeCollection() {
+ List<String> nodeCollection = Lists.newArrayList("one", "two", "three");
+ assertEquals("one -> two -> three;", dotfileWriter.formatNodeCollection(nodeCollection));
+ }
+
+ @Test
+ public void testFormatNodePath() {
+ PCollectionImpl<?> tail = mock(PCollectionImpl.class);
+ PCollectionImpl<?> head = mock(PCollectionImpl.class);
+ JobPrototype jobPrototype = mock(JobPrototype.class);
+
+ when(tail.getName()).thenReturn("tail");
+ when(head.getName()).thenReturn("head");
+
+ NodePath nodePath = new NodePath(tail);
+ nodePath.close(head);
+ // The expected edge runs from the node passed to close() to the node given to the constructor.
+ assertEquals(
+ Lists.newArrayList("\"head@" + head.hashCode() + "@" + jobPrototype.hashCode() + "\" -> \"tail@"
+ + tail.hashCode() + "@" + jobPrototype.hashCode() + "\";"),
+ dotfileWriter.formatNodePath(nodePath, jobPrototype));
+ }
+
+ @Test
+ public void testGetTaskGraphAttributes_Map() {
+ assertEquals("label = Map; color = blue;", dotfileWriter.getTaskGraphAttributes(MRTaskType.MAP));
+ }
+
+ @Test
+ public void testGetTaskGraphAttributes_Reduce() {
+ assertEquals("label = Reduce; color = red;", dotfileWriter.getTaskGraphAttributes(MRTaskType.REDUCE));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/impl/mr/plan/JobNameBuilderTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/impl/mr/plan/JobNameBuilderTest.java b/crunch-core/src/test/java/org/apache/crunch/impl/mr/plan/JobNameBuilderTest.java
new file mode 100644
index 0000000..7963c83
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/impl/mr/plan/JobNameBuilderTest.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.plan;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.crunch.types.writable.Writables;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+
+public class JobNameBuilderTest {
+
+ @Test
+ public void testBuild() {
+ String pipeline = "PipelineName";
+ String node = "outputNode";
+
+ // After visiting a single output node the built name is expected to read "<pipeline>: <node>".
+ JobNameBuilder builder = new JobNameBuilder(pipeline);
+ builder.visit(Lists.newArrayList(DoNode.createOutputNode(node, Writables.strings())));
+
+ assertEquals(pipeline + ": " + node, builder.build());
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/io/SequentialFileNamingSchemeTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/io/SequentialFileNamingSchemeTest.java b/crunch-core/src/test/java/org/apache/crunch/io/SequentialFileNamingSchemeTest.java
new file mode 100644
index 0000000..467da15
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/io/SequentialFileNamingSchemeTest.java
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+public class SequentialFileNamingSchemeTest {
+
+ // Arbitrary partition id handed to getReduceOutputName(); none of the expected
+ // file names below contains it.
+ private static final int PARTITION_ID = 42;
+
+ private SequentialFileNamingScheme namingScheme;
+ private Configuration configuration;
+
+ @Rule
+ public TemporaryFolder tmpOutputDir = new TemporaryFolder();
+
+ @Before
+ public void setUp() throws IOException {
+ namingScheme = new SequentialFileNamingScheme();
+ configuration = new Configuration();
+ }
+
+ @Test
+ public void testGetMapOutputName_EmptyDirectory() throws IOException {
+ assertEquals("part-m-00000", namingScheme.getMapOutputName(configuration, outputPath()));
+ }
+
+ @Test
+ public void testGetMapOutputName_NonEmptyDirectory() throws IOException {
+ createTwoExistingFiles();
+ assertEquals("part-m-00002", namingScheme.getMapOutputName(configuration, outputPath()));
+ }
+
+ @Test
+ public void testGetReduceOutputName_EmptyDirectory() throws IOException {
+ assertEquals("part-r-00000", namingScheme.getReduceOutputName(configuration, outputPath(), PARTITION_ID));
+ }
+
+ @Test
+ public void testGetReduceOutputName_NonEmptyDirectory() throws IOException {
+ createTwoExistingFiles();
+ assertEquals("part-r-00002", namingScheme.getReduceOutputName(configuration, outputPath(), PARTITION_ID));
+ }
+
+ // Hadoop Path pointing at the root of the per-test temporary folder.
+ private Path outputPath() {
+ return new Path(tmpOutputDir.getRoot().getAbsolutePath());
+ }
+
+ // Puts two empty files into the temporary folder before the naming scheme is queried.
+ private void createTwoExistingFiles() throws IOException {
+ File root = tmpOutputDir.getRoot();
+ new File(root, "existing-1").createNewFile();
+ new File(root, "existing-2").createNewFile();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/io/SourceTargetHelperTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/io/SourceTargetHelperTest.java b/crunch-core/src/test/java/org/apache/crunch/io/SourceTargetHelperTest.java
new file mode 100644
index 0000000..5b0ea55
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/io/SourceTargetHelperTest.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocalFileSystem;
+import org.apache.hadoop.fs.Path;
+import org.junit.Test;
+
+public class SourceTargetHelperTest {
+ @Test
+ public void testGetNonexistentPathSize() throws Exception {
+ File tmp = File.createTempFile("pathsize", "");
+ Path tmpPath = new Path(tmp.getAbsolutePath());
+ if (!tmp.delete()) {
+ throw new IOException("Could not delete " + tmp + "; the path must not exist for this test");
+ }
+ FileSystem fs = FileSystem.getLocal(new Configuration());
+ assertEquals(-1L, SourceTargetHelper.getPathSize(fs, tmpPath));
+ }
+
+ @Test
+ public void testGetNonExistentPathSize_NonExistentPath() throws IOException {
+ FileSystem mockFs = new MockFileSystem();
+ assertEquals(-1L, SourceTargetHelper.getPathSize(mockFs, new Path("does/not/exist")));
+ }
+
+ /**
+ * Mock FileSystem that returns null for {@link FileSystem#listStatus(Path)}.
+ */
+ static class MockFileSystem extends LocalFileSystem {
+ @Override
+ public FileStatus[] listStatus(Path f) throws IOException {
+ return null;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/io/avro/AvroFileReaderFactoryTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/io/avro/AvroFileReaderFactoryTest.java b/crunch-core/src/test/java/org/apache/crunch/io/avro/AvroFileReaderFactoryTest.java
new file mode 100644
index 0000000..62085f8
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/io/avro/AvroFileReaderFactoryTest.java
@@ -0,0 +1,184 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.avro;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.avro.Schema;
+import org.apache.avro.file.DataFileWriter;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericData.Record;
+import org.apache.avro.generic.GenericDatumReader;
+import org.apache.avro.generic.GenericDatumWriter;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.avro.io.DatumReader;
+import org.apache.avro.reflect.ReflectData;
+import org.apache.avro.reflect.ReflectDatumReader;
+import org.apache.avro.specific.SpecificDatumReader;
+import org.apache.crunch.Pair;
+import org.apache.crunch.test.Person;
+import org.apache.crunch.test.StringWrapper;
+import org.apache.crunch.types.avro.AvroType;
+import org.apache.crunch.types.avro.Avros;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.junit.After;
+import org.junit.Assume;
+import org.junit.Before;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+
+public class AvroFileReaderFactoryTest {
+ private File avroFile;
+
+ @Before
+ public void setUp() throws IOException {
+ avroFile = File.createTempFile("test", ".av");
+ }
+
+ @After
+ public void tearDown() {
+ avroFile.delete();
+ }
+
+ // Writes the given records to avroFile as an Avro data file that uses outputSchema.
+ private void populateGenericFile(List<GenericRecord> genericRecords, Schema outputSchema) throws IOException {
+ FileOutputStream outputStream = new FileOutputStream(this.avroFile);
+ try {
+ GenericDatumWriter<GenericRecord> genericDatumWriter = new GenericDatumWriter<GenericRecord>(outputSchema);
+ DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(genericDatumWriter);
+ dataFileWriter.create(outputSchema, outputStream);
+ for (GenericRecord record : genericRecords) {
+ dataFileWriter.append(record);
+ }
+ dataFileWriter.close();
+ } finally {
+ // Release the file handle even when writing fails; closing an already closed stream has no effect.
+ outputStream.close();
+ }
+ }
+
+ private <T> AvroFileReaderFactory<T> createFileReaderFactory(AvroType<T> avroType) {
+ return new AvroFileReaderFactory<T>(avroType);
+ }
+
+ @Test
+ public void testRead_GenericReader() throws IOException {
+ GenericRecord savedRecord = new GenericData.Record(Person.SCHEMA$);
+ savedRecord.put("name", "John Doe");
+ savedRecord.put("age", 42);
+ savedRecord.put("siblingnames", Lists.newArrayList("Jimmy", "Jane"));
+ populateGenericFile(Lists.newArrayList(savedRecord), Person.SCHEMA$);
+
+ AvroFileReaderFactory<GenericData.Record> genericReader = createFileReaderFactory(Avros.generics(Person.SCHEMA$));
+ Iterator<GenericData.Record> recordIterator = genericReader.read(FileSystem.getLocal(new Configuration()),
+ new Path(this.avroFile.getAbsolutePath()));
+
+ GenericRecord genericRecord = recordIterator.next();
+ assertEquals(savedRecord, genericRecord);
+ assertFalse(recordIterator.hasNext());
+ }
+
+ @Test
+ public void testRead_SpecificReader() throws IOException {
+ GenericRecord savedRecord = new GenericData.Record(Person.SCHEMA$);
+ savedRecord.put("name", "John Doe");
+ savedRecord.put("age", 42);
+ savedRecord.put("siblingnames", Lists.newArrayList("Jimmy", "Jane"));
+ populateGenericFile(Lists.newArrayList(savedRecord), Person.SCHEMA$);
+
+ AvroFileReaderFactory<Person> specificReader = createFileReaderFactory(Avros.records(Person.class));
+ Iterator<Person> recordIterator = specificReader.read(FileSystem.getLocal(new Configuration()), new Path(
+ this.avroFile.getAbsolutePath()));
+
+ Person expectedPerson = new Person();
+ expectedPerson.age = 42;
+ expectedPerson.name = "John Doe";
+ List<CharSequence> siblingNames = Lists.newArrayList();
+ siblingNames.add("Jimmy");
+ siblingNames.add("Jane");
+ expectedPerson.siblingnames = siblingNames;
+
+ Person person = recordIterator.next();
+
+ assertEquals(expectedPerson, person);
+ assertFalse(recordIterator.hasNext());
+ }
+
+ @Test
+ public void testRead_ReflectReader() throws IOException {
+ Schema reflectSchema = ReflectData.get().getSchema(StringWrapper.class);
+ GenericRecord savedRecord = new GenericData.Record(reflectSchema);
+ savedRecord.put("value", "stringvalue");
+ populateGenericFile(Lists.newArrayList(savedRecord), reflectSchema);
+
+ AvroFileReaderFactory<StringWrapper> reflectReader = createFileReaderFactory(Avros.reflects(StringWrapper.class));
+ Iterator<StringWrapper> recordIterator = reflectReader.read(FileSystem.getLocal(new Configuration()), new Path(
+ this.avroFile.getAbsolutePath()));
+
+ StringWrapper stringWrapper = recordIterator.next();
+
+ assertEquals("stringvalue", stringWrapper.getValue());
+ assertFalse(recordIterator.hasNext());
+ }
+
+ @Test
+ public void testCreateDatumReader_Generic() {
+ DatumReader<Record> datumReader = AvroFileReaderFactory.createDatumReader(Avros.generics(Person.SCHEMA$));
+ assertEquals(GenericDatumReader.class, datumReader.getClass());
+ }
+
+ @Test
+ public void testCreateDatumReader_Reflect() {
+ DatumReader<StringWrapper> datumReader = AvroFileReaderFactory.createDatumReader(Avros
+ .reflects(StringWrapper.class));
+ assertEquals(ReflectDatumReader.class, datumReader.getClass());
+ }
+
+ @Test
+ public void testCreateDatumReader_Specific() {
+ DatumReader<Person> datumReader = AvroFileReaderFactory.createDatumReader(Avros.records(Person.class));
+ assertEquals(SpecificDatumReader.class, datumReader.getClass());
+ }
+
+ @Test
+ public void testCreateDatumReader_ReflectAndSpecific() {
+ Assume.assumeTrue(Avros.CAN_COMBINE_SPECIFIC_AND_REFLECT_SCHEMAS);
+ // Skipped (not failed) unless Avros reports that specific and reflect schemas can be combined.
+ DatumReader<Pair<Person, StringWrapper>> datumReader = AvroFileReaderFactory.createDatumReader(Avros.pairs(
+ Avros.records(Person.class), Avros.reflects(StringWrapper.class)));
+ assertEquals(ReflectDatumReader.class, datumReader.getClass());
+ }
+
+ @Test(expected = IllegalStateException.class)
+ public void testCreateDatumReader_ReflectAndSpecific_NotSupported() {
+ Assume.assumeTrue(!Avros.CAN_COMBINE_SPECIFIC_AND_REFLECT_SCHEMAS);
+ AvroFileReaderFactory.createDatumReader(Avros.pairs(Avros.records(Person.class),
+ Avros.reflects(StringWrapper.class)));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/io/avro/AvroFileSourceTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/io/avro/AvroFileSourceTest.java b/crunch-core/src/test/java/org/apache/crunch/io/avro/AvroFileSourceTest.java
new file mode 100644
index 0000000..ceef2b2
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/io/avro/AvroFileSourceTest.java
@@ -0,0 +1,91 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.avro;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.avro.generic.GenericData.Record;
+import org.apache.avro.mapred.AvroJob;
+import org.apache.crunch.test.Person;
+import org.apache.crunch.test.StringWrapper;
+import org.apache.crunch.types.avro.AvroType;
+import org.apache.crunch.types.avro.Avros;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.Job;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+public class AvroFileSourceTest {
+
+ private Job job;
+ private File tempFile;
+
+ @Before
+ public void setUp() throws IOException {
+ job = new Job();
+ tempFile = File.createTempFile("test", ".avr");
+ }
+
+ @After
+ public void tearDown() {
+ tempFile.delete();
+ }
+
+ @Test
+ public void testConfigureJob_SpecificData() throws IOException {
+ AvroType<Person> avroSpecificType = Avros.records(Person.class);
+ AvroFileSource<Person> personFileSource = new AvroFileSource<Person>(new Path(tempFile.getAbsolutePath()),
+ avroSpecificType);
+
+ personFileSource.configureSource(job, -1);
+ // Default of true: the assertion only passes if the flag was explicitly written as false.
+ assertFalse(job.getConfiguration().getBoolean(AvroJob.INPUT_IS_REFLECT, true));
+ assertEquals(Person.SCHEMA$.toString(), job.getConfiguration().get(AvroJob.INPUT_SCHEMA));
+ }
+
+ @Test
+ public void testConfigureJob_GenericData() throws IOException {
+ AvroType<Record> avroGenericType = Avros.generics(Person.SCHEMA$);
+ AvroFileSource<Record> recordFileSource = new AvroFileSource<Record>(new Path(tempFile.getAbsolutePath()),
+ avroGenericType);
+
+ recordFileSource.configureSource(job, -1);
+
+ // Default of true: the assertion only passes if the flag was explicitly written as false.
+ assertFalse(job.getConfiguration().getBoolean(AvroJob.INPUT_IS_REFLECT, true));
+ }
+
+ @Test
+ public void testConfigureJob_ReflectData() throws IOException {
+ AvroType<StringWrapper> avroReflectType = Avros.reflects(StringWrapper.class);
+ AvroFileSource<StringWrapper> wrapperFileSource = new AvroFileSource<StringWrapper>(new Path(
+ tempFile.getAbsolutePath()), avroReflectType);
+
+ wrapperFileSource.configureSource(job, -1);
+
+ // Default of false: the assertion only passes if the flag was explicitly written as true.
+ assertTrue(job.getConfiguration().getBoolean(AvroJob.INPUT_IS_REFLECT, false));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/lib/AvroIndexedRecordPartitionerTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/lib/AvroIndexedRecordPartitionerTest.java b/crunch-core/src/test/java/org/apache/crunch/lib/AvroIndexedRecordPartitionerTest.java
new file mode 100644
index 0000000..0dfed32
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/lib/AvroIndexedRecordPartitionerTest.java
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.avro.Schema;
+import org.apache.avro.generic.IndexedRecord;
+import org.apache.avro.mapred.AvroKey;
+import org.apache.avro.mapred.AvroValue;
+import org.apache.crunch.lib.join.JoinUtils.AvroIndexedRecordPartitioner;
+import org.junit.Before;
+import org.junit.Test;
+
+public class AvroIndexedRecordPartitionerTest {
+
+ private AvroIndexedRecordPartitioner partitioner;
+
+ @Before
+ public void setUp() {
+ partitioner = new AvroIndexedRecordPartitioner();
+ }
+
+ @Test
+ public void testGetPartition() {
+ // A key whose hash code is 3 is expected in partition 3 of 5 and in partition 1 of 2.
+ AvroKey<IndexedRecord> key = new AvroKey<IndexedRecord>(new MockIndexedRecord(3));
+
+ assertEquals(3, partitioner.getPartition(key, new AvroValue<Object>(), 5));
+ assertEquals(1, partitioner.getPartition(key, new AvroValue<Object>(), 2));
+ }
+
+ @Test
+ public void testGetPartition_NegativeHashValue() {
+ // A hash code of -3 is expected in the same partitions as a hash code of 3.
+ AvroKey<IndexedRecord> key = new AvroKey<IndexedRecord>(new MockIndexedRecord(-3));
+
+ assertEquals(3, partitioner.getPartition(key, new AvroValue<Object>(), 5));
+ assertEquals(1, partitioner.getPartition(key, new AvroValue<Object>(), 2));
+ }
+
+ @Test
+ public void testGetPartition_IntegerMinValue() {
+ // Edge case: a hash code of Integer.MIN_VALUE with Integer.MAX_VALUE partitions is expected in partition 0.
+ AvroKey<IndexedRecord> key = new AvroKey<IndexedRecord>(new MockIndexedRecord(Integer.MIN_VALUE));
+
+ assertEquals(0, partitioner.getPartition(key, new AvroValue<Object>(), Integer.MAX_VALUE));
+ }
+
+ /**
+ * IndexedRecord stub whose hashCode is dictated by the test; schema access and put are unsupported.
+ */
+ static class MockIndexedRecord implements IndexedRecord {
+
+ private Integer hashValue;
+
+ public MockIndexedRecord(Integer hashValue) {
+ this.hashValue = hashValue;
+ }
+
+ @Override
+ public int hashCode() {
+ return hashValue.hashCode();
+ }
+
+ @Override
+ public Schema getSchema() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public Object get(int index) {
+ return hashValue;
+ }
+
+ @Override
+ public void put(int index, Object datum) {
+ throw new UnsupportedOperationException();
+ }
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/lib/CartesianTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/lib/CartesianTest.java b/crunch-core/src/test/java/org/apache/crunch/lib/CartesianTest.java
new file mode 100644
index 0000000..b19097c
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/lib/CartesianTest.java
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.impl.mem.MemPipeline;
+import org.apache.crunch.types.writable.Writables;
+import org.junit.Test;
+
+import com.google.common.collect.Lists;
+
+public class CartesianTest {
+
+ @Test
+ public void testCartesianCollection_SingleValues() {
+ // Crossing {"a", "b"} with {1, 2} is expected to yield all four (letter, number) pairs.
+ PCollection<String> letters = MemPipeline.typedCollectionOf(Writables.strings(), "a", "b");
+ PCollection<Integer> numbers = MemPipeline.typedCollectionOf(Writables.ints(), 1, 2);
+
+ @SuppressWarnings("unchecked")
+ List<Pair<String, Integer>> expected = Lists.newArrayList(
+ Pair.of("a", 1), Pair.of("a", 2), Pair.of("b", 1), Pair.of("b", 2));
+
+ PCollection<Pair<String, Integer>> crossed = Cartesian.cross(letters, numbers);
+ List<Pair<String, Integer>> actual = Lists.newArrayList(crossed.materialize());
+ Collections.sort(actual); // sorted so the comparison does not depend on emission order
+
+ assertEquals(expected, actual);
+ }
+
+ @Test
+ public void testCartesianCollection_Tables() {
+ // Crossing two tables is expected to pair up the keys and, separately, the values.
+ PTable<String, Integer> left = MemPipeline.typedTableOf(
+ Writables.tableOf(Writables.strings(), Writables.ints()), "a", 1, "b", 2);
+ PTable<String, Float> right = MemPipeline.typedTableOf(
+ Writables.tableOf(Writables.strings(), Writables.floats()), "A", 1.0f, "B", 2.0f);
+
+ List<Pair<Pair<String, String>, Pair<Integer, Float>>> expected = Lists.newArrayList();
+ expected.add(Pair.of(Pair.of("a", "A"), Pair.of(1, 1.0f)));
+ expected.add(Pair.of(Pair.of("a", "B"), Pair.of(1, 2.0f)));
+ expected.add(Pair.of(Pair.of("b", "A"), Pair.of(2, 1.0f)));
+ expected.add(Pair.of(Pair.of("b", "B"), Pair.of(2, 2.0f)));
+
+ PTable<Pair<String, String>, Pair<Integer, Float>> crossed = Cartesian.cross(left, right);
+
+ List<Pair<Pair<String, String>, Pair<Integer, Float>>> actual = Lists.newArrayList(crossed
+ .materialize());
+ // Sorted for the same reason as in the single-value test.
+ Collections.sort(actual);
+
+ assertEquals(expected, actual);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/lib/DistinctTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/lib/DistinctTest.java b/crunch-core/src/test/java/org/apache/crunch/lib/DistinctTest.java
new file mode 100644
index 0000000..8c0b3bf
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/lib/DistinctTest.java
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.crunch.PCollection;
+import org.apache.crunch.impl.mem.MemPipeline;
+import org.apache.crunch.types.avro.Avros;
+import org.junit.Test;
+
+import com.google.common.collect.ImmutableSet;
+
+public class DistinctTest {
+ // Twelve values drawn from four distinct integers: 17, 29, 36 and 45.
+ private static final List<Integer> DATA =
+ Arrays.asList(17, 29, 17, 29, 17, 29, 36, 45, 17, 45, 36, 29);
+
+ @Test
+ public void testDistinct() {
+ PCollection<Integer> ints = MemPipeline.typedCollectionOf(Avros.ints(), DATA);
+ ImmutableSet<Integer> distinctValues = ImmutableSet.copyOf(Distinct.distinct(ints).materialize());
+
+ assertEquals(ImmutableSet.copyOf(DATA), distinctValues);
+ }
+
+ @Test
+ public void testDistinctFlush() {
+ PCollection<Integer> ints = MemPipeline.typedCollectionOf(Avros.ints(), DATA);
+ ImmutableSet<Integer> distinctValues = ImmutableSet.copyOf(Distinct.distinct(ints, 2).materialize());
+
+ assertEquals(ImmutableSet.copyOf(DATA), distinctValues);
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/lib/SampleTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/lib/SampleTest.java b/crunch-core/src/test/java/org/apache/crunch/lib/SampleTest.java
new file mode 100644
index 0000000..bd6fd81
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/lib/SampleTest.java
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.List;
+import java.util.Map;
+
+import org.apache.crunch.PCollection;
+import org.apache.crunch.Pair;
+import org.apache.crunch.impl.mem.MemPipeline;
+import org.apache.crunch.types.writable.Writables;
+import org.junit.Test;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Maps;
+
+public class SampleTest {
+ private PCollection<Pair<String, Double>> values = MemPipeline.typedCollectionOf(
+ Writables.pairs(Writables.strings(), Writables.doubles()),
+ ImmutableList.of(
+ Pair.of("foo", 200.0),
+ Pair.of("bar", 400.0),
+ Pair.of("baz", 100.0),
+ Pair.of("biz", 100.0)));
+
+ @Test
+ public void testWRS() throws Exception {
+ Map<String, Integer> histogram = Maps.newHashMap();
+
+ for (int i = 0; i < 100; i++) {
+ PCollection<String> sample = Sample.weightedReservoirSample(values, 1, 1729L + i);
+ for (String s : sample.materialize()) {
+ if (!histogram.containsKey(s)) {
+ histogram.put(s, 1);
+ } else {
+ histogram.put(s, 1 + histogram.get(s));
+ }
+ }
+ }
+
+ Map<String, Integer> expected = ImmutableMap.of(
+ "foo", 24, "bar", 51, "baz", 13, "biz", 12);
+ assertEquals(expected, histogram);
+ }
+
+ @Test
+ public void testSample() {
+ PCollection<Integer> pcollect = MemPipeline.collectionOf(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
+ Iterable<Integer> sample = Sample.sample(pcollect, 123998L, 0.2).materialize();
+ List<Integer> sampleValues = ImmutableList.copyOf(sample);
+ assertEquals(ImmutableList.of(6, 7), sampleValues);
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/lib/SecondarySortTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/lib/SecondarySortTest.java b/crunch-core/src/test/java/org/apache/crunch/lib/SecondarySortTest.java
new file mode 100644
index 0000000..933b986
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/lib/SecondarySortTest.java
@@ -0,0 +1,53 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib;
+
+import static org.apache.crunch.types.avro.Avros.*;
+import static org.junit.Assert.assertEquals;
+
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.impl.mem.MemPipeline;
+import org.junit.Test;
+
+import com.google.common.collect.ImmutableList;
+
+
+public class SecondarySortTest {
+ @Test
+ public void testInMemory() throws Exception {
+ PTable<Long, Pair<Long, String>> keyed = MemPipeline.typedTableOf(tableOf(longs(), pairs(longs(), strings())),
+ 1729L, Pair.of(17L, "a"), 100L, Pair.of(29L, "b"), 1729L, Pair.of(29L, "c"));
+ PCollection<String> joined = SecondarySort.sortAndApply(keyed, new StringifyFn(), strings());
+ assertEquals(ImmutableList.of("b", "ac"), joined.materialize()); // key 100 -> "b", key 1729 -> "a" then "c"
+ }
+
+ private static class StringifyFn extends DoFn<Pair<Long, Iterable<Pair<Long, String>>>, String> {
+ @Override
+ public void process(Pair<Long, Iterable<Pair<Long, String>>> input, Emitter<String> emitter) {
+ StringBuilder letters = new StringBuilder(); // concatenation of the group's strings, in iteration order
+ for (Pair<Long, String> entry : input.second()) {
+ letters.append(entry.second());
+ }
+ emitter.emit(letters.toString());
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/lib/TupleWritablePartitionerTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/lib/TupleWritablePartitionerTest.java b/crunch-core/src/test/java/org/apache/crunch/lib/TupleWritablePartitionerTest.java
new file mode 100644
index 0000000..35ccc11
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/lib/TupleWritablePartitionerTest.java
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.lib;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.crunch.lib.join.JoinUtils.TupleWritablePartitioner;
+import org.apache.crunch.types.writable.TupleWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Writable;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TupleWritablePartitionerTest {
+
+ private TupleWritablePartitioner partitioner;
+
+ @Before
+ public void setUp() {
+ partitioner = new TupleWritablePartitioner();
+ }
+
+ @Test
+ public void testGetPartition() {
+ // A one-element tuple wrapping 3 is expected in partition 3 of 5 and in partition 1 of 2.
+ TupleWritable key = new TupleWritable(new Writable[] { new IntWritable(3) });
+ assertEquals(3, partitioner.getPartition(key, NullWritable.get(), 5));
+ assertEquals(1, partitioner.getPartition(key, NullWritable.get(), 2));
+ }
+
+ @Test
+ public void testGetPartition_NegativeHashValue() {
+ IntWritable negative = new IntWritable(-3);
+ // Precondition: the test is only meaningful if the element really hashes to a negative value
+ assertEquals(-3, negative.hashCode());
+
+ TupleWritable key = new TupleWritable(new Writable[] { negative });
+ assertEquals(3, partitioner.getPartition(key, NullWritable.get(), 5));
+ assertEquals(1, partitioner.getPartition(key, NullWritable.get(), 2));
+ }
+
+ @Test
+ public void testGetPartition_IntegerMinValue() {
+ IntWritable minValue = new IntWritable(Integer.MIN_VALUE);
+ // Precondition: the test is only meaningful if the element really hashes to Integer.MIN_VALUE
+ assertEquals(Integer.MIN_VALUE, minValue.hashCode());
+
+ TupleWritable key = new TupleWritable(new Writable[] { minValue });
+ assertEquals(0, partitioner.getPartition(key, NullWritable.get(), Integer.MAX_VALUE));
+ }
+
+}
[06/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/PTypeUtils.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/PTypeUtils.java b/crunch/src/main/java/org/apache/crunch/types/PTypeUtils.java
deleted file mode 100644
index e61b98b..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/PTypeUtils.java
+++ /dev/null
@@ -1,66 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types;
-
-import java.util.Collection;
-import java.util.List;
-
-import org.apache.crunch.Pair;
-import org.apache.crunch.Tuple;
-import org.apache.crunch.Tuple3;
-import org.apache.crunch.Tuple4;
-import org.apache.crunch.TupleN;
-
-/**
- * Utilities for converting between {@code PType}s from different
- * {@code PTypeFamily} implementations.
- * Not instantiable: the only constructor is private.
- */
-public class PTypeUtils {
- /** Rebuilds {@code ptype} with the factory methods of {@code tf}, converting nested key/value/element types via {@code tf.as}. */
- public static <T> PType<T> convert(PType<T> ptype, PTypeFamily tf) {
- if (ptype instanceof PTableType) { // tables: convert key and value types separately
- PTableType ptt = (PTableType) ptype;
- return tf.tableOf(tf.as(ptt.getKeyType()), tf.as(ptt.getValueType()));
- }
- Class<T> typeClass = ptype.getTypeClass();
- if (Tuple.class.isAssignableFrom(typeClass)) { // only the four tuple classes tested below are handled here
- List<PType> subTypes = ptype.getSubTypes();
- if (Pair.class.equals(typeClass)) {
- return tf.pairs(tf.as(subTypes.get(0)), tf.as(subTypes.get(1)));
- } else if (Tuple3.class.equals(typeClass)) {
- return tf.triples(tf.as(subTypes.get(0)), tf.as(subTypes.get(1)), tf.as(subTypes.get(2)));
- } else if (Tuple4.class.equals(typeClass)) {
- return tf.quads(tf.as(subTypes.get(0)), tf.as(subTypes.get(1)), tf.as(subTypes.get(2)), tf.as(subTypes.get(3)));
- } else if (TupleN.class.equals(typeClass)) {
- PType[] newPTypes = subTypes.toArray(new PType[0]); // copy, then overwrite each slot with its converted type
- for (int i = 0; i < newPTypes.length; i++) {
- newPTypes[i] = tf.as(subTypes.get(i));
- }
- return (PType<T>) tf.tuples(newPTypes);
- }
- } // any other Tuple subclass falls through to the checks below
- if (Collection.class.isAssignableFrom(typeClass)) {
- return tf.collections(tf.as(ptype.getSubTypes().get(0))); // element type is the first sub-type
- }
- return tf.records(typeClass); // everything else is treated as a record of typeClass
- }
-
- private PTypeUtils() {
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/PTypes.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/PTypes.java b/crunch/src/main/java/org/apache/crunch/types/PTypes.java
deleted file mode 100644
index 546719c..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/PTypes.java
+++ /dev/null
@@ -1,252 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types;
-
-import java.math.BigInteger;
-import java.nio.ByteBuffer;
-import java.util.UUID;
-
-import org.apache.crunch.CrunchRuntimeException;
-import org.apache.crunch.MapFn;
-import org.apache.hadoop.util.ReflectionUtils;
-import org.apache.thrift.TBase;
-import org.apache.thrift.TDeserializer;
-import org.apache.thrift.TException;
-import org.apache.thrift.TSerializer;
-import org.apache.thrift.protocol.TBinaryProtocol;
-import org.codehaus.jackson.map.ObjectMapper;
-
-import com.google.protobuf.InvalidProtocolBufferException;
-import com.google.protobuf.Message;
-
-/**
- * Utility functions for creating common types of derived PTypes, e.g., for JSON
- * data, protocol buffers, and Thrift records.
- *
- */
-public class PTypes {
-
- public static PType<BigInteger> bigInt(PTypeFamily typeFamily) {
- return typeFamily.derived(BigInteger.class, BYTE_TO_BIGINT, BIGINT_TO_BYTE, typeFamily.bytes());
- }
-
- public static PType<UUID> uuid(PTypeFamily ptf) {
- return ptf.derived(UUID.class, BYTE_TO_UUID, UUID_TO_BYTE, ptf.bytes());
- }
-
- public static <T> PType<T> jsonString(Class<T> clazz, PTypeFamily typeFamily) {
- return typeFamily
- .derived(clazz, new JacksonInputMapFn<T>(clazz), new JacksonOutputMapFn<T>(), typeFamily.strings());
- }
-
- public static <T extends Message> PType<T> protos(Class<T> clazz, PTypeFamily typeFamily) {
- return typeFamily.derived(clazz, new ProtoInputMapFn<T>(clazz), new ProtoOutputMapFn<T>(), typeFamily.bytes());
- }
-
- public static <T extends TBase> PType<T> thrifts(Class<T> clazz, PTypeFamily typeFamily) {
- return typeFamily.derived(clazz, new ThriftInputMapFn<T>(clazz), new ThriftOutputMapFn<T>(), typeFamily.bytes());
- }
-
- public static final <T extends Enum> PType<T> enums(final Class<T> type, PTypeFamily typeFamily) {
- return typeFamily.derived(type, new EnumInputMapper<T>(type), new EnumOutputMapper<T>(), typeFamily.strings());
- }
-
- public static MapFn<ByteBuffer, BigInteger> BYTE_TO_BIGINT = new MapFn<ByteBuffer, BigInteger>() {
- public BigInteger map(ByteBuffer input) {
- return input == null ? null : new BigInteger(input.array());
- }
- };
-
- public static MapFn<BigInteger, ByteBuffer> BIGINT_TO_BYTE = new MapFn<BigInteger, ByteBuffer>() {
- public ByteBuffer map(BigInteger input) {
- return input == null ? null : ByteBuffer.wrap(input.toByteArray());
- }
- };
-
- private static class JacksonInputMapFn<T> extends MapFn<String, T> {
-
- private final Class<T> clazz;
- private transient ObjectMapper mapper;
-
- public JacksonInputMapFn(Class<T> clazz) {
- this.clazz = clazz;
- }
-
- @Override
- public void initialize() {
- this.mapper = new ObjectMapper();
- }
-
- @Override
- public T map(String input) {
- try {
- return mapper.readValue(input, clazz);
- } catch (Exception e) {
- throw new CrunchRuntimeException(e);
- }
- }
- }
-
- private static class JacksonOutputMapFn<T> extends MapFn<T, String> {
-
- private transient ObjectMapper mapper;
-
- @Override
- public void initialize() {
- this.mapper = new ObjectMapper();
- }
-
- @Override
- public String map(T input) {
- try {
- return mapper.writeValueAsString(input);
- } catch (Exception e) {
- throw new CrunchRuntimeException(e);
- }
- }
- }
-
- private static class ProtoInputMapFn<T extends Message> extends MapFn<ByteBuffer, T> {
-
- private final Class<T> clazz;
- private transient T instance;
-
- public ProtoInputMapFn(Class<T> clazz) {
- this.clazz = clazz;
- }
-
- @Override
- public void initialize() {
- this.instance = Protos.getDefaultInstance(clazz);
- }
-
- @Override
- public T map(ByteBuffer bb) {
- try {
- return (T) instance.newBuilderForType().mergeFrom(bb.array(), bb.position(), bb.limit()).build();
- } catch (InvalidProtocolBufferException e) {
- throw new CrunchRuntimeException(e);
- }
- }
- }
-
- private static class ProtoOutputMapFn<T extends Message> extends MapFn<T, ByteBuffer> {
-
- public ProtoOutputMapFn() {
- }
-
- @Override
- public ByteBuffer map(T proto) {
- return ByteBuffer.wrap(proto.toByteArray());
- }
- }
-
- private static class ThriftInputMapFn<T extends TBase> extends MapFn<ByteBuffer, T> {
-
- private final Class<T> clazz;
- private transient T instance;
- private transient TDeserializer deserializer;
- private transient byte[] bytes;
-
- public ThriftInputMapFn(Class<T> clazz) {
- this.clazz = clazz;
- }
-
- @Override
- public void initialize() {
- this.instance = ReflectionUtils.newInstance(clazz, null);
- this.deserializer = new TDeserializer(new TBinaryProtocol.Factory());
- this.bytes = new byte[0];
- }
-
- @Override
- public T map(ByteBuffer bb) {
- T next = (T) instance.deepCopy();
- int len = bb.limit() - bb.position();
- if (len != bytes.length) {
- bytes = new byte[len];
- }
- System.arraycopy(bb.array(), bb.position(), bytes, 0, len);
- try {
- deserializer.deserialize(next, bytes);
- } catch (TException e) {
- throw new CrunchRuntimeException(e);
- }
- return next;
- }
- }
-
- private static class ThriftOutputMapFn<T extends TBase> extends MapFn<T, ByteBuffer> {
-
- private transient TSerializer serializer;
-
- public ThriftOutputMapFn() {
- }
-
- @Override
- public void initialize() {
- this.serializer = new TSerializer(new TBinaryProtocol.Factory());
- }
-
- @Override
- public ByteBuffer map(T t) {
- try {
- return ByteBuffer.wrap(serializer.serialize(t));
- } catch (TException e) {
- throw new CrunchRuntimeException(e);
- }
- }
- }
-
- private static class EnumInputMapper<T extends Enum> extends MapFn<String, T> {
- private final Class<T> type;
-
- public EnumInputMapper(Class<T> type) {
- this.type = type;
- }
-
- @Override
- public T map(String input) {
- return (T) Enum.valueOf(type, input);
- }
- };
-
- private static class EnumOutputMapper<T extends Enum> extends MapFn<T, String> {
-
- @Override
- public String map(T input) {
- return input.name();
- }
- };
-
- private static MapFn<ByteBuffer, UUID> BYTE_TO_UUID = new MapFn<ByteBuffer, UUID>() {
- @Override
- public UUID map(ByteBuffer input) {
- return new UUID(input.getLong(), input.getLong());
- }
- };
-
- private static MapFn<UUID, ByteBuffer> UUID_TO_BYTE = new MapFn<UUID, ByteBuffer>() {
- @Override
- public ByteBuffer map(UUID input) {
- ByteBuffer bb = ByteBuffer.wrap(new byte[16]);
- bb.asLongBuffer().put(input.getMostSignificantBits()).put(input.getLeastSignificantBits());
- return bb;
- }
- };
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/Protos.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/Protos.java b/crunch/src/main/java/org/apache/crunch/types/Protos.java
deleted file mode 100644
index 4cd5068..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/Protos.java
+++ /dev/null
@@ -1,173 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types;
-
-import java.util.Iterator;
-import java.util.List;
-
-import org.apache.crunch.CrunchRuntimeException;
-import org.apache.crunch.DoFn;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.MapFn;
-import org.apache.hadoop.util.ReflectionUtils;
-
-import com.google.common.base.Splitter;
-import com.google.protobuf.Descriptors.FieldDescriptor;
-import com.google.protobuf.Message;
-import com.google.protobuf.Message.Builder;
-
-/**
- * Utility functions for working with protocol buffers in Crunch.
- */
-public class Protos {
-
- /**
- * Utility function for creating a default PB Message from a Class object that
- * works with both protoc 2.3.0 and 2.4.x.
- * @param clazz The class of the protocol buffer to create
- * @return An instance of a protocol buffer
- */
- public static <M extends Message> M getDefaultInstance(Class<M> clazz) {
- if (clazz.getConstructors().length > 0) { // at least one public constructor is visible
- // Protobuf 2.3.0
- return ReflectionUtils.newInstance(clazz, null);
- } else {
- // Protobuf 2.4.x
- try { // call the static newBuilder() reflectively and ask the builder for the default instance
- Message.Builder mb = (Message.Builder) clazz.getDeclaredMethod("newBuilder").invoke(null);
- return (M) mb.getDefaultInstanceForType();
- } catch (Exception e) {
- throw new CrunchRuntimeException(e);
- }
- }
- }
- /** Returns a MapFn that extracts the value of the field named {@code fieldName} from each message. */
- public static <M extends Message, K> MapFn<M, K> extractKey(String fieldName) {
- return new ExtractKeyFn<M, K>(fieldName);
- }
- /** Returns a DoFn that splits each input line on {@code sep} and builds a {@code msgClass} message from the tokens. */
- public static <M extends Message> DoFn<String, M> lineParser(String sep, Class<M> msgClass) {
- return new TextToProtoFn<M>(sep, msgClass);
- }
- /** Looks the field descriptor up by name on the first message seen and reuses it for later messages. */
- private static class ExtractKeyFn<M extends Message, K> extends MapFn<M, K> {
-
- private final String fieldName;
-
- private transient FieldDescriptor fd;
-
- public ExtractKeyFn(String fieldName) {
- this.fieldName = fieldName;
- }
-
- @Override
- public K map(M input) {
- if (input == null) {
- throw new IllegalArgumentException("Null inputs not supported by Protos.ExtractKeyFn");
- } else if (fd == null) { // lazy lookup: only done until a descriptor has been found
- fd = input.getDescriptorForType().findFieldByName(fieldName);
- if (fd == null) {
- throw new IllegalStateException("Could not find field: " + fieldName + " in message: " + input);
- }
- }
- return (K) input.getField(fd);
- }
-
- }
- /** Assigns the tokens of a delimited line to the message's fields, in the order returned by the descriptor's getFields(). */
- private static class TextToProtoFn<M extends Message> extends DoFn<String, M> {
-
- private final String sep;
- private final Class<M> msgClass;
-
- private transient M msgInstance;
- private transient List<FieldDescriptor> fields;
- private transient Splitter splitter;
-
- enum ParseErrors {
- TOTAL,
- NUMBER_FORMAT
- };
-
- public TextToProtoFn(String sep, Class<M> msgClass) {
- this.sep = sep;
- this.msgClass = msgClass;
- }
-
- @Override
- public void initialize() {
- this.msgInstance = getDefaultInstance(msgClass);
- this.fields = msgInstance.getDescriptorForType().getFields();
- this.splitter = Splitter.on(sep);
- }
-
- @Override
- public void process(String input, Emitter<M> emitter) {
- if (input != null && !input.isEmpty()) { // null and empty lines are skipped without emitting or counting
- Builder b = msgInstance.newBuilderForType();
- Iterator<String> iter = splitter.split(input).iterator();
- boolean parseError = false;
- for (FieldDescriptor fd : fields) {
- if (iter.hasNext()) { // fields beyond the last token are left unset
- String value = iter.next();
- if (value != null && !value.isEmpty()) { // an empty token leaves its field unset
- Object parsedValue = null;
- try {
- switch (fd.getJavaType()) {
- case STRING:
- parsedValue = value;
- break;
- case INT:
- parsedValue = Integer.valueOf(value);
- break;
- case LONG:
- parsedValue = Long.valueOf(value);
- break;
- case FLOAT:
- parsedValue = Float.valueOf(value);
- break;
- case DOUBLE:
- parsedValue = Double.valueOf(value);
- break;
- case BOOLEAN:
- parsedValue = Boolean.valueOf(value);
- break;
- case ENUM:
- parsedValue = fd.getEnumType().findValueByName(value);
- break;
- }
- b.setField(fd, parsedValue); // NOTE(review): a java type with no case above reaches this call with parsedValue still null
- } catch (NumberFormatException nfe) {
- increment(ParseErrors.NUMBER_FORMAT);
- parseError = true;
- break; // stop at the first malformed number; the whole line is dropped below
- }
- }
- }
- }
- // A line with a parse error is counted under TOTAL and not emitted.
- if (parseError) {
- increment(ParseErrors.TOTAL);
- } else {
- emitter.emit((M) b.build());
- }
- }
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/TupleDeepCopier.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/TupleDeepCopier.java b/crunch/src/main/java/org/apache/crunch/types/TupleDeepCopier.java
deleted file mode 100644
index a2ffae3..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/TupleDeepCopier.java
+++ /dev/null
@@ -1,65 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types;
-
-import java.util.List;
-
-import org.apache.crunch.Tuple;
-import org.apache.hadoop.conf.Configuration;
-
-import com.google.common.collect.Lists;
-
-/**
- * Performs deep copies (based on underlying PType deep copying) of Tuple-based objects.
- *
- * @param <T> The type of Tuple implementation being copied
- */
-public class TupleDeepCopier<T extends Tuple> implements DeepCopier<T> {
-
-  /** Builds the copied tuple from the detached element values. */
-  private final TupleFactory<T> tupleFactory;
-  /** Element types in tuple position order; each one detaches the value at its position. */
-  private final List<PType> elementTypes;
-
-  /**
-   * @param tupleClass the Tuple implementation being copied; it must have a {@link TupleFactory}
-   * @param elementTypes the types of the tuple elements, in position order
-   * @throws IllegalArgumentException if no factory is available for {@code tupleClass}
-   */
-  public TupleDeepCopier(Class<T> tupleClass, PType... elementTypes) {
-    tupleFactory = TupleFactory.getTupleFactory(tupleClass);
-    this.elementTypes = Lists.newArrayList(elementTypes);
-  }
-
-  /**
-   * Initializes the tuple factory and every element type.
-   *
-   * @param conf the configuration handed to each element type
-   */
-  @Override
-  public void initialize(Configuration conf) {
-    // Factories for custom tuple classes only look up their constructor in initialize();
-    // without this call makeTuple fails on a factory that has not been initialized elsewhere.
-    tupleFactory.initialize();
-    for (PType elementType : elementTypes) {
-      elementType.initialize(conf);
-    }
-  }
-
-  /**
-   * Copies a tuple by detaching each element through its element type.
-   *
-   * @param source the tuple to copy; may be null
-   * @return a new tuple holding the detached values, or null if {@code source} is null
-   */
-  @Override
-  public T deepCopy(T source) {
-
-    if (source == null) {
-      return null;
-    }
-
-    // The array is sized from the source tuple; only positions that have an element type
-    // are filled in.
-    Object[] deepCopyValues = new Object[source.size()];
-
-    for (int valueIndex = 0; valueIndex < elementTypes.size(); valueIndex++) {
-      PType elementType = elementTypes.get(valueIndex);
-      deepCopyValues[valueIndex] = elementType.getDetachedValue(source.get(valueIndex));
-    }
-
-    return tupleFactory.makeTuple(deepCopyValues);
-  }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/TupleFactory.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/TupleFactory.java b/crunch/src/main/java/org/apache/crunch/types/TupleFactory.java
deleted file mode 100644
index 73b47de..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/TupleFactory.java
+++ /dev/null
@@ -1,134 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types;
-
-import java.io.Serializable;
-import java.lang.reflect.Constructor;
-import java.util.Map;
-
-import org.apache.crunch.CrunchRuntimeException;
-import org.apache.crunch.Pair;
-import org.apache.crunch.Tuple;
-import org.apache.crunch.Tuple3;
-import org.apache.crunch.Tuple4;
-import org.apache.crunch.TupleN;
-
-import com.google.common.collect.Maps;
-
-public abstract class TupleFactory<T extends Tuple> implements Serializable {
-
- public void initialize() {
- }
-
- public abstract T makeTuple(Object... values);
-
-
- private static final Map<Class, TupleFactory> customTupleFactories = Maps.newHashMap();
-
- /**
- * Get the {@link TupleFactory} for a given Tuple implementation.
- *
- * @param tupleClass
- * The class for which the factory is to be retrieved
- * @return The appropriate TupleFactory
- */
- public static <T extends Tuple> TupleFactory<T> getTupleFactory(Class<T> tupleClass) {
- if (tupleClass == Pair.class) {
- return (TupleFactory<T>) PAIR;
- } else if (tupleClass == Tuple3.class) {
- return (TupleFactory<T>) TUPLE3;
- } else if (tupleClass == Tuple4.class) {
- return (TupleFactory<T>) TUPLE4;
- } else if (tupleClass == TupleN.class) {
- return (TupleFactory<T>) TUPLEN;
- } else if (customTupleFactories.containsKey(tupleClass)) {
- return (TupleFactory<T>) customTupleFactories.get(tupleClass);
- } else {
- throw new IllegalArgumentException("Can't create TupleFactory for " + tupleClass);
- }
- }
-
- public static final TupleFactory<Pair> PAIR = new TupleFactory<Pair>() {
- @Override
- public Pair makeTuple(Object... values) {
- return Pair.of(values[0], values[1]);
- }
- };
-
- public static final TupleFactory<Tuple3> TUPLE3 = new TupleFactory<Tuple3>() {
- @Override
- public Tuple3 makeTuple(Object... values) {
- return Tuple3.of(values[0], values[1], values[2]);
- }
- };
-
- public static final TupleFactory<Tuple4> TUPLE4 = new TupleFactory<Tuple4>() {
- @Override
- public Tuple4 makeTuple(Object... values) {
- return Tuple4.of(values[0], values[1], values[2], values[3]);
- }
- };
-
- public static final TupleFactory<TupleN> TUPLEN = new TupleFactory<TupleN>() {
- @Override
- public TupleN makeTuple(Object... values) {
- return new TupleN(values);
- }
- };
-
- public static <T extends Tuple> TupleFactory<T> create(Class<T> clazz, Class... typeArgs) {
- if (customTupleFactories.containsKey(clazz)) {
- return (TupleFactory<T>) customTupleFactories.get(clazz);
- }
- TupleFactory<T> custom = new CustomTupleFactory<T>(clazz, typeArgs);
- customTupleFactories.put(clazz, custom);
- return custom;
- }
-
- private static class CustomTupleFactory<T extends Tuple> extends TupleFactory<T> {
-
- private final Class<T> clazz;
- private final Class[] typeArgs;
-
- private transient Constructor<T> constructor;
-
- public CustomTupleFactory(Class<T> clazz, Class[] typeArgs) {
- this.clazz = clazz;
- this.typeArgs = typeArgs;
- }
-
- @Override
- public void initialize() {
- try {
- constructor = clazz.getConstructor(typeArgs);
- } catch (Exception e) {
- throw new CrunchRuntimeException(e);
- }
- }
-
- @Override
- public T makeTuple(Object... values) {
- try {
- return constructor.newInstance(values);
- } catch (Exception e) {
- throw new CrunchRuntimeException(e);
- }
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/avro/AvroCapabilities.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/avro/AvroCapabilities.java b/crunch/src/main/java/org/apache/crunch/types/avro/AvroCapabilities.java
deleted file mode 100644
index cc1636c..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/avro/AvroCapabilities.java
+++ /dev/null
@@ -1,106 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.avro;
-
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-
-import org.apache.avro.Schema;
-import org.apache.avro.io.BinaryDecoder;
-import org.apache.avro.io.BinaryEncoder;
-import org.apache.avro.io.DecoderFactory;
-import org.apache.avro.io.EncoderFactory;
-import org.apache.avro.reflect.ReflectDatumReader;
-import org.apache.avro.reflect.ReflectDatumWriter;
-
-import com.google.common.collect.Lists;
-
-/**
- * Determines the capabilities of the Avro version that is currently being used.
- */
-class AvroCapabilities {
-
- public static class Record extends org.apache.avro.specific.SpecificRecordBase implements
- org.apache.avro.specific.SpecificRecord {
- public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser()
- .parse("{\"type\":\"record\",\"name\":\"Record\",\"namespace\":\"org.apache.crunch.types.avro\",\"fields\":[{\"name\":\"subrecords\",\"type\":{\"type\":\"array\",\"items\":\"string\"}}]}");
- @Deprecated
- public java.util.List<java.lang.CharSequence> subrecords;
-
- public java.lang.Object get(int field$) {
- switch (field$) {
- case 0:
- return subrecords;
- default:
- throw new org.apache.avro.AvroRuntimeException("Bad index");
- }
- }
-
- // Used by DatumReader. Applications should not call.
- @SuppressWarnings(value = "unchecked")
- public void put(int field$, java.lang.Object value$) {
- switch (field$) {
- case 0:
- subrecords = (java.util.List<java.lang.CharSequence>) value$;
- break;
- default:
- throw new org.apache.avro.AvroRuntimeException("Bad index");
- }
- }
-
- @Override
- public Schema getSchema() {
- return SCHEMA$;
- }
- }
-
- /**
- * Determine if the current Avro version can use the ReflectDatumReader to
- * read SpecificData that includes an array. The inability to do this was a
- * bug that was fixed in Avro 1.7.0.
- *
- * @return true if SpecificData can be properly read using a
- * ReflectDatumReader
- */
- static boolean canDecodeSpecificSchemaWithReflectDatumReader() {
- ReflectDatumReader<Record> datumReader = new ReflectDatumReader(Record.SCHEMA$);
- ReflectDatumWriter<Record> datumWriter = new ReflectDatumWriter(Record.SCHEMA$);
-
- Record record = new Record();
- record.subrecords = Lists.<CharSequence> newArrayList("a", "b");
-
- ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
- BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(byteArrayOutputStream, null);
-
- try {
- datumWriter.write(record, encoder);
- encoder.flush();
- BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(
- byteArrayOutputStream.toByteArray(), null);
- datumReader.read(record, decoder);
- } catch (IOException ioe) {
- throw new RuntimeException("Error performing specific schema test", ioe);
- } catch (ClassCastException cce) {
- // This indicates that we're using a pre-1.7.0 version of Avro, as the
- // ReflectDatumReader in those versions could not correctly handle an
- // array in a SpecificData value
- return false;
- }
- return true;
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/avro/AvroDeepCopier.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/avro/AvroDeepCopier.java b/crunch/src/main/java/org/apache/crunch/types/avro/AvroDeepCopier.java
deleted file mode 100644
index 0fe9288..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/avro/AvroDeepCopier.java
+++ /dev/null
@@ -1,209 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.avro;
-
-import java.io.ByteArrayOutputStream;
-import java.io.Serializable;
-
-import org.apache.avro.Schema;
-import org.apache.avro.generic.GenericData;
-import org.apache.avro.generic.GenericData.Record;
-import org.apache.avro.generic.GenericDatumReader;
-import org.apache.avro.generic.GenericDatumWriter;
-import org.apache.avro.io.BinaryDecoder;
-import org.apache.avro.io.BinaryEncoder;
-import org.apache.avro.io.DatumReader;
-import org.apache.avro.io.DatumWriter;
-import org.apache.avro.io.DecoderFactory;
-import org.apache.avro.io.EncoderFactory;
-import org.apache.avro.specific.SpecificDatumReader;
-import org.apache.avro.specific.SpecificDatumWriter;
-import org.apache.crunch.CrunchRuntimeException;
-import org.apache.crunch.types.DeepCopier;
-import org.apache.hadoop.conf.Configuration;
-
-/**
- * Performs deep copies of Avro-serializable objects.
- * <p>
- * <b>Warning:</b> Methods in this class are not thread-safe. This shouldn't be a problem when
- * running in a map-reduce context where each mapper/reducer is running in its own JVM, but it may
- * well be a problem in any other kind of multi-threaded context.
- */
-abstract class AvroDeepCopier<T> implements DeepCopier<T>, Serializable {
-
-  /** JSON form of the schema; this is what gets serialized in place of the Schema object. */
-  private String jsonSchema;
-  private transient Configuration conf;
-  /** Parsed lazily from {@link #jsonSchema} by {@link #getSchema()}. */
-  private transient Schema schema;
-  // Reused between deepCopy calls. They must be transient: Avro encoders and decoders are not
-  // Serializable, so a copier that had already been used could otherwise not be serialized.
-  private transient BinaryEncoder binaryEncoder;
-  private transient BinaryDecoder binaryDecoder;
-
-  private transient DatumWriter<T> datumWriter;
-  private transient DatumReader<T> datumReader;
-
-  /**
-   * @param schema the Avro schema of the values to be copied
-   */
-  public AvroDeepCopier(Schema schema) {
-    this.jsonSchema = schema.toString();
-  }
-
-  /**
-   * @return the schema of the copied values, parsed from its JSON form on first access
-   */
-  protected Schema getSchema() {
-    if (schema == null) {
-      schema = new Schema.Parser().parse(jsonSchema);
-    }
-    return schema;
-  }
-
-  /**
-   * Stores the configuration that is later passed to the datum reader/writer factory methods.
-   */
-  @Override
-  public void initialize(Configuration conf) {
-    this.conf = conf;
-  }
-
-  /** @return a fresh instance into which the copied data is read */
-  protected abstract T createCopyTarget();
-
-  /** @return the writer used to encode the source value */
-  protected abstract DatumWriter<T> createDatumWriter(Configuration conf);
-
-  /** @return the reader used to decode into the copy target */
-  protected abstract DatumReader<T> createDatumReader(Configuration conf);
-
-  /**
-   * Deep copier for Avro specific data objects.
-   */
-  public static class AvroSpecificDeepCopier<T> extends AvroDeepCopier<T> {
-
-    private Class<T> valueClass;
-
-    public AvroSpecificDeepCopier(Class<T> valueClass, Schema schema) {
-      super(schema);
-      this.valueClass = valueClass;
-    }
-
-    @Override
-    protected T createCopyTarget() {
-      return createNewInstance(valueClass);
-    }
-
-    @Override
-    protected DatumWriter<T> createDatumWriter(Configuration conf) {
-      return new SpecificDatumWriter<T>(getSchema());
-    }
-
-    @Override
-    protected DatumReader<T> createDatumReader(Configuration conf) {
-      return new SpecificDatumReader<T>(getSchema());
-    }
-
-  }
-
-  /**
-   * Deep copier for Avro generic data objects.
-   */
-  public static class AvroGenericDeepCopier extends AvroDeepCopier<Record> {
-
-    public AvroGenericDeepCopier(Schema schema) {
-      super(schema);
-    }
-
-    @Override
-    protected Record createCopyTarget() {
-      return new GenericData.Record(getSchema());
-    }
-
-    @Override
-    protected DatumReader<Record> createDatumReader(Configuration conf) {
-      return new GenericDatumReader<Record>(getSchema());
-    }
-
-    @Override
-    protected DatumWriter<Record> createDatumWriter(Configuration conf) {
-      return new GenericDatumWriter<Record>(getSchema());
-    }
-  }
-
-  /**
-   * Deep copier for Avro reflect data objects.
-   */
-  public static class AvroReflectDeepCopier<T> extends AvroDeepCopier<T> {
-
-    private Class<T> valueClass;
-
-    public AvroReflectDeepCopier(Class<T> valueClass, Schema schema) {
-      super(schema);
-      this.valueClass = valueClass;
-    }
-
-    @Override
-    protected T createCopyTarget() {
-      return createNewInstance(valueClass);
-    }
-
-    @Override
-    protected DatumReader<T> createDatumReader(Configuration conf) {
-      return Avros.getReflectDataFactory(conf).getReader(getSchema());
-    }
-
-    @Override
-    protected DatumWriter<T> createDatumWriter(Configuration conf) {
-      return Avros.getReflectDataFactory(conf).getWriter(getSchema());
-    }
-  }
-
-  /**
-   * Create a deep copy of an Avro value by encoding it to Avro binary and decoding the bytes
-   * into a freshly created target.
-   *
-   * @param source The value to be copied
-   * @return The deep copy of the value, or null if {@code source} is null
-   * @throws CrunchRuntimeException if encoding or decoding fails
-   */
-  @Override
-  public T deepCopy(T source) {
-
-    if (source == null) {
-      return null;
-    }
-
-    // The reader and writer are transient, so they are (re)created on first use.
-    if (datumReader == null) {
-      datumReader = createDatumReader(conf);
-    }
-    if (datumWriter == null) {
-      datumWriter = createDatumWriter(conf);
-    }
-    ByteArrayOutputStream byteOutStream = new ByteArrayOutputStream();
-    // Passing the previous encoder/decoder lets the factories reuse them.
-    binaryEncoder = EncoderFactory.get().binaryEncoder(byteOutStream, binaryEncoder);
-    T target = createCopyTarget();
-    try {
-      datumWriter.write(source, binaryEncoder);
-      binaryEncoder.flush();
-      binaryDecoder = DecoderFactory.get()
-          .binaryDecoder(byteOutStream.toByteArray(), binaryDecoder);
-      datumReader.read(target, binaryDecoder);
-    } catch (Exception e) {
-      throw new CrunchRuntimeException("Error while deep copying avro value " + source, e);
-    }
-
-    return target;
-  }
-
-  /**
-   * Instantiates the target class through its no-argument constructor.
-   *
-   * @throws CrunchRuntimeException if the class cannot be instantiated or accessed
-   */
-  protected T createNewInstance(Class<T> targetClass) {
-    try {
-      return targetClass.newInstance();
-    } catch (InstantiationException e) {
-      throw new CrunchRuntimeException(e);
-    } catch (IllegalAccessException e) {
-      throw new CrunchRuntimeException(e);
-    }
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/avro/AvroGroupedTableType.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/avro/AvroGroupedTableType.java b/crunch/src/main/java/org/apache/crunch/types/avro/AvroGroupedTableType.java
deleted file mode 100644
index 598868f..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/avro/AvroGroupedTableType.java
+++ /dev/null
@@ -1,114 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.avro;
-
-import java.util.Collection;
-
-import org.apache.avro.mapred.AvroJob;
-import org.apache.avro.mapred.AvroKey;
-import org.apache.avro.mapred.AvroKeyComparator;
-import org.apache.avro.mapred.AvroValue;
-import org.apache.crunch.GroupingOptions;
-import org.apache.crunch.MapFn;
-import org.apache.crunch.Pair;
-import org.apache.crunch.fn.PairMapFn;
-import org.apache.crunch.lib.PTables;
-import org.apache.crunch.types.Converter;
-import org.apache.crunch.types.PGroupedTableType;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.mapreduce.Job;
-
-/**
- *
- *
- */
-class AvroGroupedTableType<K, V> extends PGroupedTableType<K, V> {
-
- private static final AvroPairConverter CONVERTER = new AvroPairConverter();
- private final MapFn inputFn;
- private final MapFn outputFn;
-
- public AvroGroupedTableType(AvroTableType<K, V> tableType) {
- super(tableType);
- AvroType keyType = (AvroType) tableType.getKeyType();
- AvroType valueType = (AvroType) tableType.getValueType();
- this.inputFn = new PairIterableMapFn(keyType.getInputMapFn(), valueType.getInputMapFn());
- this.outputFn = new PairMapFn(keyType.getOutputMapFn(), valueType.getOutputMapFn());
- }
-
- @Override
- public Class<Pair<K, Iterable<V>>> getTypeClass() {
- return (Class<Pair<K, Iterable<V>>>) Pair.of(null, null).getClass();
- }
-
- @Override
- public Converter getGroupingConverter() {
- return CONVERTER;
- }
-
- @Override
- public MapFn getInputMapFn() {
- return inputFn;
- }
-
- @Override
- public MapFn getOutputMapFn() {
- return outputFn;
- }
-
- @Override
- public void initialize(Configuration conf) {
- getTableType().initialize(conf);
- }
-
- @Override
- public Pair<K, Iterable<V>> getDetachedValue(Pair<K, Iterable<V>> value) {
- return PTables.getGroupedDetachedValue(this, value);
- }
-
- @Override
- public void configureShuffle(Job job, GroupingOptions options) {
- AvroTableType<K, V> att = (AvroTableType<K, V>) tableType;
- String schemaJson = att.getSchema().toString();
- Configuration conf = job.getConfiguration();
-
- if (att.hasReflect()) {
- if (att.hasSpecific()) {
- Avros.checkCombiningSpecificAndReflectionSchemas();
- }
- conf.setBoolean(AvroJob.MAP_OUTPUT_IS_REFLECT, true);
- }
- conf.set(AvroJob.MAP_OUTPUT_SCHEMA, schemaJson);
- job.setSortComparatorClass(AvroKeyComparator.class);
- job.setMapOutputKeyClass(AvroKey.class);
- job.setMapOutputValueClass(AvroValue.class);
- if (options != null) {
- options.configure(job);
- }
-
- Avros.configureReflectDataFactory(conf);
-
- Collection<String> serializations = job.getConfiguration().getStringCollection(
- "io.serializations");
- if (!serializations.contains(SafeAvroSerialization.class.getName())) {
- serializations.add(SafeAvroSerialization.class.getName());
- job.getConfiguration().setStrings("io.serializations", serializations.toArray(new String[0]));
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/avro/AvroInputFormat.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/avro/AvroInputFormat.java b/crunch/src/main/java/org/apache/crunch/types/avro/AvroInputFormat.java
deleted file mode 100644
index b8bbebd..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/avro/AvroInputFormat.java
+++ /dev/null
@@ -1,41 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.avro;
-
-import java.io.IOException;
-
-import org.apache.avro.Schema;
-import org.apache.avro.mapred.AvroJob;
-import org.apache.avro.mapred.AvroWrapper;
-import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.mapreduce.InputSplit;
-import org.apache.hadoop.mapreduce.RecordReader;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-
-/** An {@link org.apache.hadoop.mapreduce.InputFormat} for Avro data files. */
-public class AvroInputFormat<T> extends FileInputFormat<AvroWrapper<T>, NullWritable> {
-
-  /**
-   * Creates a record reader that uses the schema stored in the job configuration under
-   * {@code AvroJob.INPUT_SCHEMA}. The task status is set to the split's string form.
-   *
-   * @param split the split to be read
-   * @param context the task context supplying the configuration
-   * @return a new {@link AvroRecordReader} for the configured input schema
-   */
-  @Override
-  public RecordReader<AvroWrapper<T>, NullWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
-      throws IOException, InterruptedException {
-    context.setStatus(split.toString());
-    String schemaJson = context.getConfiguration().get(AvroJob.INPUT_SCHEMA);
-    Schema inputSchema = new Schema.Parser().parse(schemaJson);
-    return new AvroRecordReader<T>(inputSchema);
-  }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/avro/AvroKeyConverter.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/avro/AvroKeyConverter.java b/crunch/src/main/java/org/apache/crunch/types/avro/AvroKeyConverter.java
deleted file mode 100644
index 68b717d..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/avro/AvroKeyConverter.java
+++ /dev/null
@@ -1,65 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.avro;
-
-import org.apache.avro.mapred.AvroWrapper;
-import org.apache.crunch.types.Converter;
-import org.apache.hadoop.io.NullWritable;
-
-/**
- * Converter between plain values and {@link AvroWrapper} keys paired with a
- * {@link NullWritable} value. A single wrapper instance is created on demand and reused for
- * every output key.
- */
-class AvroKeyConverter<K> implements Converter<AvroWrapper<K>, NullWritable, K, Iterable<K>> {
-
-  // Created lazily by getWrapper().
-  private transient AvroWrapper<K> wrapper = null;
-
-  @Override
-  public K convertInput(AvroWrapper<K> key, NullWritable value) {
-    return key.datum();
-  }
-
-  @Override
-  public AvroWrapper<K> outputKey(K value) {
-    AvroWrapper<K> reused = getWrapper();
-    reused.datum(value);
-    return reused;
-  }
-
-  @Override
-  public NullWritable outputValue(K value) {
-    return NullWritable.get();
-  }
-
-  @Override
-  public Class<AvroWrapper<K>> getKeyClass() {
-    AvroWrapper<K> instance = getWrapper();
-    return (Class<AvroWrapper<K>>) instance.getClass();
-  }
-
-  @Override
-  public Class<NullWritable> getValueClass() {
-    return NullWritable.class;
-  }
-
-  /** Returns the shared wrapper, creating it on first use. */
-  private AvroWrapper<K> getWrapper() {
-    if (wrapper != null) {
-      return wrapper;
-    }
-    wrapper = new AvroWrapper<K>();
-    return wrapper;
-  }
-
-  /** Not supported by this converter. */
-  @Override
-  public Iterable<K> convertIterableInput(AvroWrapper<K> key, Iterable<NullWritable> value) {
-    throw new UnsupportedOperationException("Should not be possible");
-  }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/avro/AvroOutputFormat.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/avro/AvroOutputFormat.java b/crunch/src/main/java/org/apache/crunch/types/avro/AvroOutputFormat.java
deleted file mode 100644
index 98d3f50..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/avro/AvroOutputFormat.java
+++ /dev/null
@@ -1,87 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.avro;
-
-import java.io.IOException;
-
-import org.apache.avro.Schema;
-import org.apache.avro.file.CodecFactory;
-import org.apache.avro.file.DataFileWriter;
-import org.apache.avro.mapred.AvroJob;
-import org.apache.avro.mapred.AvroWrapper;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapreduce.RecordWriter;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
-
-/** An {@link org.apache.hadoop.mapreduce.OutputFormat} for Avro data files. */
-public class AvroOutputFormat<T> extends FileOutputFormat<AvroWrapper<T>, NullWritable> {
-
-  /**
-   * Opens an Avro data file in the task's default work file and returns a writer that appends
-   * the datum of every wrapper it is given.
-   * <p>
-   * The schema is read from {@code avro.output.schema.<name>} when {@code crunch.namedoutput}
-   * is set to a non-empty name, and from the job's Avro output schema otherwise.
-   *
-   * @param context the task context supplying the configuration and the output location
-   * @return a record writer backed by an Avro {@link DataFileWriter}
-   */
-  @Override
-  public RecordWriter<AvroWrapper<T>, NullWritable> getRecordWriter(TaskAttemptContext context)
-      throws IOException, InterruptedException {
-
-    Configuration conf = context.getConfiguration();
-    String namedOutput = conf.get("crunch.namedoutput");
-    Schema schema;
-    if (namedOutput == null || namedOutput.isEmpty()) {
-      schema = AvroJob.getOutputSchema(context.getConfiguration());
-    } else {
-      schema = (new Schema.Parser()).parse(conf.get("avro.output.schema." + namedOutput));
-    }
-
-    ReflectDataFactory factory = Avros.getReflectDataFactory(conf);
-    final DataFileWriter<T> writer = new DataFileWriter<T>(factory.<T> getWriter(schema));
-
-    JobConf jc = new JobConf(conf);
-    /* copied from org.apache.avro.mapred.AvroOutputFormat */
-
-    if (org.apache.hadoop.mapred.FileOutputFormat.getCompressOutput(jc)) {
-      int level = conf.getInt(org.apache.avro.mapred.AvroOutputFormat.DEFLATE_LEVEL_KEY,
-          org.apache.avro.mapred.AvroOutputFormat.DEFAULT_DEFLATE_LEVEL);
-      String codecName = conf.get(AvroJob.OUTPUT_CODEC,
-          org.apache.avro.file.DataFileConstants.DEFLATE_CODEC);
-      CodecFactory codec;
-      if (codecName.equals(org.apache.avro.file.DataFileConstants.DEFLATE_CODEC)) {
-        // Only the deflate codec takes the configured compression level.
-        codec = CodecFactory.deflateCodec(level);
-      } else {
-        codec = CodecFactory.fromString(codecName);
-      }
-      writer.setCodec(codec);
-    }
-
-    int syncInterval = jc.getInt(org.apache.avro.mapred.AvroOutputFormat.SYNC_INTERVAL_KEY,
-        org.apache.avro.file.DataFileConstants.DEFAULT_SYNC_INTERVAL);
-    writer.setSyncInterval(syncInterval);
-
-    Path path = getDefaultWorkFile(context, org.apache.avro.mapred.AvroOutputFormat.EXT);
-    writer.create(schema, path.getFileSystem(context.getConfiguration()).create(path));
-
-    return new RecordWriter<AvroWrapper<T>, NullWritable>() {
-      @Override
-      public void write(AvroWrapper<T> wrapper, NullWritable ignore) throws IOException {
-        writer.append(wrapper.datum());
-      }
-
-      @Override
-      public void close(TaskAttemptContext ctx) throws IOException, InterruptedException {
-        writer.close();
-      }
-    };
-  }
-
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/avro/AvroPairConverter.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/avro/AvroPairConverter.java b/crunch/src/main/java/org/apache/crunch/types/avro/AvroPairConverter.java
deleted file mode 100644
index d1d2627..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/avro/AvroPairConverter.java
+++ /dev/null
@@ -1,108 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.avro;
-
-import java.util.Iterator;
-
-import org.apache.avro.mapred.AvroKey;
-import org.apache.avro.mapred.AvroValue;
-import org.apache.crunch.Pair;
-import org.apache.crunch.types.Converter;
-
-class AvroPairConverter<K, V> implements Converter<AvroKey<K>, AvroValue<V>, Pair<K, V>, Pair<K, Iterable<V>>> {
-
- private transient AvroKey<K> keyWrapper = null;
- private transient AvroValue<V> valueWrapper = null;
-
- @Override
- public Pair<K, V> convertInput(AvroKey<K> key, AvroValue<V> value) {
- return Pair.of(key.datum(), value.datum());
- }
-
- public Pair<K, Iterable<V>> convertIterableInput(AvroKey<K> key, Iterable<AvroValue<V>> iter) {
- Iterable<V> it = new AvroWrappedIterable<V>(iter);
- return Pair.of(key.datum(), it);
- }
-
- @Override
- public AvroKey<K> outputKey(Pair<K, V> value) {
- getKeyWrapper().datum(value.first());
- return keyWrapper;
- }
-
- @Override
- public AvroValue<V> outputValue(Pair<K, V> value) {
- getValueWrapper().datum(value.second());
- return valueWrapper;
- }
-
- @Override
- public Class<AvroKey<K>> getKeyClass() {
- return (Class<AvroKey<K>>) getKeyWrapper().getClass();
- }
-
- @Override
- public Class<AvroValue<V>> getValueClass() {
- return (Class<AvroValue<V>>) getValueWrapper().getClass();
- }
-
- private AvroKey<K> getKeyWrapper() {
- if (keyWrapper == null) {
- keyWrapper = new AvroKey<K>();
- }
- return keyWrapper;
- }
-
- private AvroValue<V> getValueWrapper() {
- if (valueWrapper == null) {
- valueWrapper = new AvroValue<V>();
- }
- return valueWrapper;
- }
-
- private static class AvroWrappedIterable<V> implements Iterable<V> {
-
- private final Iterable<AvroValue<V>> iters;
-
- public AvroWrappedIterable(Iterable<AvroValue<V>> iters) {
- this.iters = iters;
- }
-
- @Override
- public Iterator<V> iterator() {
- return new Iterator<V>() {
- private final Iterator<AvroValue<V>> it = iters.iterator();
-
- @Override
- public boolean hasNext() {
- return it.hasNext();
- }
-
- @Override
- public V next() {
- return it.next().datum();
- }
-
- @Override
- public void remove() {
- it.remove();
- }
- };
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/avro/AvroRecordReader.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/avro/AvroRecordReader.java b/crunch/src/main/java/org/apache/crunch/types/avro/AvroRecordReader.java
deleted file mode 100644
index 9c7578c..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/avro/AvroRecordReader.java
+++ /dev/null
@@ -1,114 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.avro;
-
-import java.io.IOException;
-
-import org.apache.avro.Schema;
-import org.apache.avro.file.DataFileReader;
-import org.apache.avro.file.FileReader;
-import org.apache.avro.file.SeekableInput;
-import org.apache.avro.io.DatumReader;
-import org.apache.avro.mapred.AvroJob;
-import org.apache.avro.mapred.AvroWrapper;
-import org.apache.avro.mapred.FsInput;
-import org.apache.avro.specific.SpecificDatumReader;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.mapreduce.InputSplit;
-import org.apache.hadoop.mapreduce.RecordReader;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.lib.input.FileSplit;
-
-/** An {@link RecordReader} for Avro data files. */
-class AvroRecordReader<T> extends RecordReader<AvroWrapper<T>, NullWritable> {
-
- private FileReader<T> reader;
- private long start;
- private long end;
- private AvroWrapper<T> key;
- private NullWritable value;
- private Schema schema;
-
- public AvroRecordReader(Schema schema) {
- this.schema = schema;
- }
-
- @Override
- public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException {
- FileSplit split = (FileSplit) genericSplit;
- Configuration conf = context.getConfiguration();
- SeekableInput in = new FsInput(split.getPath(), conf);
- DatumReader<T> datumReader = null;
- if (context.getConfiguration().getBoolean(AvroJob.INPUT_IS_REFLECT, true)) {
- ReflectDataFactory factory = Avros.getReflectDataFactory(conf);
- datumReader = factory.getReader(schema);
- } else {
- datumReader = new SpecificDatumReader<T>(schema);
- }
- this.reader = DataFileReader.openReader(in, datumReader);
- reader.sync(split.getStart()); // sync to start
- this.start = reader.tell();
- this.end = split.getStart() + split.getLength();
- }
-
- @Override
- public boolean nextKeyValue() throws IOException, InterruptedException {
- if (!reader.hasNext() || reader.pastSync(end)) {
- key = null;
- value = null;
- return false;
- }
- if (key == null) {
- key = new AvroWrapper<T>();
- }
- if (value == null) {
- value = NullWritable.get();
- }
- key.datum(reader.next(key.datum()));
- return true;
- }
-
- @Override
- public AvroWrapper<T> getCurrentKey() throws IOException, InterruptedException {
- return key;
- }
-
- @Override
- public NullWritable getCurrentValue() throws IOException, InterruptedException {
- return value;
- }
-
- @Override
- public float getProgress() throws IOException {
- if (end == start) {
- return 0.0f;
- } else {
- return Math.min(1.0f, (getPos() - start) / (float) (end - start));
- }
- }
-
- public long getPos() throws IOException {
- return reader.tell();
- }
-
- @Override
- public void close() throws IOException {
- reader.close();
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/avro/AvroTableType.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/avro/AvroTableType.java b/crunch/src/main/java/org/apache/crunch/types/avro/AvroTableType.java
deleted file mode 100644
index 86613df..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/avro/AvroTableType.java
+++ /dev/null
@@ -1,151 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.avro;
-
-import org.apache.avro.Schema;
-import org.apache.avro.generic.IndexedRecord;
-import org.apache.crunch.MapFn;
-import org.apache.crunch.Pair;
-import org.apache.crunch.lib.PTables;
-import org.apache.crunch.types.PGroupedTableType;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.TupleDeepCopier;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.mapreduce.TaskInputOutputContext;
-
-/**
- * The implementation of the PTableType interface for Avro-based serialization.
- *
- */
-class AvroTableType<K, V> extends AvroType<Pair<K, V>> implements PTableType<K, V> {
-
- private static class PairToAvroPair extends MapFn<Pair, org.apache.avro.mapred.Pair> {
- private final MapFn keyMapFn;
- private final MapFn valueMapFn;
- private final String firstJson;
- private final String secondJson;
-
- private String pairSchemaJson;
- private transient Schema pairSchema;
-
- public PairToAvroPair(AvroType keyType, AvroType valueType) {
- this.keyMapFn = keyType.getOutputMapFn();
- this.firstJson = keyType.getSchema().toString();
- this.valueMapFn = valueType.getOutputMapFn();
- this.secondJson = valueType.getSchema().toString();
- }
-
- @Override
- public void configure(Configuration conf) {
- keyMapFn.configure(conf);
- valueMapFn.configure(conf);
- }
-
- @Override
- public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
- keyMapFn.setContext(context);
- valueMapFn.setContext(context);
- }
-
- @Override
- public void initialize() {
- keyMapFn.initialize();
- valueMapFn.initialize();
- pairSchemaJson = org.apache.avro.mapred.Pair.getPairSchema(
- new Schema.Parser().parse(firstJson), new Schema.Parser().parse(secondJson)).toString();
- }
-
- @Override
- public org.apache.avro.mapred.Pair map(Pair input) {
- if (pairSchema == null) {
- pairSchema = new Schema.Parser().parse(pairSchemaJson);
- }
- org.apache.avro.mapred.Pair avroPair = new org.apache.avro.mapred.Pair(pairSchema);
- avroPair.key(keyMapFn.map(input.first()));
- avroPair.value(valueMapFn.map(input.second()));
- return avroPair;
- }
- }
-
- private static class IndexedRecordToPair extends MapFn<IndexedRecord, Pair> {
-
- private final MapFn firstMapFn;
- private final MapFn secondMapFn;
-
- public IndexedRecordToPair(MapFn firstMapFn, MapFn secondMapFn) {
- this.firstMapFn = firstMapFn;
- this.secondMapFn = secondMapFn;
- }
-
- @Override
- public void configure(Configuration conf) {
- firstMapFn.configure(conf);
- secondMapFn.configure(conf);
- }
-
- @Override
- public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
- firstMapFn.setContext(context);
- secondMapFn.setContext(context);
- }
-
- @Override
- public void initialize() {
- firstMapFn.initialize();
- secondMapFn.initialize();
- }
-
- @Override
- public Pair map(IndexedRecord input) {
- return Pair.of(firstMapFn.map(input.get(0)), secondMapFn.map(input.get(1)));
- }
- }
-
- private final AvroType<K> keyType;
- private final AvroType<V> valueType;
-
- public AvroTableType(AvroType<K> keyType, AvroType<V> valueType, Class<Pair<K, V>> pairClass) {
- super(pairClass, org.apache.avro.mapred.Pair.getPairSchema(keyType.getSchema(),
- valueType.getSchema()), new IndexedRecordToPair(keyType.getInputMapFn(),
- valueType.getInputMapFn()), new PairToAvroPair(keyType, valueType), new TupleDeepCopier(
- Pair.class, keyType, valueType), keyType, valueType);
- this.keyType = keyType;
- this.valueType = valueType;
- }
-
- @Override
- public PType<K> getKeyType() {
- return keyType;
- }
-
- @Override
- public PType<V> getValueType() {
- return valueType;
- }
-
- @Override
- public PGroupedTableType<K, V> getGroupedTableType() {
- return new AvroGroupedTableType<K, V>(this);
- }
-
- @Override
- public Pair<K, V> getDetachedValue(Pair<K, V> value) {
- return PTables.getDetachedValue(this, value);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/avro/AvroTextOutputFormat.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/avro/AvroTextOutputFormat.java b/crunch/src/main/java/org/apache/crunch/types/avro/AvroTextOutputFormat.java
deleted file mode 100644
index 4930235..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/avro/AvroTextOutputFormat.java
+++ /dev/null
@@ -1,60 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.avro;
-
-import java.io.IOException;
-
-import org.apache.avro.mapred.AvroWrapper;
-import org.apache.hadoop.mapreduce.RecordWriter;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
-
-public class AvroTextOutputFormat<K, V> extends TextOutputFormat<K, V> {
- class DatumRecordTextWriter extends RecordWriter<K, V> {
- private RecordWriter lineRecordWriter;
-
- public DatumRecordTextWriter(RecordWriter recordWriter) {
- this.lineRecordWriter = recordWriter;
- }
-
- @Override
- public void close(TaskAttemptContext context) throws IOException, InterruptedException {
- lineRecordWriter.close(context);
- }
-
- @Override
- public void write(K arg0, V arg1) throws IOException, InterruptedException {
- lineRecordWriter.write(getData(arg0), getData(arg1));
- }
-
- private Object getData(Object o) {
- Object data = o;
- if (o instanceof AvroWrapper) {
- data = ((AvroWrapper) o).datum();
- }
- return data;
- }
- }
-
- @Override
- public RecordWriter<K, V> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
- RecordWriter<K, V> recordWriter = super.getRecordWriter(context);
- return new DatumRecordTextWriter(recordWriter);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/avro/AvroType.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/avro/AvroType.java b/crunch/src/main/java/org/apache/crunch/types/avro/AvroType.java
deleted file mode 100644
index a92b0d0..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/avro/AvroType.java
+++ /dev/null
@@ -1,199 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.avro;
-
-import java.util.List;
-
-import org.apache.avro.Schema;
-import org.apache.avro.generic.GenericData;
-import org.apache.avro.specific.SpecificRecord;
-import org.apache.commons.lang.builder.HashCodeBuilder;
-import org.apache.crunch.MapFn;
-import org.apache.crunch.fn.IdentityFn;
-import org.apache.crunch.io.ReadableSourceTarget;
-import org.apache.crunch.io.avro.AvroFileSourceTarget;
-import org.apache.crunch.types.Converter;
-import org.apache.crunch.types.DeepCopier;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-
-import com.google.common.base.Preconditions;
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Lists;
-
-/**
- * The implementation of the PType interface for Avro-based serialization.
- *
- */
-public class AvroType<T> implements PType<T> {
-
- private static final Converter AVRO_CONVERTER = new AvroKeyConverter();
-
- private final Class<T> typeClass;
- private final String schemaString;
- private transient Schema schema;
- private final MapFn baseInputMapFn;
- private final MapFn baseOutputMapFn;
- private final List<PType> subTypes;
- private DeepCopier<T> deepCopier;
- private boolean initialized = false;
-
- public AvroType(Class<T> typeClass, Schema schema, DeepCopier<T> deepCopier, PType... ptypes) {
- this(typeClass, schema, IdentityFn.getInstance(), IdentityFn.getInstance(), deepCopier, ptypes);
- }
-
- public AvroType(Class<T> typeClass, Schema schema, MapFn inputMapFn, MapFn outputMapFn,
- DeepCopier<T> deepCopier, PType... ptypes) {
- this.typeClass = typeClass;
- this.schema = Preconditions.checkNotNull(schema);
- this.schemaString = schema.toString();
- this.baseInputMapFn = inputMapFn;
- this.baseOutputMapFn = outputMapFn;
- this.deepCopier = deepCopier;
- this.subTypes = ImmutableList.<PType> builder().add(ptypes).build();
- }
-
- @Override
- public Class<T> getTypeClass() {
- return typeClass;
- }
-
- @Override
- public PTypeFamily getFamily() {
- return AvroTypeFamily.getInstance();
- }
-
- @Override
- public List<PType> getSubTypes() {
- return Lists.<PType> newArrayList(subTypes);
- }
-
- public Schema getSchema() {
- if (schema == null) {
- schema = new Schema.Parser().parse(schemaString);
- }
- return schema;
- }
-
- /**
- * Determine if the wrapped type is a specific data avro type or wraps one.
- *
- * @return true if the wrapped type is a specific data type or wraps one
- */
- public boolean hasSpecific() {
- if (Avros.isPrimitive(this)) {
- return false;
- }
-
- if (!this.subTypes.isEmpty()) {
- for (PType<?> subType : this.subTypes) {
- AvroType<?> atype = (AvroType<?>) subType;
- if (atype.hasSpecific()) {
- return true;
- }
- }
- return false;
- }
-
- return SpecificRecord.class.isAssignableFrom(typeClass);
- }
-
- /**
- * Determine if the wrapped type is a generic data avro type.
- *
- * @return true if the wrapped type is a generic type
- */
- public boolean isGeneric() {
- return GenericData.Record.class.equals(typeClass);
- }
-
- /**
- * Determine if the wrapped type is a reflection-based avro type or wraps one.
- *
- * @return true if the wrapped type is a reflection-based type or wraps one.
- */
- public boolean hasReflect() {
- if (Avros.isPrimitive(this)) {
- return false;
- }
-
- if (!this.subTypes.isEmpty()) {
- for (PType<?> subType : this.subTypes) {
- if (((AvroType<?>) subType).hasReflect()) {
- return true;
- }
- }
- return false;
- }
-
- return !(typeClass.equals(GenericData.Record.class) || SpecificRecord.class
- .isAssignableFrom(typeClass));
- }
-
- public MapFn<Object, T> getInputMapFn() {
- return baseInputMapFn;
- }
-
- public MapFn<T, Object> getOutputMapFn() {
- return baseOutputMapFn;
- }
-
- @Override
- public Converter getConverter() {
- return AVRO_CONVERTER;
- }
-
- @Override
- public ReadableSourceTarget<T> getDefaultFileSource(Path path) {
- return new AvroFileSourceTarget<T>(path, this);
- }
-
- @Override
- public void initialize(Configuration conf) {
- deepCopier.initialize(conf);
- initialized = true;
- }
-
- @Override
- public T getDetachedValue(T value) {
- if (!initialized) {
- throw new IllegalStateException("Cannot call getDetachedValue on an uninitialized PType");
- }
- return deepCopier.deepCopy(value);
- }
-
- @Override
- public boolean equals(Object other) {
- if (other == null || !(other instanceof AvroType)) {
- return false;
- }
- AvroType at = (AvroType) other;
- return (typeClass.equals(at.typeClass) && subTypes.equals(at.subTypes));
-
- }
-
- @Override
- public int hashCode() {
- HashCodeBuilder hcb = new HashCodeBuilder();
- hcb.append(typeClass).append(subTypes);
- return hcb.toHashCode();
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/avro/AvroTypeFamily.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/avro/AvroTypeFamily.java b/crunch/src/main/java/org/apache/crunch/types/avro/AvroTypeFamily.java
deleted file mode 100644
index e09e173..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/avro/AvroTypeFamily.java
+++ /dev/null
@@ -1,164 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.avro;
-
-import java.nio.ByteBuffer;
-import java.util.Collection;
-import java.util.Map;
-
-import org.apache.avro.Schema;
-import org.apache.avro.generic.GenericData;
-import org.apache.crunch.MapFn;
-import org.apache.crunch.Pair;
-import org.apache.crunch.Tuple;
-import org.apache.crunch.Tuple3;
-import org.apache.crunch.Tuple4;
-import org.apache.crunch.TupleN;
-import org.apache.crunch.types.PGroupedTableType;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.crunch.types.PTypeUtils;
-
-public class AvroTypeFamily implements PTypeFamily {
-
- private static final AvroTypeFamily INSTANCE = new AvroTypeFamily();
-
- public static AvroTypeFamily getInstance() {
- return INSTANCE;
- }
-
- // There can only be one instance.
- private AvroTypeFamily() {
- }
-
- @Override
- public PType<Void> nulls() {
- return Avros.nulls();
- }
-
- @Override
- public PType<String> strings() {
- return Avros.strings();
- }
-
- @Override
- public PType<Long> longs() {
- return Avros.longs();
- }
-
- @Override
- public PType<Integer> ints() {
- return Avros.ints();
- }
-
- @Override
- public PType<Float> floats() {
- return Avros.floats();
- }
-
- @Override
- public PType<Double> doubles() {
- return Avros.doubles();
- }
-
- @Override
- public PType<Boolean> booleans() {
- return Avros.booleans();
- }
-
- @Override
- public PType<ByteBuffer> bytes() {
- return Avros.bytes();
- }
-
- @Override
- public <T> PType<T> records(Class<T> clazz) {
- return Avros.records(clazz);
- }
-
- public PType<GenericData.Record> generics(Schema schema) {
- return Avros.generics(schema);
- }
-
- public <T> PType<T> containers(Class<T> clazz) {
- return Avros.containers(clazz);
- }
-
- @Override
- public <T> PType<Collection<T>> collections(PType<T> ptype) {
- return Avros.collections(ptype);
- }
-
- @Override
- public <T> PType<Map<String, T>> maps(PType<T> ptype) {
- return Avros.maps(ptype);
- }
-
- @Override
- public <V1, V2> PType<Pair<V1, V2>> pairs(PType<V1> p1, PType<V2> p2) {
- return Avros.pairs(p1, p2);
- }
-
- @Override
- public <V1, V2, V3> PType<Tuple3<V1, V2, V3>> triples(PType<V1> p1, PType<V2> p2, PType<V3> p3) {
- return Avros.triples(p1, p2, p3);
- }
-
- @Override
- public <V1, V2, V3, V4> PType<Tuple4<V1, V2, V3, V4>> quads(PType<V1> p1, PType<V2> p2, PType<V3> p3, PType<V4> p4) {
- return Avros.quads(p1, p2, p3, p4);
- }
-
- @Override
- public PType<TupleN> tuples(PType<?>... ptypes) {
- return Avros.tuples(ptypes);
- }
-
- @Override
- public <K, V> PTableType<K, V> tableOf(PType<K> key, PType<V> value) {
- return Avros.tableOf(key, value);
- }
-
- @Override
- public <T> PType<T> as(PType<T> ptype) {
- if (ptype instanceof AvroType || ptype instanceof AvroGroupedTableType) {
- return ptype;
- }
- if (ptype instanceof PGroupedTableType) {
- PTableType ptt = ((PGroupedTableType) ptype).getTableType();
- return new AvroGroupedTableType((AvroTableType) as(ptt));
- }
- Class<T> typeClass = ptype.getTypeClass();
- PType<T> prim = Avros.getPrimitiveType(typeClass);
- if (prim != null) {
- return prim;
- }
- return PTypeUtils.convert(ptype, this);
- }
-
- @Override
- public <T extends Tuple> PType<T> tuples(Class<T> clazz, PType<?>... ptypes) {
- return Avros.tuples(clazz, ptypes);
- }
-
- @Override
- public <S, T> PType<T> derived(Class<T> clazz, MapFn<S, T> inputFn, MapFn<T, S> outputFn, PType<S> base) {
- return Avros.derived(clazz, inputFn, outputFn, base);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/avro/AvroUtf8InputFormat.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/avro/AvroUtf8InputFormat.java b/crunch/src/main/java/org/apache/crunch/types/avro/AvroUtf8InputFormat.java
deleted file mode 100644
index 9460fa5..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/avro/AvroUtf8InputFormat.java
+++ /dev/null
@@ -1,98 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types.avro;
-
-import java.io.IOException;
-
-import org.apache.avro.mapred.AvroWrapper;
-import org.apache.avro.util.Utf8;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.compress.CompressionCodecFactory;
-import org.apache.hadoop.mapreduce.InputSplit;
-import org.apache.hadoop.mapreduce.RecordReader;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
-
-/**
- * An {@link org.apache.hadoop.mapred.InputFormat} for text files. Each line is
- * a {@link Utf8} key; values are null.
- */
-public class AvroUtf8InputFormat extends FileInputFormat<AvroWrapper<Utf8>, NullWritable> {
-
- static class Utf8LineRecordReader extends RecordReader<AvroWrapper<Utf8>, NullWritable> {
-
- private LineRecordReader lineRecordReader;
-
- private AvroWrapper<Utf8> currentKey = new AvroWrapper<Utf8>();
-
- public Utf8LineRecordReader() throws IOException {
- this.lineRecordReader = new LineRecordReader();
- }
-
- public void close() throws IOException {
- lineRecordReader.close();
- }
-
- public float getProgress() throws IOException {
- return lineRecordReader.getProgress();
- }
-
- @Override
- public AvroWrapper<Utf8> getCurrentKey() throws IOException, InterruptedException {
- Text txt = lineRecordReader.getCurrentValue();
- currentKey.datum(new Utf8(txt.toString()));
- return currentKey;
- }
-
- @Override
- public NullWritable getCurrentValue() throws IOException, InterruptedException {
- return NullWritable.get();
- }
-
- @Override
- public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
- lineRecordReader.initialize(split, context);
- }
-
- @Override
- public boolean nextKeyValue() throws IOException, InterruptedException {
- return lineRecordReader.nextKeyValue();
- }
- }
-
- private CompressionCodecFactory compressionCodecs = null;
-
- public void configure(Configuration conf) {
- compressionCodecs = new CompressionCodecFactory(conf);
- }
-
- protected boolean isSplitable(FileSystem fs, Path file) {
- return compressionCodecs.getCodec(file) == null;
- }
-
- @Override
- public RecordReader<AvroWrapper<Utf8>, NullWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
- throws IOException, InterruptedException {
- return new Utf8LineRecordReader();
- }
-}
[31/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/CrunchOutputs.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/CrunchOutputs.java b/crunch-core/src/main/java/org/apache/crunch/io/CrunchOutputs.java
new file mode 100644
index 0000000..ccf4fb5
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/CrunchOutputs.java
@@ -0,0 +1,184 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io;
+
+import org.apache.crunch.CrunchRuntimeException;
+import org.apache.crunch.hadoop.mapreduce.TaskAttemptContextFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.OutputFormat;
+import org.apache.hadoop.mapreduce.RecordWriter;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.TaskInputOutputContext;
+import org.apache.hadoop.util.ReflectionUtils;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Splitter;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * An analogue of {@link CrunchInputs} for handling multiple {@code OutputFormat} instances
+ * writing to multiple files within a single MapReduce job.
+ */
+public class CrunchOutputs<K, V> {
+ public static final String CRUNCH_OUTPUTS = "crunch.outputs.dir";
+
+ private static final char RECORD_SEP = ',';
+ private static final char FIELD_SEP = ';';
+ private static final Joiner JOINER = Joiner.on(FIELD_SEP);
+ private static final Splitter SPLITTER = Splitter.on(FIELD_SEP);
+
+ public static void addNamedOutput(Job job, String name,
+ Class<? extends OutputFormat> outputFormatClass,
+ Class keyClass, Class valueClass) {
+ addNamedOutput(job, name, FormatBundle.forOutput(outputFormatClass), keyClass, valueClass);
+ }
+
+ public static void addNamedOutput(Job job, String name,
+ FormatBundle<? extends OutputFormat> outputBundle,
+ Class keyClass, Class valueClass) {
+ Configuration conf = job.getConfiguration();
+ String inputs = JOINER.join(name, outputBundle.serialize(), keyClass.getName(), valueClass.getName());
+ String existing = conf.get(CRUNCH_OUTPUTS);
+ conf.set(CRUNCH_OUTPUTS, existing == null ? inputs : existing + RECORD_SEP + inputs);
+ }
+
+ private static class OutputConfig<K, V> {
+ public FormatBundle<OutputFormat<K, V>> bundle;
+ public Class<K> keyClass;
+ public Class<V> valueClass;
+
+ public OutputConfig(FormatBundle<OutputFormat<K, V>> bundle,
+ Class<K> keyClass, Class<V> valueClass) {
+ this.bundle = bundle;
+ this.keyClass = keyClass;
+ this.valueClass = valueClass;
+ }
+ }
+
+ private static Map<String, OutputConfig> getNamedOutputs(
+ TaskInputOutputContext<?, ?, ?, ?> context) {
+ Map<String, OutputConfig> out = Maps.newHashMap();
+ Configuration conf = context.getConfiguration();
+ for (String input : Splitter.on(RECORD_SEP).split(conf.get(CRUNCH_OUTPUTS))) {
+ List<String> fields = Lists.newArrayList(SPLITTER.split(input));
+ String name = fields.get(0);
+ FormatBundle<OutputFormat> bundle = FormatBundle.fromSerialized(fields.get(1),
+ OutputFormat.class);
+ try {
+ Class<?> keyClass = Class.forName(fields.get(2));
+ Class<?> valueClass = Class.forName(fields.get(3));
+ out.put(name, new OutputConfig(bundle, keyClass, valueClass));
+ } catch (ClassNotFoundException e) {
+ throw new CrunchRuntimeException(e);
+ }
+ }
+ return out;
+ }
+
+ private static final String BASE_OUTPUT_NAME = "mapreduce.output.basename";
+ private static final String COUNTERS_GROUP = CrunchOutputs.class.getName();
+
+ private TaskInputOutputContext<?, ?, K, V> baseContext;
+ private Map<String, OutputConfig> namedOutputs;
+ private Map<String, RecordWriter<K, V>> recordWriters;
+ private Map<String, TaskAttemptContext> taskContextCache;
+
+ /**
+ * Creates and initializes multiple outputs support. It should be
+ * instantiated in the Mapper/Reducer setup method.
+ *
+ * @param context the TaskInputOutputContext object
+ */
+ public CrunchOutputs(TaskInputOutputContext<?, ?, K, V> context) {
+ this.baseContext = context;
+ namedOutputs = getNamedOutputs(context);
+ recordWriters = Maps.newHashMap();
+ taskContextCache = Maps.newHashMap();
+ }
+
+ @SuppressWarnings("unchecked")
+ public void write(String namedOutput, K key, V value)
+ throws IOException, InterruptedException {
+ if (!namedOutputs.containsKey(namedOutput)) {
+ throw new IllegalArgumentException("Undefined named output '" +
+ namedOutput + "'");
+ }
+ TaskAttemptContext taskContext = getContext(namedOutput);
+ baseContext.getCounter(COUNTERS_GROUP, namedOutput).increment(1);
+ getRecordWriter(taskContext, namedOutput).write(key, value);
+ }
+
+ public void close() throws IOException, InterruptedException {
+ for (RecordWriter<?, ?> writer : recordWriters.values()) {
+ writer.close(baseContext);
+ }
+ }
+
+ private TaskAttemptContext getContext(String nameOutput) throws IOException {
+ TaskAttemptContext taskContext = taskContextCache.get(nameOutput);
+ if (taskContext != null) {
+ return taskContext;
+ }
+
+ // The following trick leverages the instantiation of a record writer via
+ // the job thus supporting arbitrary output formats.
+ OutputConfig outConfig = namedOutputs.get(nameOutput);
+ Configuration conf = new Configuration(baseContext.getConfiguration());
+ Job job = new Job(conf);
+ job.getConfiguration().set("crunch.namedoutput", nameOutput);
+ job.setOutputFormatClass(outConfig.bundle.getFormatClass());
+ job.setOutputKeyClass(outConfig.keyClass);
+ job.setOutputValueClass(outConfig.valueClass);
+ outConfig.bundle.configure(job.getConfiguration());
+ taskContext = TaskAttemptContextFactory.create(
+ job.getConfiguration(), baseContext.getTaskAttemptID());
+
+ taskContextCache.put(nameOutput, taskContext);
+ return taskContext;
+ }
+
+ private synchronized RecordWriter<K, V> getRecordWriter(
+ TaskAttemptContext taskContext, String namedOutput)
+ throws IOException, InterruptedException {
+ // look for record-writer in the cache
+ RecordWriter<K, V> writer = recordWriters.get(namedOutput);
+
+ // If not in cache, create a new one
+ if (writer == null) {
+ // get the record writer from context output format
+ taskContext.getConfiguration().set(BASE_OUTPUT_NAME, namedOutput);
+ try {
+ OutputFormat format = ReflectionUtils.newInstance(
+ taskContext.getOutputFormatClass(),
+ taskContext.getConfiguration());
+ writer = format.getRecordWriter(taskContext);
+ } catch (ClassNotFoundException e) {
+ throw new IOException(e);
+ }
+ recordWriters.put(namedOutput, writer);
+ }
+
+ return writer;
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/FileNamingScheme.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/FileNamingScheme.java b/crunch-core/src/main/java/org/apache/crunch/io/FileNamingScheme.java
new file mode 100644
index 0000000..cf93651
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/FileNamingScheme.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * Encapsulates rules for naming output files. It is the responsibility of
+ * implementors to avoid file name collisions.
+ */
+public interface FileNamingScheme {
+
+ /**
+ * Get the output file name for a map task. Note that the implementation is
+ * responsible for avoiding naming collisions.
+ *
+ * @param configuration The configuration of the job for which the map output
+ * is being written
+ * @param outputDirectory The directory where the output will be written
+ * @return The filename for the output of the map task
+ * @throws IOException if an exception occurs while accessing the output file
+ * system
+ */
+ String getMapOutputName(Configuration configuration, Path outputDirectory) throws IOException;
+
+ /**
+ * Get the output file name for a reduce task. Note that the implementation is
+ * responsible for avoiding naming collisions.
+ *
+ * @param configuration The configuration of the job for which output is being
+ * written
+ * @param outputDirectory The directory where the file will be written
+ * @param partitionId The partition of the reduce task being output
+ * @return The filename for the output of the reduce task
+ * @throws IOException if an exception occurs while accessing output file
+ * system
+ */
+ String getReduceOutputName(Configuration configuration, Path outputDirectory, int partitionId) throws IOException;
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/FileReaderFactory.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/FileReaderFactory.java b/crunch-core/src/main/java/org/apache/crunch/io/FileReaderFactory.java
new file mode 100644
index 0000000..5cccb7b
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/FileReaderFactory.java
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io;
+
+import java.util.Iterator;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+public interface FileReaderFactory<T> {
+ Iterator<T> read(FileSystem fs, Path path);
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/FormatBundle.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/FormatBundle.java b/crunch-core/src/main/java/org/apache/crunch/io/FormatBundle.java
new file mode 100644
index 0000000..d969009
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/FormatBundle.java
@@ -0,0 +1,121 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.Serializable;
+import java.util.Map;
+
+import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.lang.builder.HashCodeBuilder;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.OutputFormat;
+
+import com.google.common.collect.Maps;
+
+/**
+ * A combination of an {@link InputFormat} or {@link OutputFormat} and any extra
+ * configuration information that format class needs to run.
+ *
+ * <p>The {@code FormatBundle} allows us to let different formats act as
+ * if they are the only format that exists in a particular MapReduce job, even
+ * when we have multiple types of inputs and outputs within a single job.
+ */
+public class FormatBundle<K> implements Serializable {
+
+ private Class<K> formatClass;
+ private Map<String, String> extraConf;
+
+ public static <T> FormatBundle<T> fromSerialized(String serialized, Class<T> clazz) {
+ ByteArrayInputStream bais = new ByteArrayInputStream(Base64.decodeBase64(serialized));
+ try {
+ ObjectInputStream ois = new ObjectInputStream(bais);
+ FormatBundle<T> bundle = (FormatBundle<T>) ois.readObject();
+ ois.close();
+ return bundle;
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ } catch (ClassNotFoundException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public static <T extends InputFormat<?, ?>> FormatBundle<T> forInput(Class<T> inputFormatClass) {
+ return new FormatBundle<T>(inputFormatClass);
+ }
+
+ public static <T extends OutputFormat<?, ?>> FormatBundle<T> forOutput(Class<T> inputFormatClass) {
+ return new FormatBundle<T>(inputFormatClass);
+ }
+
+ private FormatBundle(Class<K> formatClass) {
+ this.formatClass = formatClass;
+ this.extraConf = Maps.newHashMap();
+ }
+
+ public FormatBundle<K> set(String key, String value) {
+ this.extraConf.put(key, value);
+ return this;
+ }
+
+ public Class<K> getFormatClass() {
+ return formatClass;
+ }
+
+ public Configuration configure(Configuration conf) {
+ for (Map.Entry<String, String> e : extraConf.entrySet()) {
+ conf.set(e.getKey(), e.getValue());
+ }
+ return conf;
+ }
+
+ public String serialize() {
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ try {
+ ObjectOutputStream oos = new ObjectOutputStream(baos);
+ oos.writeObject(this);
+ oos.close();
+ return Base64.encodeBase64String(baos.toByteArray());
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public String getName() {
+ return formatClass.getSimpleName();
+ }
+
+ @Override
+ public int hashCode() {
+ return new HashCodeBuilder().append(formatClass).append(extraConf).toHashCode();
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (other == null || !(other instanceof FormatBundle)) {
+ return false;
+ }
+ FormatBundle<K> oib = (FormatBundle<K>) other;
+ return formatClass.equals(oib.formatClass) && extraConf.equals(oib.extraConf);
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/From.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/From.java b/crunch-core/src/main/java/org/apache/crunch/io/From.java
new file mode 100644
index 0000000..e4cfb6a
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/From.java
@@ -0,0 +1,324 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io;
+
+import org.apache.avro.specific.SpecificRecord;
+import org.apache.crunch.Source;
+import org.apache.crunch.TableSource;
+import org.apache.crunch.io.avro.AvroFileSource;
+import org.apache.crunch.io.impl.FileTableSourceImpl;
+import org.apache.crunch.io.seq.SeqFileSource;
+import org.apache.crunch.io.seq.SeqFileTableSource;
+import org.apache.crunch.io.text.TextFileSource;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.crunch.types.avro.AvroType;
+import org.apache.crunch.types.avro.Avros;
+import org.apache.crunch.types.writable.Writables;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+
+/**
+ * <p>Static factory methods for creating common {@link Source} types.</p>
+ *
+ * <p>The {@code From} class is intended to provide a literate API for creating
+ * Crunch pipelines from common input file types.</p>
+ *
+ * <pre>{@code
+ * Pipeline pipeline = new MRPipeline(this.getClass());
+ *
+ * // Reference the lines of a text file by wrapping the TextInputFormat class.
+ * PCollection<String> lines = pipeline.read(From.textFile("/path/to/myfiles"));
+ *
+ * // Reference entries from a sequence file where the key is a LongWritable and the
+ * // value is a custom Writable class.
+ * PTable<LongWritable, MyWritable> table = pipeline.read(From.sequenceFile(
+ * "/path/to/seqfiles", LongWritable.class, MyWritable.class));
+ *
+ * // Reference the records from an Avro file, where MyAvroObject implements Avro's
+ * // SpecificRecord interface.
+ * PCollection<MyAvroObject> myObjects = pipeline.read(From.avroFile("/path/to/avrofiles",
+ * MyAvroObject.class));
+ *
+ * // References the key-value pairs from a custom extension of FileInputFormat:
+ * PTable<KeyWritable, ValueWritable> custom = pipeline.read(From.formattedFile(
+ * "/custom", MyFileInputFormat.class, KeyWritable.class, ValueWritable.class));
+ * }</pre>
+ *
+ */
+public class From {
+
+ /**
+ * Creates a {@code TableSource<K, V>} for reading data from files that have custom
+ * {@code FileInputFormat<K, V>} implementations not covered by the provided {@code TableSource}
+ * and {@code Source} factory methods.
+ *
+ * @param pathName The name of the path to the data on the filesystem
+ * @param formatClass The {@code FileInputFormat} implementation
+ * @param keyClass The {@code Writable} to use for the key
+ * @param valueClass The {@code Writable} to use for the value
+ * @return A new {@code TableSource<K, V>} instance
+ */
+ public static <K extends Writable, V extends Writable> TableSource<K, V> formattedFile(
+ String pathName, Class<? extends FileInputFormat<K, V>> formatClass,
+ Class<K> keyClass, Class<V> valueClass) {
+ return formattedFile(new Path(pathName), formatClass, keyClass, valueClass);
+ }
+
+ /**
+ * Creates a {@code TableSource<K, V>} for reading data from files that have custom
+ * {@code FileInputFormat<K, V>} implementations not covered by the provided {@code TableSource}
+ * and {@code Source} factory methods.
+ *
+ * @param path The {@code Path} to the data
+ * @param formatClass The {@code FileInputFormat} implementation
+ * @param keyClass The {@code Writable} to use for the key
+ * @param valueClass The {@code Writable} to use for the value
+ * @return A new {@code TableSource<K, V>} instance
+ */
+ public static <K extends Writable, V extends Writable> TableSource<K, V> formattedFile(
+ Path path, Class<? extends FileInputFormat<K, V>> formatClass,
+ Class<K> keyClass, Class<V> valueClass) {
+ return formattedFile(path, formatClass, Writables.writables(keyClass),
+ Writables.writables(valueClass));
+ }
+
+ /**
+ * Creates a {@code TableSource<K, V>} for reading data from files that have custom
+ * {@code FileInputFormat} implementations not covered by the provided {@code TableSource}
+ * and {@code Source} factory methods.
+ *
+ * @param pathName The name of the path to the data on the filesystem
+ * @param formatClass The {@code FileInputFormat} implementation
+ * @param keyType The {@code PType} to use for the key
+ * @param valueType The {@code PType} to use for the value
+ * @return A new {@code TableSource<K, V>} instance
+ */
+ public static <K, V> TableSource<K, V> formattedFile(String pathName,
+ Class<? extends FileInputFormat<?, ?>> formatClass,
+ PType<K> keyType, PType<V> valueType) {
+ return formattedFile(new Path(pathName), formatClass, keyType, valueType);
+ }
+
+ /**
+ * Creates a {@code TableSource<K, V>} for reading data from files that have custom
+ * {@code FileInputFormat} implementations not covered by the provided {@code TableSource}
+ * and {@code Source} factory methods.
+ *
+ * @param path The {@code Path} to the data
+ * @param formatClass The {@code FileInputFormat} implementation
+ * @param keyType The {@code PType} to use for the key
+ * @param valueType The {@code PType} to use for the value
+ * @return A new {@code TableSource<K, V>} instance
+ */
+ public static <K, V> TableSource<K, V> formattedFile(Path path,
+ Class<? extends FileInputFormat<?, ?>> formatClass,
+ PType<K> keyType, PType<V> valueType) {
+ PTableType<K, V> tableType = keyType.getFamily().tableOf(keyType, valueType);
+ return new FileTableSourceImpl<K, V>(path, tableType, formatClass);
+ }
+
+ /**
+ * Creates a {@code Source<T>} instance from the Avro file(s) at the given path name.
+ *
+ * @param pathName The name of the path to the data on the filesystem
+ * @param avroClass The subclass of {@code SpecificRecord} to use for the Avro file
+ * @return A new {@code Source<T>} instance
+ */
+ public static <T extends SpecificRecord> Source<T> avroFile(String pathName, Class<T> avroClass) {
+ return avroFile(new Path(pathName), avroClass);
+ }
+
+ /**
+ * Creates a {@code Source<T>} instance from the Avro file(s) at the given {@code Path}.
+ *
+ * @param path The {@code Path} to the data
+ * @param avroClass The subclass of {@code SpecificRecord} to use for the Avro file
+ * @return A new {@code Source<T>} instance
+ */
+ public static <T extends SpecificRecord> Source<T> avroFile(Path path, Class<T> avroClass) {
+ return avroFile(path, Avros.specifics(avroClass));
+ }
+
+ /**
+ * Creates a {@code Source<T>} instance from the Avro file(s) at the given path name.
+ *
+ * @param pathName The name of the path to the data on the filesystem
+ * @param avroType The {@code AvroType} for the Avro records
+ * @return A new {@code Source<T>} instance
+ */
+ public static <T> Source<T> avroFile(String pathName, AvroType<T> avroType) {
+ return avroFile(new Path(pathName), avroType);
+ }
+
+ /**
+ * Creates a {@code Source<T>} instance from the Avro file(s) at the given {@code Path}.
+ *
+ * @param path The {@code Path} to the data
+ * @param avroType The {@code AvroType} for the Avro records
+ * @return A new {@code Source<T>} instance
+ */
+ public static <T> Source<T> avroFile(Path path, AvroType<T> avroType) {
+ return new AvroFileSource<T>(path, avroType);
+ }
+
+ /**
+ * Creates a {@code Source<T>} instance from the SequenceFile(s) at the given path name
+ * from the value field of each key-value pair in the SequenceFile(s).
+ *
+ * @param pathName The name of the path to the data on the filesystem
+ * @param valueClass The {@code Writable} type for the value of the SequenceFile entry
+ * @return A new {@code Source<T>} instance
+ */
+ public static <T extends Writable> Source<T> sequenceFile(String pathName, Class<T> valueClass) {
+ return sequenceFile(new Path(pathName), valueClass);
+ }
+
+ /**
+ * Creates a {@code Source<T>} instance from the SequenceFile(s) at the given {@code Path}
+ * from the value field of each key-value pair in the SequenceFile(s).
+ *
+ * @param path The {@code Path} to the data
+ * @param valueClass The {@code Writable} type for the value of the SequenceFile entry
+ * @return A new {@code Source<T>} instance
+ */
+ public static <T extends Writable> Source<T> sequenceFile(Path path, Class<T> valueClass) {
+ return sequenceFile(path, Writables.writables(valueClass));
+ }
+
+ /**
+ * Creates a {@code Source<T>} instance from the SequenceFile(s) at the given path name
+ * from the value field of each key-value pair in the SequenceFile(s).
+ *
+ * @param pathName The name of the path to the data on the filesystem
+ * @param ptype The {@code PType} for the value of the SequenceFile entry
+ * @return A new {@code Source<T>} instance
+ */
+ public static <T> Source<T> sequenceFile(String pathName, PType<T> ptype) {
+ return sequenceFile(new Path(pathName), ptype);
+ }
+
+ /**
+ * Creates a {@code Source<T>} instance from the SequenceFile(s) at the given {@code Path}
+ * from the value field of each key-value pair in the SequenceFile(s).
+ *
+ * @param path The {@code Path} to the data
+ * @param ptype The {@code PType} for the value of the SequenceFile entry
+ * @return A new {@code Source<T>} instance
+ */
+ public static <T> Source<T> sequenceFile(Path path, PType<T> ptype) {
+ return new SeqFileSource<T>(path, ptype);
+ }
+
+ /**
+ * Creates a {@code TableSource<K, V>} instance for the SequenceFile(s) at the given path name.
+ *
+ * @param pathName The name of the path to the data on the filesystem
+ * @param keyClass The {@code Writable} subclass for the key of the SequenceFile entry
+ * @param valueClass The {@code Writable} subclass for the value of the SequenceFile entry
+ * @return A new {@code TableSource<K, V>} instance
+ */
+ public static <K extends Writable, V extends Writable> TableSource<K, V> sequenceFile(
+ String pathName, Class<K> keyClass, Class<V> valueClass) {
+ return sequenceFile(new Path(pathName), keyClass, valueClass);
+ }
+
+ /**
+ * Creates a {@code TableSource<K, V>} instance for the SequenceFile(s) at the given {@code Path}.
+ *
+ * @param path The {@code Path} to the data
+ * @param keyClass The {@code Writable} subclass for the key of the SequenceFile entry
+ * @param valueClass The {@code Writable} subclass for the value of the SequenceFile entry
+ * @return A new {@code TableSource<K, V>} instance
+ */
+ public static <K extends Writable, V extends Writable> TableSource<K, V> sequenceFile(
+ Path path, Class<K> keyClass, Class<V> valueClass) {
+ return sequenceFile(path, Writables.writables(keyClass), Writables.writables(valueClass));
+ }
+
+ /**
+ * Creates a {@code TableSource<K, V>} instance for the SequenceFile(s) at the given path name.
+ *
+ * @param pathName The name of the path to the data on the filesystem
+ * @param keyType The {@code PType} for the key of the SequenceFile entry
+ * @param valueType The {@code PType} for the value of the SequenceFile entry
+ * @return A new {@code TableSource<K, V>} instance
+ */
+ public static <K, V> TableSource<K, V> sequenceFile(String pathName, PType<K> keyType, PType<V> valueType) {
+ return sequenceFile(new Path(pathName), keyType, valueType);
+ }
+
+ /**
+ * Creates a {@code TableSource<K, V>} instance for the SequenceFile(s) at the given {@code Path}.
+ *
+ * @param path The {@code Path} to the data
+ * @param keyType The {@code PType} for the key of the SequenceFile entry
+ * @param valueType The {@code PType} for the value of the SequenceFile entry
+ * @return A new {@code TableSource<K, V>} instance
+ */
+ public static <K, V> TableSource<K, V> sequenceFile(Path path, PType<K> keyType, PType<V> valueType) {
+ PTypeFamily ptf = keyType.getFamily();
+ return new SeqFileTableSource<K, V>(path, ptf.tableOf(keyType, valueType));
+ }
+
+ /**
+ * Creates a {@code Source<String>} instance for the text file(s) at the given path name.
+ *
+ * @param pathName The name of the path to the data on the filesystem
+ * @return A new {@code Source<String>} instance
+ */
+ public static Source<String> textFile(String pathName) {
+ return textFile(new Path(pathName));
+ }
+
+ /**
+ * Creates a {@code Source<String>} instance for the text file(s) at the given {@code Path}.
+ *
+ * @param path The {@code Path} to the data
+ * @return A new {@code Source<String>} instance
+ */
+ public static Source<String> textFile(Path path) {
+ return textFile(path, Writables.strings());
+ }
+
+ /**
+ * Creates a {@code Source<T>} instance for the text file(s) at the given path name using
+ * the provided {@code PType<T>} to convert the input text.
+ *
+ * @param pathName The name of the path to the data on the filesystem
+ * @param ptype The {@code PType<T>} to use to process the input text
+ * @return A new {@code Source<T>} instance
+ */
+ public static <T> Source<T> textFile(String pathName, PType<T> ptype) {
+ return textFile(new Path(pathName), ptype);
+ }
+
+ /**
+ * Creates a {@code Source<T>} instance for the text file(s) at the given {@code Path} using
+ * the provided {@code PType<T>} to convert the input text.
+ *
+ * @param path The {@code Path} to the data
+ * @param ptype The {@code PType<T>} to use to process the input text
+ * @return A new {@code Source<T>} instance
+ */
+ public static <T> Source<T> textFile(Path path, PType<T> ptype) {
+ return new TextFileSource<T>(path, ptype);
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/MapReduceTarget.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/MapReduceTarget.java b/crunch-core/src/main/java/org/apache/crunch/io/MapReduceTarget.java
new file mode 100644
index 0000000..b484103
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/MapReduceTarget.java
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io;
+
+import org.apache.crunch.Target;
+import org.apache.crunch.types.PType;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.Job;
+
+public interface MapReduceTarget extends Target {
+ void configureForMapReduce(Job job, PType<?> ptype, Path outputPath, String name);
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/OutputHandler.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/OutputHandler.java b/crunch-core/src/main/java/org/apache/crunch/io/OutputHandler.java
new file mode 100644
index 0000000..01d7f99
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/OutputHandler.java
@@ -0,0 +1,25 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io;
+
+import org.apache.crunch.Target;
+import org.apache.crunch.types.PType;
+
+/**
+ * A handler that is given a {@code Target} together with the {@code PType} of
+ * the data that is to be written to it.
+ */
+public interface OutputHandler {
+ /**
+ * Configures this handler for the given target and type.
+ *
+ * @param target the target to configure
+ * @param ptype the {@code PType} of the data being written to the target
+ * @return NOTE(review): presumably {@code true} when the target could be
+ * handled -- the meaning of the result is not visible here, confirm
+ * against the implementations
+ */
+ boolean configure(Target target, PType<?> ptype);
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/PathTarget.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/PathTarget.java b/crunch-core/src/main/java/org/apache/crunch/io/PathTarget.java
new file mode 100644
index 0000000..7a35209
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/PathTarget.java
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io;
+
+import org.apache.hadoop.fs.Path;
+
+/**
+ * A target whose output goes to a given path on a file system.
+ */
+public interface PathTarget extends MapReduceTarget {
+
+ /**
+ * Get the path that the output of this target goes to.
+ *
+ * @return the path of this target
+ */
+ Path getPath();
+
+ /**
+ * Get the naming scheme to be used for outputs being written to an output
+ * path.
+ *
+ * @return the naming scheme to be used
+ */
+ FileNamingScheme getFileNamingScheme();
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/PathTargetImpl.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/PathTargetImpl.java b/crunch-core/src/main/java/org/apache/crunch/io/PathTargetImpl.java
new file mode 100644
index 0000000..0be3f9a
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/PathTargetImpl.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io;
+
+import org.apache.crunch.types.PType;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.OutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+
+/**
+ * Skeleton {@link PathTarget} that remembers a target path together with the
+ * output format, key and value classes to install on a MapReduce job.
+ */
+public abstract class PathTargetImpl implements PathTarget {
+
+  private final Path path;
+  private final Class<OutputFormat> outputFormatClass;
+  private final Class keyClass;
+  private final Class valueClass;
+
+  /**
+   * Convenience constructor that wraps the given string in a {@code Path}.
+   */
+  public PathTargetImpl(String path, Class<OutputFormat> outputFormatClass, Class keyClass, Class valueClass) {
+    this(new Path(path), outputFormatClass, keyClass, valueClass);
+  }
+
+  /**
+   * @param path the path this target writes to
+   * @param outputFormatClass the output format class to set on the job
+   * @param keyClass the output key class to set on the job
+   * @param valueClass the output value class to set on the job
+   */
+  public PathTargetImpl(Path path, Class<OutputFormat> outputFormatClass, Class keyClass, Class valueClass) {
+    this.path = path;
+    this.outputFormatClass = outputFormatClass;
+    this.keyClass = keyClass;
+    this.valueClass = valueClass;
+  }
+
+  /**
+   * Sets this target's path as the job output path and then registers the
+   * output format, key and value classes: directly on the job when
+   * {@code name} is {@code null}, as a named output otherwise.
+   */
+  @Override
+  public void configureForMapReduce(Job job, PType<?> ptype, Path outputPath, String name) {
+    // NOTE(review): the outputPath argument is not consulted here; the path
+    // given at construction time is used instead -- confirm this is intended.
+    try {
+      FileOutputFormat.setOutputPath(job, path);
+    } catch (Exception cause) {
+      throw new RuntimeException(cause);
+    }
+
+    if (name != null) {
+      CrunchOutputs.addNamedOutput(job, name, outputFormatClass, keyClass, valueClass);
+      return;
+    }
+
+    job.setOutputFormatClass(outputFormatClass);
+    job.setOutputKeyClass(keyClass);
+    job.setOutputValueClass(valueClass);
+  }
+
+  @Override
+  public Path getPath() {
+    return this.path;
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/ReadableSource.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/ReadableSource.java b/crunch-core/src/main/java/org/apache/crunch/io/ReadableSource.java
new file mode 100644
index 0000000..0407167
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/ReadableSource.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io;
+
+import java.io.IOException;
+
+import org.apache.crunch.Source;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * An extension of the {@code Source} interface that indicates that a
+ * {@code Source} instance may be read as a series of records by the client
+ * code. This is used to determine whether a {@code PCollection} instance can be
+ * materialized.
+ *
+ * @param <T> The type of the records that are read
+ */
+public interface ReadableSource<T> extends Source<T> {
+
+ /**
+ * Returns an {@code Iterable} that contains the contents of this source.
+ *
+ * @param conf The current {@code Configuration} instance
+ * @return the contents of this {@code Source} as an {@code Iterable} instance
+ * @throws IOException if the contents of this source cannot be read
+ */
+ Iterable<T> read(Configuration conf) throws IOException;
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/ReadableSourceTarget.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/ReadableSourceTarget.java b/crunch-core/src/main/java/org/apache/crunch/io/ReadableSourceTarget.java
new file mode 100644
index 0000000..95c90aa
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/ReadableSourceTarget.java
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io;
+
+import org.apache.crunch.SourceTarget;
+
+/**
+ * An interface that indicates that a {@code SourceTarget} instance can be read
+ * into the local client. It combines {@code ReadableSource} and
+ * {@code SourceTarget} and declares no methods of its own.
+ *
+ * @param <T>
+ * The type of data read.
+ */
+public interface ReadableSourceTarget<T> extends ReadableSource<T>, SourceTarget<T> {
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/SequentialFileNamingScheme.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/SequentialFileNamingScheme.java b/crunch-core/src/main/java/org/apache/crunch/io/SequentialFileNamingScheme.java
new file mode 100644
index 0000000..bdda8e6
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/SequentialFileNamingScheme.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * Default {@link FileNamingScheme} that uses an incrementing sequence number in
+ * order to generate unique file names.
+ */
+public class SequentialFileNamingScheme implements FileNamingScheme {
+
+  /**
+   * Returns a name of the form {@code part-m-NNNNN}, where {@code NNNNN} is the
+   * number of entries currently listed in the output directory.
+   */
+  @Override
+  public String getMapOutputName(Configuration configuration, Path outputDirectory) throws IOException {
+    return getSequentialFileName(configuration, outputDirectory, "m");
+  }
+
+  /**
+   * Returns a name of the form {@code part-r-NNNNN}, where {@code NNNNN} is the
+   * number of entries currently listed in the output directory. The partition
+   * id does not take part in the generated name.
+   */
+  @Override
+  public String getReduceOutputName(Configuration configuration, Path outputDirectory, int partitionId)
+      throws IOException {
+    return getSequentialFileName(configuration, outputDirectory, "r");
+  }
+
+  /**
+   * Builds {@code part-<jobTypeName>-<sequence>} with the sequence number
+   * zero-padded to five digits.
+   *
+   * @throws IOException if the file system cannot be obtained or listed
+   */
+  private String getSequentialFileName(Configuration configuration, Path outputDirectory, String jobTypeName)
+      throws IOException {
+    FileSystem fileSystem = outputDirectory.getFileSystem(configuration);
+    // Some FileSystem implementations report a missing directory by returning
+    // null instead of throwing. Treat that as "no files yet" so the first file
+    // gets sequence number 0 instead of failing with a NullPointerException.
+    org.apache.hadoop.fs.FileStatus[] existing = fileSystem.listStatus(outputDirectory);
+    int fileSequenceNumber = (existing == null) ? 0 : existing.length;
+
+    return String.format("part-%s-%05d", jobTypeName, fileSequenceNumber);
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/SourceTargetHelper.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/SourceTargetHelper.java b/crunch-core/src/main/java/org/apache/crunch/io/SourceTargetHelper.java
new file mode 100644
index 0000000..f4400de
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/SourceTargetHelper.java
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * Functions for configuring the inputs/outputs of MapReduce jobs.
+ *
+ */
+public class SourceTargetHelper {
+
+ public static long getPathSize(Configuration conf, Path path) throws IOException {
+ return getPathSize(path.getFileSystem(conf), path);
+ }
+
+ public static long getPathSize(FileSystem fs, Path path) throws IOException {
+ FileStatus[] stati = fs.globStatus(path);
+ if (stati == null || stati.length == 0) {
+ return -1L;
+ }
+ long size = 0;
+ for (FileStatus status : stati) {
+ size += fs.getContentSummary(status.getPath()).getLength();
+ }
+ return size;
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/To.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/To.java b/crunch-core/src/main/java/org/apache/crunch/io/To.java
new file mode 100644
index 0000000..d62d294
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/To.java
@@ -0,0 +1,153 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io;
+
+import org.apache.crunch.Target;
+import org.apache.crunch.io.avro.AvroFileTarget;
+import org.apache.crunch.io.impl.FileTargetImpl;
+import org.apache.crunch.io.seq.SeqFileTarget;
+import org.apache.crunch.io.text.TextFileTarget;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+
+/**
+ * <p>Static factory methods for creating common {@link Target} types.</p>
+ *
+ * <p>The {@code To} class is intended to be used as part of a literate API
+ * for writing the output of Crunch pipelines to common file types. We can use
+ * the {@code Target} objects created by the factory methods in the {@code To}
+ * class with either the {@code write} method on the {@code Pipeline} class or
+ * the convenience {@code write} method on {@code PCollection} and {@code PTable}
+ * instances.</p>
+ *
+ * <pre>{@code
+ * Pipeline pipeline = new MRPipeline(this.getClass());
+ * ...
+ * // Write a PCollection<String> to a text file:
+ * PCollection<String> words = ...;
+ * pipeline.write(words, To.textFile("/put/my/words/here"));
+ *
+ * // Write a PTable<Text, Text> to a sequence file:
+ * PTable<Text, Text> textToText = ...;
+ * textToText.write(To.sequenceFile("/words/to/words"));
+ *
+ * // Write a PCollection<MyAvroObject> to an Avro data file:
+ * PCollection<MyAvroObject> objects = ...;
+ * objects.write(To.avroFile("/my/avro/files"));
+ *
+ * // Write a PTable to a custom FileOutputFormat:
+ * PTable<KeyWritable, ValueWritable> custom = ...;
+ * pipeline.write(custom, To.formattedFile("/custom", MyFileFormat.class));
+ * }</pre>
+ */
+public class To {
+
+ /**
+ * Creates a {@code Target} at the given path name that writes data to
+ * a custom {@code FileOutputFormat}.
+ *
+ * @param pathName The name of the path to write the data to on the filesystem
+ * @param formatClass The {@code FileOutputFormat<K, V>} to write the data to
+ * @return A new {@code Target} instance
+ */
+ public static <K extends Writable, V extends Writable> Target formattedFile(
+ String pathName, Class<? extends FileOutputFormat<K, V>> formatClass) {
+ return formattedFile(new Path(pathName), formatClass);
+ }
+
+ /**
+ * Creates a {@code Target} at the given {@code Path} that writes data to
+ * a custom {@code FileOutputFormat}.
+ *
+ * @param path The {@code Path} to write the data to
+ * @param formatClass The {@code FileOutputFormat} to write the data to
+ * @return A new {@code Target} instance
+ */
+ public static <K extends Writable, V extends Writable> Target formattedFile(
+ Path path, Class<? extends FileOutputFormat<K, V>> formatClass) {
+ return new FileTargetImpl(path, formatClass, new SequentialFileNamingScheme());
+ }
+
+ /**
+ * Creates a {@code Target} at the given path name that writes data to
+ * Avro files. The {@code PType} for the written data must be for Avro records.
+ *
+ * @param pathName The name of the path to write the data to on the filesystem
+ * @return A new {@code Target} instance
+ */
+ public static Target avroFile(String pathName) {
+ return avroFile(new Path(pathName));
+ }
+
+ /**
+ * Creates a {@code Target} at the given {@code Path} that writes data to
+ * Avro files. The {@code PType} for the written data must be for Avro records.
+ *
+ * @param path The {@code Path} to write the data to
+ * @return A new {@code Target} instance
+ */
+ public static Target avroFile(Path path) {
+ return new AvroFileTarget(path);
+ }
+
+ /**
+ * Creates a {@code Target} at the given path name that writes data to
+ * SequenceFiles.
+ *
+ * @param pathName The name of the path to write the data to on the filesystem
+ * @return A new {@code Target} instance
+ */
+ public static Target sequenceFile(String pathName) {
+ return sequenceFile(new Path(pathName));
+ }
+
+ /**
+ * Creates a {@code Target} at the given {@code Path} that writes data to
+ * SequenceFiles.
+ *
+ * @param path The {@code Path} to write the data to
+ * @return A new {@code Target} instance
+ */
+ public static Target sequenceFile(Path path) {
+ return new SeqFileTarget(path);
+ }
+
+ /**
+ * Creates a {@code Target} at the given path name that writes data to
+ * text files.
+ *
+ * @param pathName The name of the path to write the data to on the filesystem
+ * @return A new {@code Target} instance
+ */
+ public static Target textFile(String pathName) {
+ return textFile(new Path(pathName));
+ }
+
+ /**
+ * Creates a {@code Target} at the given {@code Path} that writes data to
+ * text files.
+ *
+ * @param path The {@code Path} to write the data to
+ * @return A new {@code Target} instance
+ */
+ public static Target textFile(Path path) {
+ return new TextFileTarget(path);
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/avro/AvroFileReaderFactory.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/avro/AvroFileReaderFactory.java b/crunch-core/src/main/java/org/apache/crunch/io/avro/AvroFileReaderFactory.java
new file mode 100644
index 0000000..c8fe23a
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/avro/AvroFileReaderFactory.java
@@ -0,0 +1,96 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.avro;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.avro.Schema;
+import org.apache.avro.file.DataFileReader;
+import org.apache.avro.generic.GenericDatumReader;
+import org.apache.avro.io.DatumReader;
+import org.apache.avro.mapred.FsInput;
+import org.apache.avro.reflect.ReflectDatumReader;
+import org.apache.avro.specific.SpecificDatumReader;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.fn.IdentityFn;
+import org.apache.crunch.io.FileReaderFactory;
+import org.apache.crunch.io.impl.AutoClosingIterator;
+import org.apache.crunch.types.avro.AvroType;
+import org.apache.crunch.types.avro.Avros;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+import com.google.common.collect.Iterators;
+import com.google.common.collect.UnmodifiableIterator;
+
+public class AvroFileReaderFactory<T> implements FileReaderFactory<T> {
+
+ private static final Log LOG = LogFactory.getLog(AvroFileReaderFactory.class);
+
+ private final DatumReader<T> recordReader;
+ private final MapFn<T, T> mapFn;
+
+ public AvroFileReaderFactory(AvroType<T> atype) {
+ this.recordReader = createDatumReader(atype);
+ this.mapFn = (MapFn<T, T>) atype.getInputMapFn();
+ }
+
+ public AvroFileReaderFactory(Schema schema) {
+ this.recordReader = new GenericDatumReader<T>(schema);
+ this.mapFn = IdentityFn.<T>getInstance();
+ }
+
+ static <T> DatumReader<T> createDatumReader(AvroType<T> avroType) {
+ if (avroType.hasReflect()) {
+ if (avroType.hasSpecific()) {
+ Avros.checkCombiningSpecificAndReflectionSchemas();
+ }
+ return new ReflectDatumReader<T>(avroType.getSchema());
+ } else if (avroType.hasSpecific()) {
+ return new SpecificDatumReader<T>(avroType.getSchema());
+ } else {
+ return new GenericDatumReader<T>(avroType.getSchema());
+ }
+ }
+
+ @Override
+ public Iterator<T> read(FileSystem fs, final Path path) {
+ this.mapFn.initialize();
+ try {
+ FsInput fsi = new FsInput(path, fs.getConf());
+ final DataFileReader<T> reader = new DataFileReader<T>(fsi, recordReader);
+ return new AutoClosingIterator<T>(reader, new UnmodifiableIterator<T>() {
+ @Override
+ public boolean hasNext() {
+ return reader.hasNext();
+ }
+
+ @Override
+ public T next() {
+ return mapFn.map(reader.next());
+ }
+ });
+ } catch (IOException e) {
+ LOG.info("Could not read avro file at path: " + path, e);
+ return Iterators.emptyIterator();
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/avro/AvroFileSource.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/avro/AvroFileSource.java b/crunch-core/src/main/java/org/apache/crunch/io/avro/AvroFileSource.java
new file mode 100644
index 0000000..15792bf
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/avro/AvroFileSource.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.avro;
+
+import java.io.IOException;
+
+import org.apache.avro.mapred.AvroJob;
+import org.apache.crunch.io.CompositePathIterable;
+import org.apache.crunch.io.FormatBundle;
+import org.apache.crunch.io.ReadableSource;
+import org.apache.crunch.io.impl.FileSourceImpl;
+import org.apache.crunch.types.avro.AvroInputFormat;
+import org.apache.crunch.types.avro.AvroType;
+import org.apache.crunch.types.avro.Avros;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * A file-based source of Avro data that can also be read directly by client
+ * code through {@link #read(Configuration)}.
+ */
+public class AvroFileSource<T> extends FileSourceImpl<T> implements ReadableSource<T> {
+
+  // Builds the input format bundle: the reflect flag, the schema as a string
+  // and the class name of the reflect data factory.
+  private static <S> FormatBundle getBundle(AvroType<S> ptype) {
+    return FormatBundle.forInput(AvroInputFormat.class)
+        .set(AvroJob.INPUT_IS_REFLECT, String.valueOf(ptype.hasReflect()))
+        .set(AvroJob.INPUT_SCHEMA, ptype.getSchema().toString())
+        .set(Avros.REFLECT_DATA_FACTORY_CLASS, Avros.REFLECT_DATA_FACTORY.getClass().getName());
+  }
+
+  public AvroFileSource(Path path, AvroType<T> ptype) {
+    super(path, ptype, getBundle(ptype));
+  }
+
+  @Override
+  public String toString() {
+    return new StringBuilder("Avro(").append(path.toString()).append(")").toString();
+  }
+
+  /**
+   * Returns the records found under this source's path, read with an
+   * {@link AvroFileReaderFactory} built from this source's type.
+   */
+  @Override
+  public Iterable<T> read(Configuration conf) throws IOException {
+    FileSystem fileSystem = path.getFileSystem(conf);
+    AvroType<T> avroType = (AvroType<T>) ptype;
+    AvroFileReaderFactory<T> readerFactory = new AvroFileReaderFactory<T>(avroType);
+    return CompositePathIterable.create(fileSystem, path, readerFactory);
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/avro/AvroFileSourceTarget.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/avro/AvroFileSourceTarget.java b/crunch-core/src/main/java/org/apache/crunch/io/avro/AvroFileSourceTarget.java
new file mode 100644
index 0000000..76103e5
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/avro/AvroFileSourceTarget.java
@@ -0,0 +1,39 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.avro;
+
+import org.apache.crunch.io.FileNamingScheme;
+import org.apache.crunch.io.SequentialFileNamingScheme;
+import org.apache.crunch.io.impl.ReadableSourcePathTargetImpl;
+import org.apache.crunch.types.avro.AvroType;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * Pairs an {@link AvroFileSource} and an {@link AvroFileTarget} that share the
+ * same path.
+ */
+public class AvroFileSourceTarget<T> extends ReadableSourcePathTargetImpl<T> {
+ /**
+ * Creates a source/target at the given path that uses a
+ * {@link SequentialFileNamingScheme}.
+ */
+ public AvroFileSourceTarget(Path path, AvroType<T> atype) {
+ this(path, atype, new SequentialFileNamingScheme());
+ }
+
+ /**
+ * Creates a source/target at the given path with the given naming scheme.
+ *
+ * @param path the path used for both the source and the target
+ * @param atype the Avro type of the data read by the source
+ * @param fileNamingScheme the naming scheme passed on to the superclass
+ */
+ public AvroFileSourceTarget(Path path, AvroType<T> atype, FileNamingScheme fileNamingScheme) {
+ super(new AvroFileSource<T>(path, atype), new AvroFileTarget(path), fileNamingScheme);
+ }
+
+ // Uses the string form of the target. NOTE(review): `target` is not declared
+ // in this class; presumably it is the inherited field holding the
+ // AvroFileTarget passed to the superclass -- confirm.
+ @Override
+ public String toString() {
+ return target.toString();
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/avro/AvroFileTarget.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/avro/AvroFileTarget.java b/crunch-core/src/main/java/org/apache/crunch/io/avro/AvroFileTarget.java
new file mode 100644
index 0000000..3a9e42c
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/avro/AvroFileTarget.java
@@ -0,0 +1,91 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.avro;
+
+import org.apache.avro.mapred.AvroWrapper;
+import org.apache.crunch.SourceTarget;
+import org.apache.crunch.io.FileNamingScheme;
+import org.apache.crunch.io.OutputHandler;
+import org.apache.crunch.io.SequentialFileNamingScheme;
+import org.apache.crunch.io.impl.FileTargetImpl;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.avro.AvroOutputFormat;
+import org.apache.crunch.types.avro.AvroType;
+import org.apache.crunch.types.avro.Avros;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapreduce.Job;
+
+public class AvroFileTarget extends FileTargetImpl {
+
+ public AvroFileTarget(String path) {
+ this(new Path(path));
+ }
+
+ public AvroFileTarget(Path path) {
+ this(path, new SequentialFileNamingScheme());
+ }
+
+ public AvroFileTarget(Path path, FileNamingScheme fileNamingScheme) {
+ super(path, AvroOutputFormat.class, fileNamingScheme);
+ }
+
+ @Override
+ public String toString() {
+ return "Avro(" + path.toString() + ")";
+ }
+
+ @Override
+ public boolean accept(OutputHandler handler, PType<?> ptype) {
+ if (!(ptype instanceof AvroType)) {
+ return false;
+ }
+ handler.configure(this, ptype);
+ return true;
+ }
+
+ @Override
+ public void configureForMapReduce(Job job, PType<?> ptype, Path outputPath, String name) {
+ AvroType<?> atype = (AvroType<?>) ptype;
+ Configuration conf = job.getConfiguration();
+ String schemaParam = null;
+ if (name == null) {
+ schemaParam = "avro.output.schema";
+ } else {
+ schemaParam = "avro.output.schema." + name;
+ }
+ String outputSchema = conf.get(schemaParam);
+ if (outputSchema == null) {
+ conf.set(schemaParam, atype.getSchema().toString());
+ } else if (!outputSchema.equals(atype.getSchema().toString())) {
+ throw new IllegalStateException("Avro targets must use the same output schema");
+ }
+ Avros.configureReflectDataFactory(conf);
+ configureForMapReduce(job, AvroWrapper.class, NullWritable.class, AvroOutputFormat.class,
+ outputPath, name);
+ }
+
+ @Override
+ public <T> SourceTarget<T> asSourceTarget(PType<T> ptype) {
+ if (ptype instanceof AvroType) {
+ return new AvroFileSourceTarget<T>(path, (AvroType<T>) ptype);
+ }
+ return null;
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/impl/AutoClosingIterator.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/impl/AutoClosingIterator.java b/crunch-core/src/main/java/org/apache/crunch/io/impl/AutoClosingIterator.java
new file mode 100644
index 0000000..3bd802e
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/impl/AutoClosingIterator.java
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.impl;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.Iterator;
+
+import com.google.common.collect.UnmodifiableIterator;
+import com.google.common.io.Closeables;
+
+/**
+ * Closes the wrapped {@code Closeable} when {@link #hasNext()} returns false. As long a client loops through to
+ * completion (doesn't abort early due to an exception, short circuit, etc.) resources will be closed automatically.
+ */
+public class AutoClosingIterator<T> extends UnmodifiableIterator<T> implements Closeable {
+ private final Iterator<T> iter;
+ private Closeable closeable;
+
+ public AutoClosingIterator(Closeable closeable, Iterator<T> iter) {
+ this.closeable = closeable;
+ this.iter = iter;
+ }
+
+ @Override
+ public boolean hasNext() {
+ if (!iter.hasNext()) {
+ Closeables.closeQuietly(this);
+ return false;
+ } else {
+ return true;
+ }
+ }
+
+ @Override
+ public T next() {
+ return iter.next();
+ }
+
+ @Override
+ public void close() throws IOException {
+ if (closeable != null) {
+ closeable.close();
+ closeable = null;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/impl/FileSourceImpl.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/impl/FileSourceImpl.java b/crunch-core/src/main/java/org/apache/crunch/io/impl/FileSourceImpl.java
new file mode 100644
index 0000000..688c801
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/impl/FileSourceImpl.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.impl;
+
+import java.io.IOException;
+
+import org.apache.commons.lang.builder.HashCodeBuilder;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.crunch.Source;
+import org.apache.crunch.io.CrunchInputs;
+import org.apache.crunch.io.FormatBundle;
+import org.apache.crunch.io.SourceTargetHelper;
+import org.apache.crunch.types.PType;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+
+public class FileSourceImpl<T> implements Source<T> {
+
+ private static final Log LOG = LogFactory.getLog(FileSourceImpl.class);
+
+ protected final Path path;
+ protected final PType<T> ptype;
+ protected final FormatBundle<? extends InputFormat> inputBundle;
+
+ public FileSourceImpl(Path path, PType<T> ptype, Class<? extends InputFormat> inputFormatClass) {
+ this.path = path;
+ this.ptype = ptype;
+ this.inputBundle = FormatBundle.forInput(inputFormatClass);
+ }
+
+ public FileSourceImpl(Path path, PType<T> ptype, FormatBundle<? extends InputFormat> inputBundle) {
+ this.path = path;
+ this.ptype = ptype;
+ this.inputBundle = inputBundle;
+ }
+
+ public Path getPath() {
+ return path;
+ }
+
+ @Override
+ public void configureSource(Job job, int inputId) throws IOException {
+ if (inputId == -1) {
+ FileInputFormat.addInputPath(job, path);
+ job.setInputFormatClass(inputBundle.getFormatClass());
+ inputBundle.configure(job.getConfiguration());
+ } else {
+ CrunchInputs.addInputPath(job, path, inputBundle, inputId);
+ }
+ }
+
+ @Override
+ public PType<T> getType() {
+ return ptype;
+ }
+
+ @Override
+ public long getSize(Configuration configuration) {
+ try {
+ return SourceTargetHelper.getPathSize(configuration, path);
+ } catch (IOException e) {
+ LOG.warn(String.format("Exception thrown looking up size of: %s", path), e);
+ throw new IllegalStateException("Failed to get the file size of:" + path, e);
+ }
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (other == null || !getClass().equals(other.getClass())) {
+ return false;
+ }
+ FileSourceImpl o = (FileSourceImpl) other;
+ return ptype.equals(o.ptype) && path.equals(o.path) && inputBundle.equals(o.inputBundle);
+ }
+
+ @Override
+ public int hashCode() {
+ return new HashCodeBuilder().append(ptype).append(path).append(inputBundle).toHashCode();
+ }
+
+ @Override
+ public String toString() {
+ return new StringBuilder().append(inputBundle.getName()).append("(").append(path).append(")").toString();
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/impl/FileTableSourceImpl.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/impl/FileTableSourceImpl.java b/crunch-core/src/main/java/org/apache/crunch/io/impl/FileTableSourceImpl.java
new file mode 100644
index 0000000..295edb5
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/impl/FileTableSourceImpl.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.impl;
+
+import org.apache.crunch.Pair;
+import org.apache.crunch.TableSource;
+import org.apache.crunch.io.FormatBundle;
+import org.apache.crunch.types.PTableType;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+
+public class FileTableSourceImpl<K, V> extends FileSourceImpl<Pair<K, V>> implements TableSource<K, V> {
+
+ public FileTableSourceImpl(Path path, PTableType<K, V> tableType, Class<? extends FileInputFormat> formatClass) {
+ super(path, tableType, formatClass); // the table type becomes the source's PType
+ }
+
+ public FileTableSourceImpl(Path path, PTableType<K, V> tableType, FormatBundle bundle) {
+ super(path, tableType, bundle); // same, with a caller-supplied format bundle
+ }
+
+ @Override
+ public PTableType<K, V> getTableType() {
+ return (PTableType<K, V>) getType(); // both constructors pass a PTableType as the source's type
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/impl/FileTargetImpl.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/impl/FileTargetImpl.java b/crunch-core/src/main/java/org/apache/crunch/io/impl/FileTargetImpl.java
new file mode 100644
index 0000000..c1c29e4
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/impl/FileTargetImpl.java
@@ -0,0 +1,162 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.impl;
+
+import java.io.IOException;
+
+import org.apache.commons.lang.builder.HashCodeBuilder;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.crunch.CrunchRuntimeException;
+import org.apache.crunch.SourceTarget;
+import org.apache.crunch.io.CrunchOutputs;
+import org.apache.crunch.io.FileNamingScheme;
+import org.apache.crunch.io.OutputHandler;
+import org.apache.crunch.io.PathTarget;
+import org.apache.crunch.types.Converter;
+import org.apache.crunch.types.PType;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+
+/** {@link PathTarget} that writes job output under a {@link Path} using a {@code FileOutputFormat} subclass. */
+public class FileTargetImpl implements PathTarget {
+ private static final Log LOG = LogFactory.getLog(FileTargetImpl.class);
+
+ protected final Path path;
+ private final Class<? extends FileOutputFormat> outputFormatClass;
+ private final FileNamingScheme fileNamingScheme;
+
+ public FileTargetImpl(Path path, Class<? extends FileOutputFormat> outputFormatClass,
+ FileNamingScheme fileNamingScheme) {
+ this.path = path;
+ this.outputFormatClass = outputFormatClass;
+ this.fileNamingScheme = fileNamingScheme;
+ }
+
+ @Override
+ public void configureForMapReduce(Job job, PType<?> ptype, Path outputPath, String name) {
+ Converter converter = ptype.getConverter(); // supplies the key/value classes to register on the job
+ configureForMapReduce(job, converter.getKeyClass(), converter.getValueClass(), outputFormatClass, outputPath, name);
+ }
+
+ protected void configureForMapReduce(Job job, Class keyClass, Class valueClass,
+ Class outputFormatClass, Path outputPath, String name) {
+ try {
+ FileOutputFormat.setOutputPath(job, outputPath);
+ } catch (Exception e) { // NOTE(review): presumably broad because only some Hadoop versions declare IOException
+ throw new RuntimeException(e);
+ }
+ if (name == null) { // no name: configure the job's own output rather than a named output
+ job.setOutputFormatClass(outputFormatClass);
+ job.setOutputKeyClass(keyClass);
+ job.setOutputValueClass(valueClass);
+ } else {
+ CrunchOutputs.addNamedOutput(job, name, outputFormatClass, keyClass, valueClass);
+ }
+ }
+
+ @Override
+ public boolean accept(OutputHandler handler, PType<?> ptype) {
+ handler.configure(this, ptype);
+ return true;
+ }
+
+ @Override
+ public Path getPath() {
+ return path;
+ }
+
+ @Override
+ public FileNamingScheme getFileNamingScheme() {
+ return fileNamingScheme;
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (other == null || !getClass().equals(other.getClass())) {
+ return false;
+ }
+ FileTargetImpl o = (FileTargetImpl) other;
+ return path.equals(o.path); // targets of the same class are equal when they share a path
+ }
+
+ @Override
+ public int hashCode() {
+ return new HashCodeBuilder().append(path).toHashCode();
+ }
+
+ @Override
+ public String toString() {
+ return outputFormatClass.getSimpleName() + "(" + path + ")";
+ }
+
+ @Override
+ public <T> SourceTarget<T> asSourceTarget(PType<T> ptype) {
+ // By default, assume that we cannot do this.
+ return null;
+ }
+
+ /**
+ * Checks whether {@code path} already exists and applies {@code strategy}: DEFAULT throws, OVERWRITE deletes
+ * the existing data, APPEND keeps it. The file system is resolved from the path, not the default file system.
+ */
+ @Override
+ public void handleExisting(WriteMode strategy, Configuration conf) {
+ FileSystem fs = null;
+ try {
+ fs = path.getFileSystem(conf); // FileSystem.get(conf) would ignore the path's own scheme/authority
+ } catch (IOException e) {
+ LOG.error("Could not retrieve FileSystem object to check for existing path", e);
+ throw new CrunchRuntimeException(e);
+ }
+ boolean exists = false;
+ try {
+ exists = fs.exists(path);
+ } catch (IOException e) {
+ LOG.error("Exception checking existence of path: " + path, e);
+ throw new CrunchRuntimeException(e);
+ }
+
+ if (exists) {
+ switch (strategy) {
+ case DEFAULT:
+ LOG.error("Path " + path + " already exists!");
+ throw new CrunchRuntimeException("Path already exists: " + path);
+ case OVERWRITE:
+ LOG.info("Removing data at existing path: " + path);
+ try {
+ fs.delete(path, true);
+ } catch (IOException e) { // best-effort: a failed delete is logged, not rethrown
+ LOG.error("Exception thrown removing data at path: " + path, e);
+ }
+ break;
+ case APPEND:
+ LOG.info("Adding output files to existing path: " + path);
+ break;
+ default:
+ throw new CrunchRuntimeException("Unknown WriteMode: " + strategy);
+ }
+ } else {
+ LOG.info("Will write output files to new path: " + path);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/impl/ReadableSourcePathTargetImpl.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/impl/ReadableSourcePathTargetImpl.java b/crunch-core/src/main/java/org/apache/crunch/io/impl/ReadableSourcePathTargetImpl.java
new file mode 100644
index 0000000..6506816
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/impl/ReadableSourcePathTargetImpl.java
@@ -0,0 +1,39 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.impl;
+
+import java.io.IOException;
+
+import org.apache.crunch.io.FileNamingScheme;
+import org.apache.crunch.io.PathTarget;
+import org.apache.crunch.io.ReadableSource;
+import org.apache.crunch.io.ReadableSourceTarget;
+import org.apache.hadoop.conf.Configuration;
+
+public class ReadableSourcePathTargetImpl<T> extends SourcePathTargetImpl<T> implements ReadableSourceTarget<T> {
+
+ public ReadableSourcePathTargetImpl(ReadableSource<T> source, PathTarget target, FileNamingScheme fileNamingScheme) {
+ super(source, target, fileNamingScheme); // read() relies on the source being a ReadableSource
+ }
+
+ @Override
+ public Iterable<T> read(Configuration conf) throws IOException {
+ return ((ReadableSource<T>) source).read(conf); // reading is delegated to the wrapped source
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/impl/ReadableSourceTargetImpl.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/impl/ReadableSourceTargetImpl.java b/crunch-core/src/main/java/org/apache/crunch/io/impl/ReadableSourceTargetImpl.java
new file mode 100644
index 0000000..f435b3b
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/impl/ReadableSourceTargetImpl.java
@@ -0,0 +1,37 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.impl;
+
+import java.io.IOException;
+
+import org.apache.crunch.Target;
+import org.apache.crunch.io.ReadableSource;
+import org.apache.crunch.io.ReadableSourceTarget;
+import org.apache.hadoop.conf.Configuration;
+
+public class ReadableSourceTargetImpl<T> extends SourceTargetImpl<T> implements ReadableSourceTarget<T> {
+
+ public ReadableSourceTargetImpl(ReadableSource<T> source, Target target) {
+ super(source, target); // read() relies on the source being a ReadableSource
+ }
+
+ @Override
+ public Iterable<T> read(Configuration conf) throws IOException {
+ return ((ReadableSource<T>) source).read(conf); // reading is delegated to the wrapped source
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/impl/SourcePathTargetImpl.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/impl/SourcePathTargetImpl.java b/crunch-core/src/main/java/org/apache/crunch/io/impl/SourcePathTargetImpl.java
new file mode 100644
index 0000000..c0d7ce0
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/impl/SourcePathTargetImpl.java
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.impl;
+
+import org.apache.crunch.Source;
+import org.apache.crunch.io.FileNamingScheme;
+import org.apache.crunch.io.PathTarget;
+import org.apache.crunch.types.PType;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.Job;
+
+public class SourcePathTargetImpl<T> extends SourceTargetImpl<T> implements PathTarget {
+
+ private final FileNamingScheme fileNamingScheme;
+
+ public SourcePathTargetImpl(Source<T> source, PathTarget target, FileNamingScheme fileNamingScheme) {
+ super(source, target); // only a PathTarget is accepted; the delegating methods below cast back to it
+ this.fileNamingScheme = fileNamingScheme;
+ }
+
+ @Override
+ public void configureForMapReduce(Job job, PType<?> ptype, Path outputPath, String name) {
+ ((PathTarget) target).configureForMapReduce(job, ptype, outputPath, name); // forwarded to the wrapped target
+ }
+
+ @Override
+ public Path getPath() {
+ return ((PathTarget) target).getPath(); // forwarded to the wrapped target
+ }
+
+ @Override
+ public FileNamingScheme getFileNamingScheme() {
+ return fileNamingScheme; // the scheme passed to this wrapper, not one read from the wrapped target
+ }
+}
[16/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/resources/shakes.txt
----------------------------------------------------------------------
diff --git a/crunch/src/it/resources/shakes.txt b/crunch/src/it/resources/shakes.txt
deleted file mode 100644
index 63acf18..0000000
--- a/crunch/src/it/resources/shakes.txt
+++ /dev/null
@@ -1,3667 +0,0 @@
-***The Project Gutenberg's Etext of Shakespeare's First Folio***
-********************The Tragedie of Macbeth*********************
-
-This is our 3rd edition of most of these plays. See the index.
-
-
-Copyright laws are changing all over the world, be sure to check
-the copyright laws for your country before posting these files!!
-
-Please take a look at the important information in this header.
-We encourage you to keep this file on your own disk, keeping an
-electronic path open for the next readers. Do not remove this.
-
-
-**Welcome To The World of Free Plain Vanilla Electronic Texts**
-
-**Etexts Readable By Both Humans and By Computers, Since 1971**
-
-*These Etexts Prepared By Hundreds of Volunteers and Donations*
-
-Information on contacting Project Gutenberg to get Etexts, and
-further information is included below. We need your donations.
-
-
-The Tragedie of Macbeth
-
-by William Shakespeare
-
-July, 2000 [Etext #2264]
-
-
-***The Project Gutenberg's Etext of Shakespeare's First Folio***
-********************The Tragedie of Macbeth*********************
-
-*****This file should be named 0ws3410.txt or 0ws3410.zip******
-
-Corrected EDITIONS of our etexts get a new NUMBER, 0ws3411.txt
-VERSIONS based on separate sources get new LETTER, 0ws3410a.txt
-
-
-Project Gutenberg Etexts are usually created from multiple editions,
-all of which are in the Public Domain in the United States, unless a
-copyright notice is included. Therefore, we usually do NOT keep any
-of these books in compliance with any particular paper edition.
-
-
-We are now trying to release all our books one month in advance
-of the official release dates, leaving time for better editing.
-
-Please note: neither this list nor its contents are final till
-midnight of the last day of the month of any such announcement.
-The official release date of all Project Gutenberg Etexts is at
-Midnight, Central Time, of the last day of the stated month. A
-preliminary version may often be posted for suggestion, comment
-and editing by those who wish to do so. To be sure you have an
-up to date first edition [xxxxx10x.xxx] please check file sizes
-in the first week of the next month. Since our ftp program has
-a bug in it that scrambles the date [tried to fix and failed] a
-look at the file size will have to do, but we will try to see a
-new copy has at least one byte more or less.
-
-
-Information about Project Gutenberg (one page)
-
-We produce about two million dollars for each hour we work. The
-time it takes us, a rather conservative estimate, is fifty hours
-to get any etext selected, entered, proofread, edited, copyright
-searched and analyzed, the copyright letters written, etc. This
-projected audience is one hundred million readers. If our value
-per text is nominally estimated at one dollar then we produce $2
-million dollars per hour this year as we release thirty-six text
-files per month, or 432 more Etexts in 1999 for a total of 2000+
-If these reach just 10% of the computerized population, then the
-total should reach over 200 billion Etexts given away this year.
-
-The Goal of Project Gutenberg is to Give Away One Trillion Etext
-Files by December 31, 2001. [10,000 x 100,000,000 = 1 Trillion]
-This is ten thousand titles each to one hundred million readers,
-which is only ~5% of the present number of computer users.
-
-At our revised rates of production, we will reach only one-third
-of that goal by the end of 2001, or about 3,333 Etexts unless we
-manage to get some real funding; currently our funding is mostly
-from Michael Hart's salary at Carnegie-Mellon University, and an
-assortment of sporadic gifts; this salary is only good for a few
-more years, so we are looking for something to replace it, as we
-don't want Project Gutenberg to be so dependent on one person.
-
-We need your donations more than ever!
-
-
-All donations should be made to "Project Gutenberg/CMU": and are
-tax deductible to the extent allowable by law. (CMU = Carnegie-
-Mellon University).
-
-For these and other matters, please mail to:
-
-Project Gutenberg
-P. O. Box 2782
-Champaign, IL 61825
-
-When all other email fails. . .try our Executive Director:
-Michael S. Hart <ha...@pobox.com>
-hart@pobox.com forwards to hart@prairienet.org and archive.org
-if your mail bounces from archive.org, I will still see it, if
-it bounces from prairienet.org, better resend later on. . . .
-
-We would prefer to send you this information by email.
-
-******
-
-To access Project Gutenberg etexts, use any Web browser
-to view http://promo.net/pg. This site lists Etexts by
-author and by title, and includes information about how
-to get involved with Project Gutenberg. You could also
-download our past Newsletters, or subscribe here. This
-is one of our major sites, please email hart@pobox.com,
-for a more complete list of our various sites.
-
-To go directly to the etext collections, use FTP or any
-Web browser to visit a Project Gutenberg mirror (mirror
-sites are available on 7 continents; mirrors are listed
-at http://promo.net/pg).
-
-Mac users, do NOT point and click, typing works better.
-
-Example FTP session:
-
-ftp sunsite.unc.edu
-login: anonymous
-password: your@login
-cd pub/docs/books/gutenberg
-cd etext90 through etext99
-dir [to see files]
-get or mget [to get files. . .set bin for zip files]
-GET GUTINDEX.?? [to get a year's listing of books, e.g., GUTINDEX.99]
-GET GUTINDEX.ALL [to get a listing of ALL books]
-
-***
-
-**Information prepared by the Project Gutenberg legal advisor**
-
-(Three Pages)
-
-
-***START**THE SMALL PRINT!**FOR PUBLIC DOMAIN ETEXTS**START***
-Why is this "Small Print!" statement here? You know: lawyers.
-They tell us you might sue us if there is something wrong with
-your copy of this etext, even if you got it for free from
-someone other than us, and even if what's wrong is not our
-fault. So, among other things, this "Small Print!" statement
-disclaims most of our liability to you. It also tells you how
-you can distribute copies of this etext if you want to.
-
-*BEFORE!* YOU USE OR READ THIS ETEXT
-By using or reading any part of this PROJECT GUTENBERG-tm
-etext, you indicate that you understand, agree to and accept
-this "Small Print!" statement. If you do not, you can receive
-a refund of the money (if any) you paid for this etext by
-sending a request within 30 days of receiving it to the person
-you got it from. If you received this etext on a physical
-medium (such as a disk), you must return it with your request.
-
-ABOUT PROJECT GUTENBERG-TM ETEXTS
-This PROJECT GUTENBERG-tm etext, like most PROJECT GUTENBERG-
-tm etexts, is a "public domain" work distributed by Professor
-Michael S. Hart through the Project Gutenberg Association at
-Carnegie-Mellon University (the "Project"). Among other
-things, this means that no one owns a United States copyright
-on or for this work, so the Project (and you!) can copy and
-distribute it in the United States without permission and
-without paying copyright royalties. Special rules, set forth
-below, apply if you wish to copy and distribute this etext
-under the Project's "PROJECT GUTENBERG" trademark.
-
-To create these etexts, the Project expends considerable
-efforts to identify, transcribe and proofread public domain
-works. Despite these efforts, the Project's etexts and any
-medium they may be on may contain "Defects". Among other
-things, Defects may take the form of incomplete, inaccurate or
-corrupt data, transcription errors, a copyright or other
-intellectual property infringement, a defective or damaged
-disk or other etext medium, a computer virus, or computer
-codes that damage or cannot be read by your equipment.
-
-LIMITED WARRANTY; DISCLAIMER OF DAMAGES
-But for the "Right of Replacement or Refund" described below,
-[1] the Project (and any other party you may receive this
-etext from as a PROJECT GUTENBERG-tm etext) disclaims all
-liability to you for damages, costs and expenses, including
-legal fees, and [2] YOU HAVE NO REMEDIES FOR NEGLIGENCE OR
-UNDER STRICT LIABILITY, OR FOR BREACH OF WARRANTY OR CONTRACT,
-INCLUDING BUT NOT LIMITED TO INDIRECT, CONSEQUENTIAL, PUNITIVE
-OR INCIDENTAL DAMAGES, EVEN IF YOU GIVE NOTICE OF THE
-POSSIBILITY OF SUCH DAMAGES.
-
-If you discover a Defect in this etext within 90 days of
-receiving it, you can receive a refund of the money (if any)
-you paid for it by sending an explanatory note within that
-time to the person you received it from. If you received it
-on a physical medium, you must return it with your note, and
-such person may choose to alternatively give you a replacement
-copy. If you received it electronically, such person may
-choose to alternatively give you a second opportunity to
-receive it electronically.
-
-THIS ETEXT IS OTHERWISE PROVIDED TO YOU "AS-IS". NO OTHER
-WARRANTIES OF ANY KIND, EXPRESS OR IMPLIED, ARE MADE TO YOU AS
-TO THE ETEXT OR ANY MEDIUM IT MAY BE ON, INCLUDING BUT NOT
-LIMITED TO WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A
-PARTICULAR PURPOSE.
-
-Some states do not allow disclaimers of implied warranties or
-the exclusion or limitation of consequential damages, so the
-above disclaimers and exclusions may not apply to you, and you
-may have other legal rights.
-
-INDEMNITY
-You will indemnify and hold the Project, its directors,
-officers, members and agents harmless from all liability, cost
-and expense, including legal fees, that arise directly or
-indirectly from any of the following that you do or cause:
-[1] distribution of this etext, [2] alteration, modification,
-or addition to the etext, or [3] any Defect.
-
-DISTRIBUTION UNDER "PROJECT GUTENBERG-tm"
-You may distribute copies of this etext electronically, or by
-disk, book or any other medium if you either delete this
-"Small Print!" and all other references to Project Gutenberg,
-or:
-
-[1] Only give exact copies of it. Among other things, this
- requires that you do not remove, alter or modify the
- etext or this "small print!" statement. You may however,
- if you wish, distribute this etext in machine readable
- binary, compressed, mark-up, or proprietary form,
- including any form resulting from conversion by word pro-
- cessing or hypertext software, but only so long as
- *EITHER*:
-
- [*] The etext, when displayed, is clearly readable, and
- does *not* contain characters other than those
- intended by the author of the work, although tilde
- (~), asterisk (*) and underline (_) characters may
- be used to convey punctuation intended by the
- author, and additional characters may be used to
- indicate hypertext links; OR
-
- [*] The etext may be readily converted by the reader at
- no expense into plain ASCII, EBCDIC or equivalent
- form by the program that displays the etext (as is
- the case, for instance, with most word processors);
- OR
-
- [*] You provide, or agree to also provide on request at
- no additional cost, fee or expense, a copy of the
- etext in its original plain ASCII form (or in EBCDIC
- or other equivalent proprietary form).
-
-[2] Honor the etext refund and replacement provisions of this
- "Small Print!" statement.
-
-[3] Pay a trademark license fee to the Project of 20% of the
- net profits you derive calculated using the method you
- already use to calculate your applicable taxes. If you
- don't derive profits, no royalty is due. Royalties are
- payable to "Project Gutenberg Association/Carnegie-Mellon
- University" within the 60 days following each
- date you prepare (or were legally required to prepare)
- your annual (or equivalent periodic) tax return.
-
-WHAT IF YOU *WANT* TO SEND MONEY EVEN IF YOU DON'T HAVE TO?
-The Project gratefully accepts contributions in money, time,
-scanning machines, OCR software, public domain etexts, royalty
-free copyright licenses, and every other sort of contribution
-you can think of. Money should be paid to "Project Gutenberg
-Association / Carnegie-Mellon University".
-
-*END*THE SMALL PRINT! FOR PUBLIC DOMAIN ETEXTS*Ver.04.29.93*END*
-
-
-
-
-
-Project Gutenberg's Etext of Shakespeare's The Tragedie of Macbeth
-
-
-
-
-
-Executive Director's Notes:
-
-In addition to the notes below, and so you will *NOT* think all
-the spelling errors introduced by the printers of the time have
-been corrected, here are the first few lines of Hamlet, as they
-are presented herein:
-
- Barnardo. Who's there?
- Fran. Nay answer me: Stand & vnfold
-your selfe
-
- Bar. Long liue the King
-
-***
-
-As I understand it, the printers often ran out of certain words
-or letters they had often packed into a "cliche". . .this is the
-original meaning of the term cliche. . .and, being unwilling to
-unpack the cliches, they made do; thus you will see some substitutions
-that look very odd. . .such as the exchanges of u for v, v for u,
-above. . .and you may wonder why they did it this way, presuming
-Shakespeare did not actually write the play in this manner. . . .
-
-The answer is that they MAY have packed "liue" into a cliche at a
-time when they were out of "v"'s. . .possibly having used "vv" in
-place of some "w"'s, etc. This was a common practice of the day,
-as print was still quite expensive, and they didn't want to spend
-more on a wider selection of characters than they had to.
-
-You will find a lot of these kinds of "errors" in this text. As I
-have mentioned in other times and places, many "scholars" have an
-extreme attachment to these errors, and many have accorded them a
-very high place in the "canon" of Shakespeare. My father read an
-assortment of these made available to him by Cambridge University
-in England for several months in a glass room constructed for the
-purpose. To the best of my knowledge he read ALL those available
-. . .in great detail. . .and determined from the various changes,
-that Shakespeare most likely did not write in nearly as wide a
-variety of errors as we credit him for, even though he was in/famous
-for signing his name with several different spellings.
-
-So, please take this into account when reading the comments below
-made by our volunteer who prepared this file: you may see errors
-that are "not" errors. . . .
-
-So. . .with this caveat. . .that we have NOT changed the canon
-errors, here is the Project Gutenberg Etext of Shakespeare's The
-Tragedie of Macbeth.
-
-Michael S. Hart
-Project Gutenberg
-Executive Director
-
-
-***
-
-
-Scanner's Notes: What this is and isn't. This was taken from
-a copy of Shakespeare's first folio and it is as close as I can
-come in ASCII to the printed text.
-
-The elongated S's have been changed to small s's and the
-conjoined ae ligatures have been changed to separate letters, ae. I have left the spelling,
-punctuation, capitalization as close as possible to the
-printed text. I have corrected some spelling mistakes (I have put
-together a spelling dictionary devised from the spellings of the
-Geneva Bible and Shakespeare's First Folio and have unified
-spellings according to this template), typos and expanded
-abbreviations as I have come across them. Everything within
-brackets [] is what I have added. So if you don't like that
-you can delete everything within the brackets if you want a
-purer Shakespeare.
-
-Another thing that you should be aware of is that there are textual
-differences between various copies of the first folio. So there may
-be differences (other than what I have mentioned above) between
-this and other first folio editions. This is due to the printer's
-habit of setting the type and running off a number of copies and
-then proofing the printed copy and correcting the type and then
-continuing the printing run. The proof run wasn't thrown away but
-incorporated into the printed copies. This is just the way it is.
-The text I have used was a composite of more than 30 different
-First Folio editions' best pages.
-
-If you find any scanning errors, out and out typos, punctuation
-errors, or if you disagree with my spelling choices please feel
-free to email me those errors. I wish to make this the best
-etext possible. My email addresses for right now are haradda@aol.com
-and davidr@inconnect.com. I hope that you enjoy this.
-
-David Reed
-
-The Tragedie of Macbeth
-
-Actus Primus. Scoena Prima.
-
-Thunder and Lightning. Enter three Witches.
-
- 1. When shall we three meet againe?
-In Thunder, Lightning, or in Raine?
- 2. When the Hurley-burley's done,
-When the Battaile's lost, and wonne
-
- 3. That will be ere the set of Sunne
-
- 1. Where the place?
- 2. Vpon the Heath
-
- 3. There to meet with Macbeth
-
- 1. I come, Gray-Malkin
-
- All. Padock calls anon: faire is foule, and foule is faire,
-Houer through the fogge and filthie ayre.
-
-Exeunt.
-
-
-Scena Secunda.
-
-Alarum within. Enter King Malcome, Donalbaine, Lenox, with
-attendants,
-meeting a bleeding Captaine.
-
- King. What bloody man is that? he can report,
-As seemeth by his plight, of the Reuolt
-The newest state
-
- Mal. This is the Serieant,
-Who like a good and hardie Souldier fought
-'Gainst my Captiuitie: Haile braue friend;
-Say to the King, the knowledge of the Broyle,
-As thou didst leaue it
-
- Cap. Doubtfull it stood,
-As two spent Swimmers, that doe cling together,
-And choake their Art: The mercilesse Macdonwald
-(Worthie to be a Rebell, for to that
-The multiplying Villanies of Nature
-Doe swarme vpon him) from the Westerne Isles
-Of Kernes and Gallowgrosses is supply'd,
-And Fortune on his damned Quarry smiling,
-Shew'd like a Rebells Whore: but all's too weake:
-For braue Macbeth (well hee deserues that Name)
-Disdayning Fortune, with his brandisht Steele,
-Which smoak'd with bloody execution
-(Like Valours Minion) caru'd out his passage,
-Till hee fac'd the Slaue:
-Which neu'r shooke hands, nor bad farwell to him,
-Till he vnseam'd him from the Naue toth' Chops,
-And fix'd his Head vpon our Battlements
-
- King. O valiant Cousin, worthy Gentleman
-
- Cap. As whence the Sunne 'gins his reflection,
-Shipwracking Stormes, and direfull Thunders:
-So from that Spring, whence comfort seem'd to come,
-Discomfort swells: Marke King of Scotland, marke,
-No sooner Iustice had, with Valour arm'd,
-Compell'd these skipping Kernes to trust their heeles,
-But the Norweyan Lord, surueying vantage,
-With furbusht Armes, and new supplyes of men,
-Began a fresh assault
-
- King. Dismay'd not this our Captaines, Macbeth and
-Banquoh?
- Cap. Yes, as Sparrowes, Eagles;
-Or the Hare, the Lyon:
-If I say sooth, I must report they were
-As Cannons ouer-charg'd with double Cracks,
-So they doubly redoubled stroakes vpon the Foe:
-Except they meant to bathe in reeking Wounds,
-Or memorize another Golgotha,
-I cannot tell: but I am faint,
-My Gashes cry for helpe
-
- King. So well thy words become thee, as thy wounds,
-They smack of Honor both: Goe get him Surgeons.
-Enter Rosse and Angus.
-
-Who comes here?
- Mal. The worthy Thane of Rosse
-
- Lenox. What a haste lookes through his eyes?
-So should he looke, that seemes to speake things strange
-
- Rosse. God saue the King
-
- King. Whence cam'st thou, worthy Thane?
- Rosse. From Fiffe, great King,
-Where the Norweyan Banners flowt the Skie,
-And fanne our people cold.
-Norway himselfe, with terrible numbers,
-Assisted by that most disloyall Traytor,
-The Thane of Cawdor, began a dismall Conflict,
-Till that Bellona's Bridegroome, lapt in proofe,
-Confronted him with selfe-comparisons,
-Point against Point, rebellious Arme 'gainst Arme,
-Curbing his lauish spirit: and to conclude,
-The Victorie fell on vs
-
- King. Great happinesse
-
- Rosse. That now Sweno, the Norwayes King,
-Craues composition:
-Nor would we deigne him buriall of his men,
-Till he disbursed, at Saint Colmes ynch,
-Ten thousand Dollars, to our generall vse
-
- King. No more that Thane of Cawdor shall deceiue
-Our Bosome interest: Goe pronounce his present death,
-And with his former Title greet Macbeth
-
- Rosse. Ile see it done
-
- King. What he hath lost, Noble Macbeth hath wonne.
-
-Exeunt.
-
-
-Scena Tertia.
-
-Thunder. Enter the three Witches.
-
- 1. Where hast thou beene, Sister?
- 2. Killing Swine
-
- 3. Sister, where thou?
- 1. A Saylors Wife had Chestnuts in her Lappe,
-And mouncht, & mouncht, and mouncht:
-Giue me, quoth I.
-Aroynt thee, Witch, the rumpe-fed Ronyon cryes.
-Her Husband's to Aleppo gone, Master o'th' Tiger:
-But in a Syue Ile thither sayle,
-And like a Rat without a tayle,
-Ile doe, Ile doe, and Ile doe
-
- 2. Ile giue thee a Winde
-
- 1. Th'art kinde
-
- 3. And I another
-
- 1. I my selfe haue all the other,
-And the very Ports they blow,
-All the Quarters that they know,
-I'th' Ship-mans Card.
-Ile dreyne him drie as Hay:
-Sleepe shall neyther Night nor Day
-Hang vpon his Pent-house Lid:
-He shall liue a man forbid:
-Wearie Seu'nights, nine times nine,
-Shall he dwindle, peake, and pine:
-Though his Barke cannot be lost,
-Yet it shall be Tempest-tost.
-Looke what I haue
-
- 2. Shew me, shew me
-
- 1. Here I haue a Pilots Thumbe,
-Wrackt, as homeward he did come.
-
-Drum within.
-
- 3. A Drumme, a Drumme:
-Macbeth doth come
-
- All. The weyward Sisters, hand in hand,
-Posters of the Sea and Land,
-Thus doe goe, about, about,
-Thrice to thine, and thrice to mine,
-And thrice againe, to make vp nine.
-Peace, the Charme's wound vp.
-Enter Macbeth and Banquo.
-
- Macb. So foule and faire a day I haue not seene
-
- Banquo. How farre is't call'd to Soris? What are these,
-So wither'd, and so wilde in their attyre,
-That looke not like th' Inhabitants o'th' Earth,
-And yet are on't? Liue you, or are you aught
-That man may question? you seeme to vnderstand me,
-By each at once her choppie finger laying
-Vpon her skinnie Lips: you should be Women,
-And yet your Beards forbid me to interprete
-That you are so
-
- Mac. Speake if you can: what are you?
- 1. All haile Macbeth, haile to thee Thane of Glamis
-
- 2. All haile Macbeth, haile to thee Thane of Cawdor
-
- 3. All haile Macbeth, that shalt be King hereafter
-
- Banq. Good Sir, why doe you start, and seeme to feare
-Things that doe sound so faire? i'th' name of truth
-Are ye fantasticall, or that indeed
-Which outwardly ye shew? My Noble Partner
-You greet with present Grace, and great prediction
-Of Noble hauing, and of Royall hope,
-That he seemes wrapt withall: to me you speake not.
-If you can looke into the Seedes of Time,
-And say, which Graine will grow, and which will not,
-Speake then to me, who neyther begge, nor feare
-Your fauors, nor your hate
-
- 1. Hayle
-
- 2. Hayle
-
- 3. Hayle
-
- 1. Lesser than Macbeth, and greater
-
- 2. Not so happy, yet much happyer
-
- 3. Thou shalt get Kings, though thou be none:
-So all haile Macbeth, and Banquo
-
- 1. Banquo, and Macbeth, all haile
-
- Macb. Stay you imperfect Speakers, tell me more:
-By Sinells death, I know I am Thane of Glamis,
-But how, of Cawdor? the Thane of Cawdor liues
-A prosperous Gentleman: And to be King,
-Stands not within the prospect of beleefe,
-No more then to be Cawdor. Say from whence
-You owe this strange Intelligence, or why
-Vpon this blasted Heath you stop our way
-With such Prophetique greeting?
-Speake, I charge you.
-
-Witches vanish.
-
- Banq. The Earth hath bubbles, as the Water ha's,
-And these are of them: whither are they vanish'd?
- Macb. Into the Ayre: and what seem'd corporall,
-Melted, as breath into the Winde.
-Would they had stay'd
-
- Banq. Were such things here, as we doe speake about?
-Or haue we eaten on the insane Root,
-That takes the Reason Prisoner?
- Macb. Your Children shall be Kings
-
- Banq. You shall be King
-
- Macb. And Thane of Cawdor too: went it not so?
- Banq. Toth' selfe-same tune and words: who's here?
-Enter Rosse and Angus.
-
- Rosse. The King hath happily receiu'd, Macbeth,
-The newes of thy successe: and when he reades
-Thy personall Venture in the Rebels sight,
-His Wonders and his Prayses doe contend,
-Which should be thine, or his: silenc'd with that,
-In viewing o're the rest o'th' selfe-same day,
-He findes thee in the stout Norweyan Rankes,
-Nothing afeard of what thy selfe didst make
-Strange Images of death, as thick as Tale
-Can post with post, and euery one did beare
-Thy prayses in his Kingdomes great defence,
-And powr'd them downe before him
-
- Ang. Wee are sent,
-To giue thee from our Royall Master thanks,
-Onely to harrold thee into his sight,
-Not pay thee
-
- Rosse. And for an earnest of a greater Honor,
-He bad me, from him, call thee Thane of Cawdor:
-In which addition, haile most worthy Thane,
-For it is thine
-
- Banq. What, can the Deuill speake true?
- Macb. The Thane of Cawdor liues:
-Why doe you dresse me in borrowed Robes?
- Ang. Who was the Thane, liues yet,
-But vnder heauie Iudgement beares that Life,
-Which he deserues to loose.
-Whether he was combin'd with those of Norway,
-Or did lyne the Rebell with hidden helpe,
-And vantage; or that with both he labour'd
-In his Countreyes wracke, I know not:
-But Treasons Capitall, confess'd, and prou'd,
-Haue ouerthrowne him
-
- Macb. Glamys, and Thane of Cawdor:
-The greatest is behinde. Thankes for your paines.
-Doe you not hope your Children shall be Kings,
-When those that gaue the Thane of Cawdor to me,
-Promis'd no lesse to them
-
- Banq. That trusted home,
-Might yet enkindle you vnto the Crowne,
-Besides the Thane of Cawdor. But 'tis strange:
-And oftentimes, to winne vs to our harme,
-The Instruments of Darknesse tell vs Truths,
-Winne vs with honest Trifles, to betray's
-In deepest consequence.
-Cousins, a word, I pray you
-
- Macb. Two Truths are told,
-As happy Prologues to the swelling Act
-Of the Imperiall Theame. I thanke you Gentlemen:
-This supernaturall solliciting
-Cannot be ill; cannot be good.
-If ill? why hath it giuen me earnest of successe,
-Commencing in a Truth? I am Thane of Cawdor.
-If good? why doe I yeeld to that suggestion,
-Whose horrid Image doth vnfixe my Heire,
-And make my seated Heart knock at my Ribbes,
-Against the vse of Nature? Present Feares
-Are lesse then horrible Imaginings:
-My Thought, whose Murther yet is but fantasticall,
-Shakes so my single state of Man,
-That Function is smother'd in surmise,
-And nothing is, but what is not
-
- Banq. Looke how our Partner's rapt
-
- Macb. If Chance will haue me King,
-Why Chance may Crowne me,
-Without my stirre
-
- Banq. New Honors come vpon him
-Like our strange Garments, cleaue not to their mould,
-But with the aid of vse
-
- Macb. Come what come may,
-Time, and the Houre, runs through the roughest Day
-
- Banq. Worthy Macbeth, wee stay vpon your leysure
-
- Macb. Giue me your fauour:
-My dull Braine was wrought with things forgotten.
-Kinde Gentlemen, your paines are registred,
-Where euery day I turne the Leafe,
-To reade them.
-Let vs toward the King: thinke vpon
-What hath chanc'd: and at more time,
-The Interim hauing weigh'd it, let vs speake
-Our free Hearts each to other
-
- Banq. Very gladly
-
- Macb. Till then enough:
-Come friends.
-
-Exeunt.
-
-
-Scena Quarta.
-
-Flourish. Enter King, Lenox, Malcolme, Donalbaine, and
-Attendants.
-
- King. Is execution done on Cawdor?
-Or not those in Commission yet return'd?
- Mal. My Liege, they are not yet come back.
-But I haue spoke with one that saw him die:
-Who did report, that very frankly hee
-Confess'd his Treasons, implor'd your Highnesse Pardon,
-And set forth a deepe Repentance:
-Nothing in his Life became him,
-Like the leauing it. Hee dy'de,
-As one that had beene studied in his death,
-To throw away the dearest thing he ow'd,
-As 'twere a carelesse Trifle
-
- King. There's no Art,
-To finde the Mindes construction in the Face.
-He was a Gentleman, on whom I built
-An absolute Trust.
-Enter Macbeth, Banquo, Rosse, and Angus.
-
-O worthyest Cousin,
-The sinne of my Ingratitude euen now
-Was heauie on me. Thou art so farre before,
-That swiftest Wing of Recompence is slow,
-To ouertake thee. Would thou hadst lesse deseru'd,
-That the proportion both of thanks, and payment,
-Might haue beene mine: onely I haue left to say,
-More is thy due, then more then all can pay
-
- Macb. The seruice, and the loyaltie I owe,
-In doing it, payes it selfe.
-Your Highnesse part, is to receiue our Duties:
-And our Duties are to your Throne, and State,
-Children, and Seruants; which doe but what they should,
-By doing euery thing safe toward your Loue
-And Honor
-
- King. Welcome hither:
-I haue begun to plant thee, and will labour
-To make thee full of growing. Noble Banquo,
-That hast no lesse deseru'd, nor must be knowne
-No lesse to haue done so: Let me enfold thee,
-And hold thee to my Heart
-
- Banq. There if I grow,
-The Haruest is your owne
-
- King. My plenteous Ioyes,
-Wanton in fulnesse, seeke to hide themselues
-In drops of sorrow. Sonnes, Kinsmen, Thanes,
-And you whose places are the nearest, know,
-We will establish our Estate vpon
-Our eldest, Malcolme, whom we name hereafter,
-The Prince of Cumberland: which Honor must
-Not vnaccompanied, inuest him onely,
-But signes of Noblenesse, like Starres, shall shine
-On all deseruers. From hence to Envernes,
-And binde vs further to you
-
- Macb. The Rest is Labor, which is not vs'd for you:
-Ile be my selfe the Herbenger, and make ioyfull
-The hearing of my Wife, with your approach:
-So humbly take my leaue
-
- King. My worthy Cawdor
-
- Macb. The Prince of Cumberland: that is a step,
-On which I must fall downe, or else o're-leape,
-For in my way it lyes. Starres hide your fires,
-Let not Light see my black and deepe desires:
-The Eye winke at the Hand: yet let that bee,
-Which the Eye feares, when it is done to see.
-Exit.
-
- King. True worthy Banquo: he is full so valiant,
-And in his commendations, I am fed:
-It is a Banquet to me. Let's after him,
-Whose care is gone before, to bid vs welcome:
-It is a peerelesse Kinsman.
-
-Flourish. Exeunt.
-
-
-Scena Quinta.
-
-Enter Macbeths Wife alone with a Letter.
-
- Lady. They met me in the day of successe: and I haue
-learn'd by the perfect'st report, they haue more in them, then
-mortall knowledge. When I burnt in desire to question them
-further, they made themselues Ayre, into which they vanish'd.
-Whiles I stood rapt in the wonder of it, came Missiues from
-the King, who all-hail'd me Thane of Cawdor, by which Title
-before, these weyward Sisters saluted me, and referr'd me to
-the comming on of time, with haile King that shalt be. This
-haue I thought good to deliuer thee (my dearest Partner of
-Greatnesse) that thou might'st not loose the dues of reioycing
-by being ignorant of what Greatnesse is promis'd thee. Lay
-it to thy heart and farewell.
-Glamys thou art, and Cawdor, and shalt be
-What thou art promis'd: yet doe I feare thy Nature,
-It is too full o'th' Milke of humane kindnesse,
-To catch the neerest way. Thou would'st be great,
-Art not without Ambition, but without
-The illnesse should attend it. What thou would'st highly,
-That would'st thou holily: would'st not play false,
-And yet would'st wrongly winne.
-Thould'st haue, great Glamys, that which cryes,
-Thus thou must doe, if thou haue it;
-And that which rather thou do'st feare to doe,
-Then wishest should be vndone. High thee hither,
-That I may powre my Spirits in thine Eare,
-And chastise with the valour of my Tongue
-All that impeides thee from the Golden Round,
-Which Fate and Metaphysicall ayde doth seeme
-To haue thee crown'd withall.
-Enter Messenger.
-
-What is your tidings?
- Mess. The King comes here to Night
-
- Lady. Thou'rt mad to say it.
-Is not thy Master with him? who, wer't so,
-Would haue inform'd for preparation
-
- Mess. So please you, it is true: our Thane is comming:
-One of my fellowes had the speed of him;
-Who almost dead for breath, had scarcely more
-Then would make vp his Message
-
- Lady. Giue him tending,
-He brings great newes,
-
-Exit Messenger.
-
-The Rauen himselfe is hoarse,
-That croakes the fatall entrance of Duncan
-Vnder my Battlements. Come you Spirits,
-That tend on mortall thoughts, vnsex me here,
-And fill me from the Crowne to the Toe, top-full
-Of direst Crueltie: make thick my blood,
-Stop vp th' accesse, and passage to Remorse,
-That no compunctious visitings of Nature
-Shake my fell purpose, nor keepe peace betweene
-Th' effect, and hit. Come to my Womans Brests,
-And take my Milke for Gall, you murth'ring Ministers,
-Where-euer, in your sightlesse substances,
-You wait on Natures Mischiefe. Come thick Night,
-And pall thee in the dunnest smoake of Hell,
-
-That my keene Knife see not the Wound it makes,
-Nor Heauen peepe through the Blanket of the darke,
-To cry, hold, hold.
-Enter Macbeth.
-
-Great Glamys, worthy Cawdor,
-Greater then both, by the all-haile hereafter,
-Thy Letters haue transported me beyond
-This ignorant present, and I feele now
-The future in the instant
-
- Macb. My dearest Loue,
-Duncan comes here to Night
-
- Lady. And when goes hence?
- Macb. To morrow, as he purposes
-
- Lady. O neuer,
-Shall Sunne that Morrow see.
-Your Face, my Thane, is as a Booke, where men
-May reade strange matters, to beguile the time.
-Looke like the time, beare welcome in your Eye,
-Your Hand, your Tongue: looke like th' innocent flower,
-But be the Serpent vnder't. He that's comming,
-Must be prouided for: and you shall put
-This Nights great Businesse into my dispatch,
-Which shall to all our Nights, and Dayes to come,
-Giue solely soueraigne sway, and Masterdome
-
- Macb. We will speake further,
- Lady. Onely looke vp cleare:
-To alter fauor, euer is to feare:
-Leaue all the rest to me.
-
-Exeunt.
-
-
-Scena Sexta.
-
-Hoboyes, and Torches. Enter King, Malcolme, Donalbaine,
-Banquo, Lenox,
-Macduff, Rosse, Angus, and Attendants.
-
- King. This Castle hath a pleasant seat,
-The ayre nimbly and sweetly recommends it selfe
-Vnto our gentle sences
-
- Banq. This Guest of Summer,
-The Temple-haunting Barlet does approue,
-By his loued Mansonry, that the Heauens breath
-Smells wooingly here: no Iutty frieze,
-Buttrice, nor Coigne of Vantage, but this Bird
-Hath made his pendant Bed, and procreant Cradle,
-Where they must breed, and haunt: I haue obseru'd
-The ayre is delicate.
-Enter Lady.
-
- King. See, see our honor'd Hostesse:
-The Loue that followes vs, sometime is our trouble,
-Which still we thanke as Loue. Herein I teach you,
-How you shall bid God-eyld vs for your paines,
-And thanke vs for your trouble
-
- Lady. All our seruice,
-In euery point twice done, and then done double,
-Were poore, and single Businesse, to contend
-Against those Honors deepe, and broad,
-Wherewith your Maiestie loades our House:
-For those of old, and the late Dignities,
-Heap'd vp to them, we rest your Ermites
-
- King. Where's the Thane of Cawdor?
-We courst him at the heeles, and had a purpose
-To be his Purueyor: But he rides well,
-And his great Loue (sharpe as his Spurre) hath holp him
-To his home before vs: Faire and Noble Hostesse
-We are your guest to night
-
- La. Your Seruants euer,
-Haue theirs, themselues, and what is theirs in compt,
-To make their Audit at your Highnesse pleasure,
-Still to returne your owne
-
- King. Giue me your hand:
-Conduct me to mine Host we loue him highly,
-And shall continue, our Graces towards him.
-By your leaue Hostesse.
-
-Exeunt.
-
-Scena Septima.
-
-Hoboyes. Torches. Enter a Sewer, and diuers Seruants with Dishes
-and
-Seruice ouer the Stage. Then enter Macbeth
-
- Macb. If it were done, when 'tis done, then 'twer well,
-It were done quickly: If th' Assassination
-Could trammell vp the Consequence, and catch
-With his surcease, Successe: that but this blow
-Might be the be all, and the end all. Heere,
-But heere, vpon this Banke and Schoole of time,
-Wee'ld iumpe the life to come. But in these Cases,
-We still haue iudgement heere, that we but teach
-Bloody Instructions, which being taught, returne
-To plague th' Inuenter, this euen-handed Iustice
-Commends th' Ingredience of our poyson'd Challice
-To our owne lips. Hee's heere in double trust;
-First, as I am his Kinsman, and his Subiect,
-Strong both against the Deed: Then, as his Host,
-Who should against his Murtherer shut the doore,
-Not beare the knife my selfe. Besides, this Duncane
-Hath borne his Faculties so meeke; hath bin
-So cleere in his great Office, that his Vertues
-Will pleade like Angels, Trumpet-tongu'd against
-The deepe damnation of his taking off:
-And Pitty, like a naked New-borne-Babe,
-Striding the blast, or Heauens Cherubin, hors'd
-Vpon the sightlesse Curriors of the Ayre,
-Shall blow the horrid deed in euery eye,
-That teares shall drowne the winde. I haue no Spurre
-To pricke the sides of my intent, but onely
-Vaulting Ambition, which ore-leapes it selfe,
-And falles on th' other.
-Enter Lady.
-
-How now? What Newes?
- La. He has almost supt: why haue you left the chamber?
- Mac. Hath he ask'd for me?
- La. Know you not, he ha's?
- Mac. We will proceed no further in this Businesse:
-He hath Honour'd me of late, and I haue bought
-Golden Opinions from all sorts of people,
-Which would be worne now in their newest glosse,
-Not cast aside so soone
-
- La. Was the hope drunke,
-Wherein you drest your selfe? Hath it slept since?
-And wakes it now to looke so greene, and pale,
-At what it did so freely? From this time,
-Such I account thy loue. Art thou affear'd
-To be the same in thine owne Act, and Valour,
-As thou art in desire? Would'st thou haue that
-Which thou esteem'st the Ornament of Life,
-And liue a Coward in thine owne Esteeme?
-Letting I dare not, wait vpon I would,
-Like the poore Cat i'th' Addage
-
- Macb. Prythee peace:
-I dare do all that may become a man,
-Who dares do more, is none
-
- La. What Beast was't then
-That made you breake this enterprize to me?
-When you durst do it, then you were a man:
-And to be more then what you were, you would
-Be so much more the man. Nor time, nor place
-Did then adhere, and yet you would make both:
-They haue made themselues, and that their fitnesse now
-Do's vnmake you. I haue giuen Sucke, and know
-How tender 'tis to loue the Babe that milkes me,
-I would, while it was smyling in my Face,
-Haue pluckt my Nipple from his Bonelesse Gummes,
-And dasht the Braines out, had I so sworne
-As you haue done to this
-
- Macb. If we should faile?
- Lady. We faile?
-But screw your courage to the sticking place,
-And wee'le not fayle: when Duncan is asleepe,
-(Whereto the rather shall his dayes hard Iourney
-Soundly inuite him) his two Chamberlaines
-Will I with Wine, and Wassell, so conuince,
-That Memorie, the Warder of the Braine,
-Shall be a Fume, and the Receit of Reason
-A Lymbeck onely: when in Swinish sleepe,
-Their drenched Natures lyes as in a Death,
-What cannot you and I performe vpon
-Th' vnguarded Duncan? What not put vpon
-His spungie Officers? who shall beare the guilt
-Of our great quell
-
- Macb. Bring forth Men-Children onely:
-For thy vndaunted Mettle should compose
-Nothing but Males. Will it not be receiu'd,
-When we haue mark'd with blood those sleepie two
-Of his owne Chamber, and vs'd their very Daggers,
-That they haue don't?
- Lady. Who dares receiue it other,
-As we shall make our Griefes and Clamor rore,
-Vpon his Death?
- Macb. I am settled, and bend vp
-Each corporall Agent to this terrible Feat.
-Away, and mock the time with fairest show,
-False Face must hide what the false Heart doth know.
-
-Exeunt.
-
-
-Actus Secundus. Scena Prima.
-
-Enter Banquo, and Fleance, with a Torch before him.
-
- Banq. How goes the Night, Boy?
- Fleance. The Moone is downe: I haue not heard the
-Clock
-
- Banq. And she goes downe at Twelue
-
- Fleance. I take't, 'tis later, Sir
-
- Banq. Hold, take my Sword:
-There's Husbandry in Heauen,
-Their Candles are all out: take thee that too.
-A heauie Summons lyes like Lead vpon me,
-And yet I would not sleepe:
-Mercifull Powers, restraine in me the cursed thoughts
-That Nature giues way to in repose.
-Enter Macbeth, and a Seruant with a Torch.
-
-Giue me my Sword: who's there?
- Macb. A Friend
-
- Banq. What Sir, not yet at rest? the King's a bed.
-He hath beene in vnusuall Pleasure,
-And sent forth great Largesse to your Offices.
-This Diamond he greetes your Wife withall,
-By the name of most kind Hostesse,
-And shut vp in measurelesse content
-
- Mac. Being vnprepar'd,
-Our will became the seruant to defect,
-Which else should free haue wrought
-
- Banq. All's well.
-I dreamt last Night of the three weyward Sisters:
-To you they haue shew'd some truth
-
- Macb. I thinke not of them:
-Yet when we can entreat an houre to serue,
-We would spend it in some words vpon that Businesse,
-If you would graunt the time
-
- Banq. At your kind'st leysure
-
- Macb. If you shall cleaue to my consent,
-When 'tis, it shall make Honor for you
-
- Banq. So I lose none,
-In seeking to augment it, but still keepe
-My Bosome franchis'd, and Allegeance cleare,
-I shall be counsail'd
-
- Macb. Good repose the while
-
- Banq. Thankes Sir: the like to you.
-
-Exit Banquo.
-
- Macb. Goe bid thy Mistresse, when my drinke is ready,
-She strike vpon the Bell. Get thee to bed.
-Exit.
-
-Is this a Dagger, which I see before me,
-The Handle toward my Hand? Come, let me clutch thee:
-I haue thee not, and yet I see thee still.
-Art thou not fatall Vision, sensible
-To feeling, as to sight? or art thou but
-A Dagger of the Minde, a false Creation,
-Proceeding from the heat-oppressed Braine?
-I see thee yet, in forme as palpable,
-As this which now I draw.
-Thou marshall'st me the way that I was going,
-And such an Instrument I was to vse.
-Mine Eyes are made the fooles o'th' other Sences,
-Or else worth all the rest: I see thee still;
-And on thy Blade, and Dudgeon, Gouts of Blood,
-Which was not so before. There's no such thing:
-It is the bloody Businesse, which informes
-Thus to mine Eyes. Now o're the one halfe World
-Nature seemes dead, and wicked Dreames abuse
-The Curtain'd sleepe: Witchcraft celebrates
-Pale Heccats Offrings: and wither'd Murther,
-Alarum'd by his Centinell, the Wolfe,
-Whose howle's his Watch, thus with his stealthy pace,
-With Tarquins rauishing sides, towards his designe
-Moues like a Ghost. Thou sowre and firme-set Earth
-Heare not my steps, which they may walke, for feare
-Thy very stones prate of my where-about,
-And take the present horror from the time,
-Which now sutes with it. Whiles I threat, he liues:
-Words to the heat of deedes too cold breath giues.
-
-A Bell rings.
-
-I goe, and it is done: the Bell inuites me.
-Heare it not, Duncan, for it is a Knell,
-That summons thee to Heauen, or to Hell.
-Exit.
-
-
-Scena Secunda.
-
-Enter Lady.
-
- La. That which hath made the[m] drunk, hath made me bold:
-What hath quench'd them, hath giuen me fire.
-Hearke, peace: it was the Owle that shriek'd,
-The fatall Bell-man, which giues the stern'st good-night.
-He is about it, the Doores are open:
-And the surfeted Groomes doe mock their charge
-With Snores. I haue drugg'd their Possets,
-That Death and Nature doe contend about them,
-Whether they liue, or dye.
-Enter Macbeth.
-
- Macb. Who's there? what hoa?
- Lady. Alack, I am afraid they haue awak'd,
-And 'tis not done: th' attempt, and not the deed,
-Confounds vs: hearke: I lay'd their Daggers ready,
-He could not misse 'em. Had he not resembled
-My Father as he slept, I had don't.
-My Husband?
- Macb. I haue done the deed:
-Didst thou not heare a noyse?
- Lady. I heard the Owle schreame, and the Crickets cry.
-Did not you speake?
- Macb. When?
- Lady. Now
-
- Macb. As I descended?
- Lady. I
-
- Macb. Hearke, who lyes i'th' second Chamber?
- Lady. Donalbaine
-
- Mac. This is a sorry sight
-
- Lady. A foolish thought, to say a sorry sight
-
- Macb. There's one did laugh in's sleepe,
-And one cry'd Murther, that they did wake each other:
-I stood, and heard them: But they did say their Prayers,
-And addrest them againe to sleepe
-
- Lady. There are two lodg'd together
-
- Macb. One cry'd God blesse vs, and Amen the other,
-As they had seene me with these Hangmans hands:
-Listning their feare, I could not say Amen,
-When they did say God blesse vs
-
- Lady. Consider it not so deepely
-
- Mac. But wherefore could not I pronounce Amen?
-I had most need of Blessing, and Amen stuck in my throat
-
- Lady. These deeds must not be thought
-After these wayes: so, it will make vs mad
-
- Macb. Me thought I heard a voyce cry, Sleep no more:
-Macbeth does murther Sleepe, the innocent Sleepe,
-Sleepe that knits vp the rauel'd Sleeue of Care,
-The death of each dayes Life, sore Labors Bath,
-Balme of hurt Mindes, great Natures second Course,
-Chiefe nourisher in Life's Feast
-
- Lady. What doe you meane?
- Macb. Still it cry'd, Sleepe no more to all the House:
-Glamis hath murther'd Sleepe, and therefore Cawdor
-Shall sleepe no more: Macbeth shall sleepe no more
-
- Lady. Who was it, that thus cry'd? why worthy Thane,
-You doe vnbend your Noble strength, to thinke
-So braine-sickly of things: Goe get some Water,
-And wash this filthie Witnesse from your Hand.
-Why did you bring these Daggers from the place?
-They must lye there: goe carry them, and smeare
-The sleepie Groomes with blood
-
- Macb. Ile goe no more:
-I am afraid, to thinke what I haue done:
-Looke on't againe, I dare not
-
- Lady. Infirme of purpose:
-Giue me the Daggers: the sleeping, and the dead,
-Are but as Pictures: 'tis the Eye of Childhood,
-That feares a painted Deuill. If he doe bleed,
-Ile guild the Faces of the Groomes withall,
-For it must seeme their Guilt.
-Exit.
-
-Knocke within.
-
- Macb. Whence is that knocking?
-How is't with me, when euery noyse appalls me?
-What Hands are here? hah: they pluck out mine Eyes.
-Will all great Neptunes Ocean wash this blood
-Cleane from my Hand? no: this my Hand will rather
-The multitudinous Seas incarnardine,
-Making the Greene one, Red.
-Enter Lady.
-
- Lady. My Hands are of your colour: but I shame
-To weare a Heart so white.
-
-Knocke.
-
-I heare a knocking at the South entry:
-Retyre we to our Chamber:
-A little Water cleares vs of this deed.
-How easie is it then? your Constancie
-Hath left you vnattended.
-
-Knocke.
-
-Hearke, more knocking.
-Get on your Night-Gowne, least occasion call vs,
-And shew vs to be Watchers: be not lost
-So poorely in your thoughts
-
- Macb. To know my deed,
-
-Knocke.
-
-'Twere best not know my selfe.
-Wake Duncan with thy knocking:
-I would thou could'st.
-
-Exeunt.
-
-
-Scena Tertia.
-
-Enter a Porter. Knocking within.
-
- Porter. Here's a knocking indeede: if a man were
-Porter of Hell Gate, hee should haue old turning the
-Key.
-
-Knock.
-
-Knock, Knock, Knock. Who's there
-i'th' name of Belzebub? Here's a Farmer, that hang'd
-himselfe on th' expectation of Plentie: Come in time, haue
-Napkins enow about you, here you'le sweat for't.
-
-Knock.
-
-Knock, knock. Who's there in th' other Deuils Name?
-Faith here's an Equiuocator, that could sweare in both
-the Scales against eyther Scale, who committed Treason
-enough for Gods sake, yet could not equiuocate to Heauen:
-oh come in, Equiuocator.
-
-Knock.
-
-Knock, Knock, Knock. Who's there? 'Faith here's an English
-Taylor come hither, for stealing out of a French Hose:
-Come in Taylor, here you may rost your Goose.
-Knock.
-
-Knock, Knock. Neuer at quiet: What are you? but this
-place is too cold for Hell. Ile Deuill-Porter it no further:
-I had thought to haue let in some of all Professions, that
-goe the Primrose way to th' euerlasting Bonfire.
-
-Knock.
-
-Anon, anon, I pray you remember the Porter.
-Enter Macduff, and Lenox.
-
- Macd. Was it so late, friend, ere you went to Bed,
-That you doe lye so late?
- Port. Faith Sir, we were carowsing till the second Cock:
-And Drinke, Sir, is a great prouoker of three things
-
- Macd. What three things does Drinke especially
-prouoke?
- Port. Marry, Sir, Nose-painting, Sleepe, and Vrine.
-Lecherie, Sir, it prouokes, and vnprouokes: it prouokes
-the desire, but it takes away the performance. Therefore
-much Drinke may be said to be an Equiuocator with Lecherie:
-it makes him, and it marres him; it sets him on,
-and it takes him off; it perswades him, and dis-heartens
-him; makes him stand too, and not stand too: in conclusion,
-equiuocates him in a sleepe, and giuing him the Lye,
-leaues him
-
- Macd. I beleeue, Drinke gaue thee the Lye last Night
-
- Port. That it did, Sir, i'the very Throat on me: but I
-requited him for his Lye, and (I thinke) being too strong
-for him, though he tooke vp my Legges sometime, yet I
-made a Shift to cast him.
-Enter Macbeth.
-
- Macd. Is thy Master stirring?
-Our knocking ha's awak'd him: here he comes
-
- Lenox. Good morrow, Noble Sir
-
- Macb. Good morrow both
-
- Macd. Is the King stirring, worthy Thane?
- Macb. Not yet
-
- Macd. He did command me to call timely on him,
-I haue almost slipt the houre
-
- Macb. Ile bring you to him
-
- Macd. I know this is a ioyfull trouble to you:
-But yet 'tis one
-
- Macb. The labour we delight in, Physicks paine:
-This is the Doore
-
- Macd. Ile make so bold to call, for 'tis my limitted
-seruice.
-
-Exit Macduffe.
-
- Lenox. Goes the King hence to day?
- Macb. He does: he did appoint so
-
- Lenox. The Night ha's been vnruly:
-Where we lay, our Chimneys were blowne downe,
-And (as they say) lamentings heard i'th' Ayre;
-Strange Schreemes of Death,
-And Prophecying, with Accents terrible,
-Of dyre Combustion, and confus'd Euents,
-New hatch'd toth' wofull time.
-The obscure Bird clamor'd the liue-long Night.
-Some say, the Earth was Feuorous,
-And did shake
-
- Macb. 'Twas a rough Night
-
- Lenox. My young remembrance cannot paralell
-A fellow to it.
-Enter Macduff.
-
- Macd. O horror, horror, horror,
-Tongue nor Heart cannot conceiue, nor name thee
-
- Macb. and Lenox. What's the matter?
- Macd. Confusion now hath made his Master-peece:
-Most sacrilegious Murther hath broke ope
-The Lords anoynted Temple, and stole thence
-The Life o'th' Building
-
- Macb. What is't you say, the Life?
- Lenox. Meane you his Maiestie?
- Macd. Approch the Chamber, and destroy your sight
-With a new Gorgon. Doe not bid me speake:
-See, and then speake your selues: awake, awake,
-
-Exeunt Macbeth and Lenox.
-
-Ring the Alarum Bell: Murther, and Treason,
-Banquo, and Donalbaine: Malcolme awake,
-Shake off this Downey sleepe, Deaths counterfeit,
-And looke on Death it selfe: vp, vp, and see
-The great Doomes Image: Malcolme, Banquo,
-As from your Graues rise vp, and walke like Sprights,
-To countenance this horror. Ring the Bell.
-
-Bell rings. Enter Lady.
-
- Lady. What's the Businesse?
-That such a hideous Trumpet calls to parley
-The sleepers of the House? speake, speake
-
- Macd. O gentle Lady,
-'Tis not for you to heare what I can speake:
-The repetition in a Womans eare,
-Would murther as it fell.
-Enter Banquo.
-
-O Banquo, Banquo, Our Royall Master's murther'd
-
- Lady. Woe, alas:
-What, in our House?
- Ban. Too cruell, any where.
-Deare Duff, I prythee contradict thy selfe,
-And say, it is not so.
-Enter Macbeth, Lenox, and Rosse.
-
- Macb. Had I but dy'd an houre before this chance,
-I had liu'd a blessed time: for from this instant,
-There's nothing serious in Mortalitie:
-All is but Toyes: Renowne and Grace is dead,
-The Wine of Life is drawne, and the meere Lees
-Is left this Vault, to brag of.
-Enter Malcolme and Donalbaine.
-
- Donal. What is amisse?
- Macb. You are, and doe not know't:
-The Spring, the Head, the Fountaine of your Blood
-Is stopt, the very Source of it is stopt
-
- Macd. Your Royall Father's murther'd
-
- Mal. Oh, by whom?
- Lenox. Those of his Chamber, as it seem'd, had don't:
-Their Hands and Faces were all badg'd with blood,
-So were their Daggers, which vnwip'd, we found
-Vpon their Pillowes: they star'd, and were distracted,
-No mans Life was to be trusted with them
-
- Macb. O, yet I doe repent me of my furie,
-That I did kill them
-
- Macd. Wherefore did you so?
- Macb. Who can be wise, amaz'd, temp'rate, & furious,
-Loyall, and Neutrall, in a moment? No man:
-Th' expedition of my violent Loue
-Out-run the pawser, Reason. Here lay Duncan,
-His Siluer skinne, lac'd with His Golden Blood,
-And his gash'd Stabs, look'd like a Breach in Nature,
-For Ruines wastfull entrance: there the Murtherers,
-Steep'd in the Colours of their Trade; their Daggers
-Vnmannerly breech'd with gore: who could refraine,
-That had a heart to loue; and in that heart,
-Courage, to make's loue knowne?
- Lady. Helpe me hence, hoa
-
- Macd. Looke to the Lady
-
- Mal. Why doe we hold our tongues,
-That most may clayme this argument for ours?
- Donal. What should be spoken here,
-Where our Fate hid in an augure hole,
-May rush, and seize vs? Let's away,
-Our Teares are not yet brew'd
-
- Mal. Nor our strong Sorrow
-Vpon the foot of Motion
-
- Banq. Looke to the Lady:
-And when we haue our naked Frailties hid,
-That suffer in exposure; let vs meet,
-And question this most bloody piece of worke,
-To know it further. Feares and scruples shake vs:
-In the great Hand of God I stand, and thence,
-Against the vndivulg'd pretence, I fight
-Of Treasonous Mallice
-
- Macd. And so doe I
-
- All. So all
-
- Macb. Let's briefely put on manly readinesse,
-And meet i'th' Hall together
-
- All. Well contented.
-
-Exeunt.
-
- Malc. What will you doe?
-Let's not consort with them:
-To shew an vnfelt Sorrow, is an Office
-Which the false man do's easie.
-Ile to England
-
- Don. To Ireland, I:
-Our seperated fortune shall keepe vs both the safer:
-Where we are, there's Daggers in mens smiles;
-The neere in blood, the neerer bloody
-
- Malc. This murtherous Shaft that's shot,
-Hath not yet lighted: and our safest way,
-Is to auoid the ayme. Therefore to Horse,
-And let vs not be daintie of leaue-taking,
-But shift away: there's warrant in that Theft,
-Which steales it selfe, when there's no mercie left.
-
-Exeunt.
-
-
-
-Scena Quarta.
-
-Enter Rosse, with an Old man.
-
- Old man. Threescore and ten I can remember well,
-Within the Volume of which Time, I haue seene
-Houres dreadfull, and things strange: but this sore Night
-Hath trifled former knowings
-
- Rosse. Ha, good Father,
-Thou seest the Heauens, as troubled with mans Act,
-Threatens his bloody Stage: byth' Clock 'tis Day,
-And yet darke Night strangles the trauailing Lampe:
-Is't Nights predominance, or the Dayes shame,
-That Darknesse does the face of Earth intombe,
-When liuing Light should kisse it?
- Old man. 'Tis vnnaturall,
-Euen like the deed that's done: On Tuesday last,
-A Faulcon towring in her pride of place,
-Was by a Mowsing Owle hawkt at, and kill'd
-
- Rosse. And Duncans Horses,
-(A thing most strange, and certaine)
-Beauteous, and swift, the Minions of their Race,
-Turn'd wilde in nature, broke their stalls, flong out,
-Contending 'gainst Obedience, as they would
-Make Warre with Mankinde
-
- Old man. 'Tis said, they eate each other
-
- Rosse. They did so:
-To th' amazement of mine eyes that look'd vpon't.
-Enter Macduffe.
-
-Heere comes the good Macduffe.
-How goes the world Sir, now?
- Macd. Why see you not?
- Ross. Is't known who did this more then bloody deed?
- Macd. Those that Macbeth hath slaine
-
- Ross. Alas the day,
-What good could they pretend?
- Macd. They were subborned,
-Malcolme, and Donalbaine the Kings two Sonnes
-Are stolne away and fled, which puts vpon them
-Suspition of the deed
-
- Rosse. 'Gainst Nature still,
-Thriftlesse Ambition, that will rauen vp
-Thine owne liues meanes: Then 'tis most like,
-The Soueraignty will fall vpon Macbeth
-
- Macd. He is already nam'd, and gone to Scone
-To be inuested
-
- Rosse. Where is Duncans body?
- Macd. Carried to Colmekill,
-The Sacred Store-house of his Predecessors,
-And Guardian of their Bones
-
- Rosse. Will you to Scone?
- Macd. No Cosin, Ile to Fife
-
- Rosse. Well, I will thither
-
- Macd. Well may you see things wel done there: Adieu
-Least our old Robes sit easier then our new
-
- Rosse. Farewell, Father
-
- Old M. Gods benyson go with you, and with those
-That would make good of bad, and Friends of Foes.
-
-Exeunt omnes.
-
-Actus Tertius. Scena Prima.
-
-Enter Banquo.
-
- Banq. Thou hast it now, King, Cawdor, Glamis, all,
-As the weyard Women promis'd, and I feare
-Thou playd'st most fowly for't: yet it was saide
-It should not stand in thy Posterity,
-But that my selfe should be the Roote, and Father
-Of many Kings. If there come truth from them,
-As vpon thee Macbeth, their Speeches shine,
-Why by the verities on thee made good,
-May they not be my Oracles as well,
-And set me vp in hope. But hush, no more.
-
-Senit sounded. Enter Macbeth as King, Lady Lenox, Rosse, Lords,
-and
-Attendants.
-
- Macb. Heere's our chiefe Guest
-
- La. If he had beene forgotten,
-It had bene as a gap in our great Feast,
-And all-thing vnbecomming
-
- Macb. To night we hold a solemne Supper sir,
-And Ile request your presence
-
- Banq. Let your Highnesse
-Command vpon me, to the which my duties
-Are with a most indissoluble tye
-For euer knit
-
- Macb. Ride you this afternoone?
- Ban. I, my good Lord
-
- Macb. We should haue else desir'd your good aduice
-(Which still hath been both graue, and prosperous)
-In this dayes Councell: but wee'le take to morrow.
-Is't farre you ride?
- Ban. As farre, my Lord, as will fill vp the time
-'Twixt this, and Supper. Goe not my Horse the better,
-I must become a borrower of the Night,
-For a darke houre, or twaine
-
- Macb. Faile not our Feast
-
- Ban. My Lord, I will not
-
- Macb. We heare our bloody Cozens are bestow'd
-In England, and in Ireland, not confessing
-Their cruell Parricide, filling their hearers
-With strange inuention. But of that to morrow,
-When therewithall, we shall haue cause of State,
-Crauing vs ioyntly. Hye you to Horse:
-Adieu, till you returne at Night.
-Goes Fleance with you?
- Ban. I, my good Lord: our time does call vpon's
-
- Macb. I wish your Horses swift, and sure of foot:
-And so I doe commend you to their backs.
-Farwell.
-
-Exit Banquo.
-
-Let euery man be master of his time,
-Till seuen at Night, to make societie
-The sweeter welcome:
-We will keepe our selfe till Supper time alone:
-While then, God be with you.
-
-Exeunt Lords.
-
-Sirrha, a word with you: Attend those men
-Our pleasure?
- Seruant. They are, my Lord, without the Pallace
-Gate
-
- Macb. Bring them before vs.
-
-Exit Seruant.
-
-To be thus, is nothing, but to be safely thus
-Our feares in Banquo sticke deepe,
-And in his Royaltie of Nature reignes that
-Which would be fear'd. 'Tis much he dares,
-And to that dauntlesse temper of his Minde,
-He hath a Wisdome, that doth guide his Valour,
-To act in safetie. There is none but he,
-Whose being I doe feare: and vnder him,
-My Genius is rebuk'd, as it is said
-Mark Anthonies was by Caesar. He chid the Sisters,
-When first they put the Name of King vpon me,
-And bad them speake to him. Then Prophet-like,
-They hayl'd him Father to a Line of Kings.
-Vpon my Head they plac'd a fruitlesse Crowne,
-And put a barren Scepter in my Gripe,
-Thence to be wrencht with an vnlineall Hand,
-No Sonne of mine succeeding: if't be so,
-For Banquo's Issue haue I fil'd my Minde,
-For them, the gracious Duncan haue I murther'd,
-Put Rancours in the Vessell of my Peace
-Onely for them, and mine eternall Iewell
-Giuen to the common Enemie of Man,
-To make them Kings, the Seedes of Banquo Kings.
-Rather then so, come Fate into the Lyst,
-And champion me to th' vtterance.
-Who's there?
-Enter Seruant, and two Murtherers.
-
-Now goe to the Doore, and stay there till we call.
-
-Exit Seruant.
-
-Was it not yesterday we spoke together?
- Murth. It was, so please your Highnesse
-
- Macb. Well then,
-Now haue you consider'd of my speeches:
-Know, that it was he, in the times past,
-Which held you so vnder fortune,
-Which you thought had been our innocent selfe.
-This I made good to you, in our last conference,
-Past in probation with you:
-How you were borne in hand, how crost:
-The Instruments: who wrought with them:
-And all things else, that might
-To halfe a Soule, and to a Notion craz'd,
-Say, Thus did Banquo
-
- 1.Murth. You made it knowne to vs
-
- Macb. I did so:
-And went further, which is now
-Our point of second meeting.
-Doe you finde your patience so predominant,
-In your nature, that you can let this goe?
-Are you so Gospell'd, to pray for this good man,
-And for his Issue, whose heauie hand
-Hath bow'd you to the Graue, and begger'd
-Yours for euer?
- 1.Murth. We are men, my Liege
-
- Macb. I, in the Catalogue ye goe for men,
-As Hounds, and Greyhounds, Mungrels, Spaniels, Curres,
-Showghes, Water-Rugs, and Demy-Wolues are clipt
-All by the Name of Dogges: the valued file
-Distinguishes the swift, the slow, the subtle,
-The House-keeper, the Hunter, euery one
-According to the gift, which bounteous Nature
-Hath in him clos'd: whereby he does receiue
-Particular addition, from the Bill,
-That writes them all alike: and so of men.
-Now, if you haue a station in the file,
-Not i'th' worst ranke of Manhood, say't,
-And I will put that Businesse in your Bosomes,
-Whose execution takes your Enemie off,
-Grapples you to the heart; and loue of vs,
-Who weare our Health but sickly in his Life,
-Which in his Death were perfect
-
- 2.Murth. I am one, my Liege,
-Whom the vile Blowes and Buffets of the World
-Hath so incens'd, that I am recklesse what I doe,
-To spight the World
-
- 1.Murth. And I another,
-So wearie with Disasters, tugg'd with Fortune,
-That I would set my Life on any Chance,
-To mend it, or be rid on't
-
- Macb. Both of you know Banquo was your Enemie
-
- Murth. True, my Lord
-
- Macb. So is he mine: and in such bloody distance,
-That euery minute of his being, thrusts
-Against my neer'st of Life: and though I could
-With bare-fac'd power sweepe him from my sight,
-And bid my will auouch it; yet I must not,
-For certaine friends that are both his, and mine,
-Whose loues I may not drop, but wayle his fall,
-Who I my selfe struck downe: and thence it is,
-That I to your assistance doe make loue,
-Masking the Businesse from the common Eye,
-For sundry weightie Reasons
-
- 2.Murth. We shall, my Lord,
-Performe what you command vs
-
- 1.Murth. Though our Liues-
- Macb. Your Spirits shine through you.
-Within this houre, at most,
-I will aduise you where to plant your selues,
-Acquaint you with the perfect Spy o'th' time,
-The moment on't, for't must be done to Night,
-And something from the Pallace: alwayes thought,
-That I require a clearenesse; and with him,
-To leaue no Rubs nor Botches in the Worke:
-Fleans, his Sonne, that keepes him companie,
-Whose absence is no lesse materiall to me,
-Then is his Fathers, must embrace the fate
-Of that darke houre: resolue your selues apart,
-Ile come to you anon
-
- Murth. We are resolu'd, my Lord
-
- Macb. Ile call vpon you straight: abide within,
-It is concluded: Banquo, thy Soules flight,
-If it finde Heauen, must finde it out to Night.
-
-Exeunt.
-
-
-Scena Secunda.
-
-Enter Macbeths Lady, and a Seruant.
-
- Lady. Is Banquo gone from Court?
- Seruant. I, Madame, but returnes againe to Night
-
- Lady. Say to the King, I would attend his leysure,
-For a few words
-
- Seruant. Madame, I will.
-Exit.
-
- Lady. Nought's had, all's spent.
-Where our desire is got without content:
-'Tis safer, to be that which we destroy,
-Then by destruction dwell in doubtfull ioy.
-Enter Macbeth.
-
-How now, my Lord, why doe you keepe alone?
-Of sorryest Fancies your Companions making,
-Vsing those Thoughts, which should indeed haue dy'd
-With them they thinke on: things without all remedie
-Should be without regard: what's done, is done
-
- Macb. We haue scorch'd the Snake, not kill'd it:
-Shee'le close, and be her selfe, whilest our poore Mallice
-Remaines in danger of her former Tooth.
-But let the frame of things dis-ioynt,
-Both the Worlds suffer,
-Ere we will eate our Meale in feare, and sleepe
-In the affliction of these terrible Dreames,
-That shake vs Nightly: Better be with the dead,
-Whom we, to gayne our peace, haue sent to peace,
-Then on the torture of the Minde to lye
-In restlesse extasie.
-Duncane is in his Graue:
-After Lifes fitfull Feuer, he sleepes well,
-Treason ha's done his worst: nor Steele, nor Poyson,
-Mallice domestique, forraine Leuie, nothing,
-Can touch him further
-
- Lady. Come on:
-Gentle my Lord, sleeke o're your rugged Lookes,
-Be bright and Iouiall among your Guests to Night
-
- Macb. So shall I Loue, and so I pray be you:
-Let your remembrance apply to Banquo,
-Present him Eminence, both with Eye and Tongue:
-Vnsafe the while, that wee must laue
-Our Honors in these flattering streames,
-And make our Faces Vizards to our Hearts,
-Disguising what they are
-
- Lady. You must leaue this
-
- Macb. O, full of Scorpions is my Minde, deare Wife:
-Thou know'st, that Banquo and his Fleans liues
-
- Lady. But in them, Natures Coppie's not eterne
-
- Macb. There's comfort yet, they are assaileable,
-Then be thou iocund: ere the Bat hath flowne
-His Cloyster'd flight, ere to black Heccats summons
-The shard-borne Beetle, with his drowsie hums,
-Hath rung Nights yawning Peale,
-There shall be done a deed of dreadfull note
-
- Lady. What's to be done?
- Macb. Be innocent of the knowledge, dearest Chuck,
-Till thou applaud the deed: Come, seeling Night,
-Skarfe vp the tender Eye of pittifull Day,
-And with thy bloodie and inuisible Hand
-Cancell and teare to pieces that great Bond,
-Which keepes me pale. Light thickens,
-And the Crow makes Wing toth' Rookie Wood:
-Good things of Day begin to droope, and drowse,
-Whiles Nights black Agents to their Prey's doe rowse.
-Thou maruell'st at my words: but hold thee still,
-Things bad begun, make strong themselues by ill:
-So prythee goe with me.
-
-Exeunt.
-
-
-Scena Tertia.
-
-Enter three Murtherers.
-
- 1. But who did bid thee ioyne with vs?
- 3. Macbeth
-
- 2. He needes not our mistrust, since he deliuers
-Our Offices, and what we haue to doe,
-To the direction iust
-
- 1. Then stand with vs:
-The West yet glimmers with some streakes of Day.
-Now spurres the lated Traueller apace,
-To gayne the timely Inne, and neere approches
-The subiect of our Watch
-
- 3. Hearke, I heare Horses
-
- Banquo within. Giue vs a Light there, hoa
-
- 2. Then 'tis hee:
-The rest, that are within the note of expectation,
-Alreadie are i'th' Court
-
- 1. His Horses goe about
-
- 3. Almost a mile: but he does vsually,
-So all men doe, from hence toth' Pallace Gate
-Make it their Walke.
-Enter Banquo and Fleans, with a Torch.
-
- 2. A Light, a Light
-
- 3. 'Tis hee
-
- 1. Stand too't
-
- Ban. It will be Rayne to Night
-
- 1. Let it come downe
-
- Ban. O, Trecherie!
-Flye good Fleans, flye, flye, flye,
-Thou may'st reuenge. O Slaue!
- 3. Who did strike out the Light?
- 1. Was't not the way?
- 3. There's but one downe: the Sonne is fled
-
- 2. We haue lost
-Best halfe of our Affaire
-
- 1. Well, let's away, and say how much is done.
-
-Exeunt.
-
-
-Scaena Quarta.
-
-Banquet prepar'd. Enter Macbeth, Lady, Rosse, Lenox, Lords, and
-Attendants.
-
- Macb. You know your owne degrees, sit downe:
-At first and last, the hearty welcome
-
- Lords. Thankes to your Maiesty
-
- Macb. Our selfe will mingle with Society,
-And play the humble Host:
-Our Hostesse keepes her State, but in best time
-We will require her welcome
-
- La. Pronounce it for me Sir, to all our Friends,
-For my heart speakes, they are welcome.
-Enter first Murtherer.
-
- Macb. See they encounter thee with their harts thanks
-Both sides are euen: heere Ile sit i'th' mid'st,
-Be large in mirth, anon wee'l drinke a Measure
-The Table round. There's blood vpon thy face
-
- Mur. 'Tis Banquo's then
-
- Macb. 'Tis better thee without, then he within.
-Is he dispatch'd?
- Mur. My Lord his throat is cut, that I did for him
-
- Mac. Thou art the best o'th' Cut-throats,
-Yet hee's good that did the like for Fleans:
-If thou did'st it, thou art the Non-pareill
-
- Mur. Most Royall Sir
-Fleans is scap'd
-
- Macb. Then comes my Fit againe:
-I had else beene perfect;
-Whole as the Marble, founded as the Rocke,
-As broad, and generall, as the casing Ayre:
-But now I am cabin'd, crib'd, confin'd, bound in
-To sawcy doubts, and feares. But Banquo's safe?
- Mur. I, my good Lord: safe in a ditch he bides,
-With twenty trenched gashes on his head;
-The least a Death to Nature
-
- Macb. Thankes for that:
-There the growne Serpent lyes, the worme that's fled
-Hath Nature that in time will Venom breed,
-No teeth for th' present. Get thee gone, to morrow
-Wee'l heare our selues againe.
-
-Exit Murderer.
-
- Lady. My Royall Lord,
-You do not giue the Cheere, the Feast is sold
-That is not often vouch'd, while 'tis a making:
-'Tis giuen, with welcome: to feede were best at home:
-From thence, the sawce to meate is Ceremony,
-Meeting were bare without it.
-Enter the Ghost of Banquo, and sits in Macbeths place.
-
- Macb. Sweet Remembrancer:
-Now good digestion waite on Appetite,
-And health on both
-
- Lenox. May't please your Highnesse sit
-
- Macb. Here had we now our Countries Honor, roof'd,
-Were the grac'd person of our Banquo present:
-Who, may I rather challenge for vnkindnesse,
-Then pitty for Mischance
-
- Rosse. His absence (Sir)
-Layes blame vpon his promise. Pleas't your Highnesse
-To grace vs with your Royall Company?
- Macb. The Table's full
-
- Lenox. Heere is a place reseru'd Sir
-
- Macb. Where?
- Lenox. Heere my good Lord.
-What is't that moues your Highnesse?
- Macb. Which of you haue done this?
- Lords. What, my good Lord?
- Macb. Thou canst not say I did it: neuer shake
-Thy goary lockes at me
-
- Rosse. Gentlemen rise, his Highnesse is not well
-
- Lady. Sit worthy Friends: my Lord is often thus,
-And hath beene from his youth. Pray you keepe Seat,
-The fit is momentary, vpon a thought
-He will againe be well. If much you note him
-You shall offend him, and extend his Passion,
-Feed, and regard him not. Are you a man?
- Macb. I, and a bold one, that dare looke on that
-Which might appall the Diuell
-
- La. O proper stuffe:
-This is the very painting of your feare:
-This is the Ayre-drawne-Dagger which you said
-Led you to Duncan. O, these flawes and starts
-(Impostors to true feare) would well become
-A womans story, at a Winters fire
-Authoriz'd by her Grandam: shame it selfe,
-Why do you make such faces? When all's done
-You looke but on a stoole
-
- Macb. Prythee see there:
-Behold, looke, loe, how say you:
-Why what care I, if thou canst nod, speake too.
-If Charnell houses, and our Graues must send
-Those that we bury, backe; our Monuments
-Shall be the Mawes of Kytes
-
- La. What? quite vnmann'd in folly
-
- Macb. If I stand heere, I saw him
-
- La. Fie for shame
-
- Macb. Blood hath bene shed ere now, i'th' olden time
-Ere humane Statute purg'd the gentle Weale:
-I, and since too, Murthers haue bene perform'd
-Too terrible for the eare. The times has bene,
-That when the Braines were out, the man would dye,
-And there an end: But now they rise againe
-With twenty mortall murthers on their crownes,
-And push vs from our stooles. This is more strange
-Then such a murther is
-
- La. My worthy Lord
-Your Noble Friends do lacke you
-
- Macb. I do forget:
-Do not muse at me my most worthy Friends,
-I haue a strange infirmity, which is nothing
-To those that know me. Come, loue and health to all,
-Then Ile sit downe: Giue me some Wine, fill full:
-Enter Ghost.
-
-I drinke to th' generall ioy o'th' whole Table,
-And to our deere Friend Banquo, whom we misse:
-Would he were heere: to all, and him we thirst,
-And all to all
-
- Lords. Our duties, and the pledge
-
- Mac. Auant, & quit my sight, let the earth hide thee:
-Thy bones are marrowlesse, thy blood is cold:
-Thou hast no speculation in those eyes
-Which thou dost glare with
-
- La. Thinke of this good Peeres
-But as a thing of Custome: 'Tis no other,
-Onely it spoyles the pleasure of the time
-
- Macb. What man dare, I dare:
-Approach thou like the rugged Russian Beare,
-The arm'd Rhinoceros, or th' Hircan Tiger,
-Take any shape but that, and my firme Nerues
-Shall neuer tremble. Or be aliue againe,
-And dare me to the Desart with thy Sword:
-If trembling I inhabit then, protest mee
-The Baby of a Girle. Hence horrible shadow,
-Vnreall mock'ry hence. Why so, being gone
-I am a man againe: pray you sit still
-
- La. You haue displac'd the mirth,
-Broke the good meeting, with most admir'd disorder
-
- Macb. Can such things be,
-And ouercome vs like a Summers Clowd,
-Without our speciall wonder? You make me strange
-Euen to the disposition that I owe,
-When now I thinke you can behold such sights,
-And keepe the naturall Rubie of your Cheekes,
-When mine is blanch'd with feare
-
- Rosse. What sights, my Lord?
- La. I pray you speake not: he growes worse & worse
-Question enrages him: at once, goodnight.
-Stand not vpon the order of your going,
-But go at once
-
- Len. Good night, and better health
-Attend his Maiesty
-
- La. A kinde goodnight to all.
-
-Exit Lords.
-
- Macb. It will haue blood they say:
-Blood will haue Blood:
-Stones haue beene knowne to moue, & Trees to speake:
-Augures, and vnderstood Relations, haue
-By Maggot Pyes, & Choughes, & Rookes brought forth
-The secret'st man of Blood. What is the night?
- La. Almost at oddes with morning, which is which
-
- Macb. How say'st thou that Macduff denies his person
-At our great bidding
-
- La. Did you send to him Sir?
- Macb. I heare it by the way: But I will send:
-There's not a one of them but in his house
-I keepe a Seruant Feed. I will to morrow
-(And betimes I will) to the weyard Sisters.
-More shall they speake: for now I am bent to know
-By the worst meanes, the worst, for mine owne good,
-All causes shall giue way. I am in blood
-Stept in so farre, that should I wade no more,
-Returning were as tedious as go ore:
-Strange things I haue in head, that will to hand,
-Which must be acted, ere they may be scand
-
- La. You lacke the season of all Natures, sleepe
-
- Macb. Come, wee'l to sleepe: My strange & self-abuse
-Is the initiate feare, that wants hard vse:
-We are yet but yong indeed.
-
-Exeunt.
-
-
-Scena Quinta.
-
-Thunder. Enter the three Witches, meeting Hecat.
-
- 1. Why how now Hecat, you looke angerly?
- Hec. Haue I not reason (Beldams) as you are?
-Sawcy, and ouer-bold, how did you dare
-To Trade, and Trafficke with Macbeth,
-In Riddles, and Affaires of death;
-And I the Mistris of your Charmes,
-The close contriuer of all harmes,
-Was neuer call'd to beare my part,
-Or shew the glory of our Art?
-And which is worse, all you haue done
-Hath bene but for a wayward Sonne,
-Spightfull, and wrathfull, who (as others do)
-Loues for his owne ends, not for you.
-But make amends now: Get you gon,
-And at the pit of Acheron
-Meete me i'th' Morning: thither he
-Will come, to know his Destinie.
-Your Vessels, and your Spels prouide,
-Your Charmes, and euery thing beside;
-I am for th' Ayre: This night Ile spend
-Vnto a dismall, and a Fatall end.
-Great businesse must be wrought ere Noone.
-Vpon the Corner of the Moone
-There hangs a vap'rous drop, profound,
-Ile catch it ere it come to ground;
-And that distill'd by Magicke slights,
-Shall raise such Artificiall Sprights,
-As by the strength of their illusion,
-Shall draw him on to his Confusion.
-He shall spurne Fate, scorne Death, and beare
-His hopes 'boue Wisedome, Grace, and Feare:
-And you all know, Security
-Is Mortals cheefest Enemie.
-
-Musicke, and a Song.
-
-Hearke, I am call'd: my little Spirit see
-Sits in Foggy cloud, and stayes for me.
-
-Sing within. Come away, come away, &c.
-
- 1 Come, let's make hast, shee'l soone be
-Backe againe.
-
-Exeunt.
-
-
-Scaena Sexta.
-
-Enter Lenox, and another Lord.
-
- Lenox. My former Speeches,
-Haue but hit your Thoughts
-Which can interpret farther: Onely I say
-Things haue bin strangely borne. The gracious Duncan
-Was pittied of Macbeth: marry he was dead:
-And the right valiant Banquo walk'd too late,
-Whom you may say (if't please you) Fleans kill'd,
-For Fleans fled: Men must not walke too late.
-Who cannot want the thought, how monstrous
-It was for Malcolme, and for Donalbane
-To kill their gracious Father? Damned Fact,
-How it did greeue Macbeth? Did he not straight
-In pious rage, the two delinquents teare,
-That were the Slaues of drinke, and thralles of sleepe?
-Was not that Nobly done? I, and wisely too:
-For 'twould haue anger'd any heart aliue
-To heare the men deny't. So that I say,
-He ha's borne all things well, and I do thinke,
-That had he Duncans Sonnes vnder his Key,
-(As, and't please Heauen he shall not) they should finde
-What 'twere to kill a Father: So should Fleans.
-But peace; for from broad words, and cause he fayl'd
-His presence at the Tyrants Feast, I heare
-Macduffe liues in disgrace. Sir, can you tell
-Where he bestowes himselfe?
- Lord. The Sonnes of Duncane
-(From whom this Tyrant holds the due of Birth)
-Liues in the English Court, and is receyu'd
-Of the most Pious Edward, with such grace,
-That the maleuolence of Fortune, nothing
-Takes from his high respect. Thither Macduffe
-Is gone, to pray the Holy King, vpon his ayd
-To wake Northumberland, and warlike Seyward,
-That by the helpe of these (with him aboue)
-To ratifie the Worke) we may againe
-Giue to our Tables meate, sleepe to our Nights:
-Free from our Feasts, and Banquets bloody kniues;
-Do faithfull Homage, and receiue free Honors,
-All which we pine for now. And this report
-Hath so exasperate their King, that hee
-Prepares for some attempt of Warre
-
- Len. Sent he to Macduffe?
- Lord. He did: and with an absolute Sir, not I
-The clowdy Messenger turnes me his backe,
-And hums; as who should say, you'l rue the time
-That clogges me with this Answer
-
- Lenox. And that well might
-Aduise him to a Caution, t' hold what distance
-His wisedome can prouide. Some holy Angell
-Flye to the Court of England, and vnfold
-His Message ere he come, that a swift blessing
-May soone returne to this our suffering Country,
-Vnder a hand accurs'd
-
- Lord. Ile send my Prayers with him.
-
-Exeunt.
-
-Actus Quartus. Scena Prima.
-
-Thunder. Enter the three Witches.
-
- 1 Thrice the brinded Cat hath mew'd
-
- 2 Thrice, and once the Hedge-Pigge whin'd
-
- 3 Harpier cries, 'tis time, 'tis time
-
- 1 Round about the Caldron go:
-In the poysond Entrailes throw
-Toad, that vnder cold stone,
-Dayes and Nights, ha's thirty one:
-Sweltred Venom sleeping got,
-Boyle thou first i'th' charmed pot
-
- All. Double, double, toile and trouble;
-Fire burne, and Cauldron bubble
-
- 2 Fillet of a Fenny Snake,
-In the Cauldron boyle and bake:
-Eye of Newt, and Toe of Frogge,
-Wooll of Bat, and Tongue of Dogge:
-Adders Forke, and Blinde-wormes Sting,
-Lizards legge, and Howlets wing:
-For a Charme of powrefull trouble,
-Like a Hell-broth, boyle and bubble
-
- All. Double, double, toyle and trouble,
-Fire burne, and Cauldron bubble
-
- 3 Scale of Dragon, Tooth of Wolfe,
-Witches Mummey, Maw, and Gulfe
-Of the rauin'd salt Sea sharke:
-Roote of Hemlocke, digg'd i'th' darke:
-Liuer of Blaspheming Iew,
-Gall of Goate, and Slippes of Yew,
-Sliuer'd in the Moones Ecclipse:
-Nose of Turke, and Tartars lips:
-Finger of Birth-strangled Babe,
-Ditch-deliuer'd by a Drab,
-Make the Grewell thicke, and slab.
-Adde thereto a Tigers Chawdron,
-For th' Ingredience of our Cawdron
-
- All. Double, double, toyle and trouble,
-Fire burne, and Cauldron bubble
-
- 2 Coole it with a Baboones blood,
-Then the Charme is firme and good.
-Enter Hecat, and the other three Witches.
-
- Hec. O well done: I commend your paines,
-And euery one shall share i'th' gaines:
-And now about the Cauldron sing
-Like Elues and Fairies in a Ring,
-Inchanting all that you put in.
-
-Musicke and a Song. Blacke Spirits, &c.
-
- 2 By the pricking of my Thumbes,
-Something wicked this way comes:
-Open Lockes, who euer knockes.
-Enter Macbeth.
-
- Macb. How now you secret, black, & midnight Hags?
-What is't you do?
- All. A deed without a name
-
- Macb. I coniure you, by that which you Professe,
-(How ere you come to know it) answer me:
-Though you vntye the Windes, and let them fight
-Against the Churches: Though the yesty Waues
-Confound and swallow Nauigation vp:
-Though bladed Corne be lodg'd, & Trees blown downe,
-Though Castles topple on their Warders heads:
-Though Pallaces, and Pyramids do slope
-Their heads to their Foundations: Though the treasure
-Of Natures Germaine, tumble altogether,
-Euen till destruction sicken: Answer me
-To what I aske you
-
- 1 Speake
-
- 2 Demand
-
- 3 Wee'l answer
-
- 1 Say, if th'hadst rather heare it from our mouthes,
-Or from our Masters
-
- Macb. Call 'em: let me see 'em
-
- 1 Powre in Sowes blood, that hath eaten
-Her nine Farrow: Greaze that's sweaten
-From the Murderers Gibbet, throw
-Into the Flame
-
- All. Come high or low:
-Thy Selfe and Office deaftly show.
-Thunder. 1. Apparation, an Armed Head.
-
- Macb. Tell me, thou vnknowne power
-
- 1 He knowes thy thought:
-Heare his speech, but say thou nought
-
- 1 Appar. Macbeth, Macbeth, Macbeth:
-Beware Macduffe,
-Beware the Thane of Fife: dismisse me. Enough.
-
-He Descends.
-
- Macb. What ere thou art, for thy good caution, thanks
-Thou hast harp'd my feare aright. But one word more
-
- 1 He will not be commanded: heere's another
-More potent then the first.
-
-Thunder. 2 Apparition, a Bloody Childe.
-
- 2 Appar. Macbeth, Macbeth, Macbeth
-
- Macb. Had I three eares, Il'd heare thee
-
- Appar. Be bloody, bold, & resolute:
-Laugh to scorne
-The powre of man: For none of woman borne
-Shall harme Macbeth.
-
-Descends.
-
- Mac. Then liue Macduffe: what need I feare of thee?
-But yet Ile make assurance: double sure,
-And take a Bond of Fate: thou shalt not liue,
-That I may tell pale-hearted Feare, it lies;
-And sleepe in spight of Thunder.
-
-Thunder 3 Apparation, a Childe Crowned, with a Tree in his hand.
-
-What is this, that rises like the issue of a King,
-And weares vpon his Baby-brow, the round
-And top of Soueraignty?
- All. Listen, but speake not too't
-
- 3 Appar. Be Lyon metled, proud, and take no care:
-Who chafes, who frets, or where Conspirers are:
-Macbeth shall neuer vanquish'd be, vntill
-Great Byrnam Wood, to high Dunsmane Hill
-Shall come against him.
-
-Descend.
-
- Macb. That will neuer bee:
-Who can impresse the Forrest, bid the Tree
-Vnfixe his earth-bound Root? Sweet boadments, good:
-Rebellious dead, rise neuer till the Wood
-Of Byrnan rise, and our high plac'd Macbeth
-Shall liue the Lease of Nature, pay his breath
-To time, and mortall Custome. Yet my Hart
-Throbs to know one thing: Tell me, if your Art
-Can tell so much: Shall Banquo's issue euer
-Reigne in this Kingdome?
- All. Seeke to know no more
-
- Macb. I will be satisfied. Deny me this,
-And an eternall Curse fall on you: Let me know.
-Why sinkes that Caldron? & what noise is this?
-
-Hoboyes
-
- 1 Shew
-
- 2 Shew
-
- 3 Shew
-
- All. Shew his Eyes, and greeue his Hart,
-Come like shadowes, so depart.
-
-A shew of eight Kings, and Banquo last, with a glasse in his hand.
-
- Macb. Thou art too like the Spirit of Banquo: Down:
-Thy Crowne do's seare mine Eye-bals. And thy haire
-Thou other Gold-bound-brow, is like the first:
-A third, is like the former. Filthy Hagges,
-Why do you shew me this? - A fourth? Start eyes!
-What will the Line stretch out to'th' cracke of Doome?
-Another yet? A seauenth? Ile see no more:
-And yet the eighth appeares, who beares a glasse,
-Which shewes me many more: and some I see,
-That two-fold Balles, and trebble Scepters carry.
-Horrible sight: Now I see 'tis true,
-For the Blood-bolter'd Banquo smiles vpon me,
-And points at them for his. What? is this so?
- 1 I Sir, all this is so. But why
-Stands Macbeth thus amazedly?
-Come Sisters, cheere we vp his sprights,
-And shew the best of our delights.
-Ile Charme the Ayre to giue a sound,
-While you performe your Antique round:
-That this great King may kindly say,
-Our duties, did his welcome pay.
-
-Musicke. The Witches Dance, and vanish.
-
- Macb. Where are they? Gone?
-Let this pernitious houre,
-Stand aye accursed in the Kalender.
-Come in, without there.
-Enter Lenox.
-
- Lenox. What's your Graces will
-
- Macb. Saw you the Weyard Sisters?
- Lenox. No my Lord
-
- Macb. Came they not by you?
- Lenox. No indeed my Lord
-
- Macb. Infected be the Ayre whereon they ride,
-And damn'd all those that trust them. I did heare
-The gallopping of Horse. Who was't came by?
- Len. 'Tis two or three my Lord, that bring you word:
-Macduff is fled to England
-
- Macb. Fled to England?
- Len. I, my good Lord
-
- Macb. Time, thou anticipat'st my dread exploits:
-The flighty purpose neuer is o're-tooke
-Vnlesse the deed go with it. From this moment,
-The very firstlings of my heart shall be
-The firstlings of my hand. And euen now
-To Crown my thoughts with Acts: be it thoght & done:
-The Castle of Macduff, I will surprize.
-Seize vpon Fife; giue to th' edge o'th' Sword
-His Wife, his Babes, and all vnfortunate Soules
-That trace him in his Line. No boasting like a Foole,
-This deed Ile do, before this purpose coole,
-But no more sights. Where are these Gentlemen?
-Come bring me where they are.
-
-Exeunt.
-
-Scena Secunda.
-
-Enter Macduffes Wife, her Son, and Rosse.
-
- Wife. What had he done, to make him fly the Land?
- Rosse. You must haue patience Madam
-
- Wife. He had none:
-His flight was madnesse: when our Actions do not,
-Our feares do make vs Traitors
-
- Rosse. You know not
-Whether it was his wisedome, or his feare
-
- Wife. Wisedom? to leaue his wife, to leaue his Babes,
-His Mansion, and his Titles, in a place
-From whence himselfe do's flye? He loues vs not,
-He wants the naturall touch. For the poore Wren
-(The most diminitiue of Birds) will fight,
-Her yong ones in her Nest, against the
<TRUNCATED>
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/it/resources/sort_by_value.txt
----------------------------------------------------------------------
diff --git a/crunch/src/it/resources/sort_by_value.txt b/crunch/src/it/resources/sort_by_value.txt
deleted file mode 100644
index 73f7d11..0000000
--- a/crunch/src/it/resources/sort_by_value.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-A 2
-B 1
-C 3
-D 2
-E 1
[14/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/Pipeline.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/Pipeline.java b/crunch/src/main/java/org/apache/crunch/Pipeline.java
deleted file mode 100644
index 84c720c..0000000
--- a/crunch/src/main/java/org/apache/crunch/Pipeline.java
+++ /dev/null
@@ -1,138 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import org.apache.hadoop.conf.Configuration;
-
-/**
- * Manages the state of a pipeline execution.
- *
- */
-public interface Pipeline {
-
- /**
- * Set the {@code Configuration} to use with this pipeline.
- */
- void setConfiguration(Configuration conf);
-
- /**
- * Returns the name of this pipeline.
- *
- * @return Name of the pipeline
- */
- String getName();
-
- /**
- * Returns the {@code Configuration} instance associated with this pipeline.
- */
- Configuration getConfiguration();
-
- /**
- * Converts the given {@code Source} into a {@code PCollection} that is
- * available to jobs run using this {@code Pipeline} instance.
- *
- * @param source
- * The source of data
- * @return A PCollection that references the given source
- */
- <T> PCollection<T> read(Source<T> source);
-
- /**
- * A version of the read method for {@code TableSource} instances that map to
- * {@code PTable}s.
- *
- * @param tableSource
- * The source of the data
- * @return A PTable that references the given source
- */
- <K, V> PTable<K, V> read(TableSource<K, V> tableSource);
-
- /**
- * Write the given collection to the given target on the next pipeline run. The
- * system will check to see if the target's location already exists using the
- * {@code WriteMode.DEFAULT} rule for the given {@code Target}.
- *
- * @param collection
- * The collection
- * @param target
- * The output target
- */
- void write(PCollection<?> collection, Target target);
-
- /**
- * Write the contents of the {@code PCollection} to the given {@code Target},
- * using the storage format specified by the target and the given
- * {@code WriteMode} for cases where the referenced {@code Target}
- * already exists.
- *
- * @param collection
- * The collection
- * @param target
- * The target to write to
- * @param writeMode
- * The strategy to use for handling existing outputs
- */
- void write(PCollection<?> collection, Target target,
- Target.WriteMode writeMode);
-
- /**
- * Create the given PCollection and read the data it contains into the
- * returned Collection instance for client use.
- *
- * @param pcollection
- * The PCollection to materialize
- * @return the data from the PCollection as a read-only Collection
- */
- <T> Iterable<T> materialize(PCollection<T> pcollection);
-
- /**
- * Constructs and executes a series of MapReduce jobs in order to write data
- * to the output targets.
- */
- PipelineResult run();
-
- /**
- * Constructs and starts a series of MapReduce jobs in order ot write data to
- * the output targets, but returns a {@code ListenableFuture} to allow clients to control
- * job execution.
- * @return
- */
- PipelineExecution runAsync();
-
- /**
- * Run any remaining jobs required to generate outputs and then clean up any
- * intermediate data files that were created in this run or previous calls to
- * {@code run}.
- */
- PipelineResult done();
-
- /**
- * A convenience method for reading a text file.
- */
- PCollection<String> readTextFile(String pathName);
-
- /**
- * A convenience method for writing a text file.
- */
- <T> void writeTextFile(PCollection<T> collection, String pathName);
-
- /**
- * Turn on debug logging for jobs that are run from this pipeline.
- */
- void enableDebug();
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/PipelineExecution.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/PipelineExecution.java b/crunch/src/main/java/org/apache/crunch/PipelineExecution.java
deleted file mode 100644
index fc6bb91..0000000
--- a/crunch/src/main/java/org/apache/crunch/PipelineExecution.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import java.util.concurrent.TimeUnit;
-
-/**
- * A handle to allow clients to control a Crunch pipeline as it runs.
- *
- * This interface is thread-safe.
- */
-public interface PipelineExecution {
-
- enum Status { READY, RUNNING, SUCCEEDED, FAILED, KILLED }
-
- /** Returns the .dot file that allows a client to graph the Crunch execution plan for this
- * pipeline.
- */
- String getPlanDotFile();
-
- /** Blocks until pipeline completes or the specified waiting time elapsed. */
- void waitFor(long timeout, TimeUnit timeUnit) throws InterruptedException;
-
- /** Blocks until pipeline completes, i.e. {@code SUCCEEDED}, {@code FAILED} or {@code KILLED}. */
- void waitUntilDone() throws InterruptedException;
-
- Status getStatus();
-
- /** Retrieve the result of a pipeline if it has been completed, otherwise {@code null}. */
- PipelineResult getResult();
-
- /**
- * Kills the pipeline if it is running, no-op otherwise.
- *
- * This method only delivers a kill signal to the pipeline, and does not guarantee the pipeline exits on return.
- * To wait for completely exits, use {@link #waitUntilDone()} after this call.
- */
- void kill() throws InterruptedException;
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/PipelineResult.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/PipelineResult.java b/crunch/src/main/java/org/apache/crunch/PipelineResult.java
deleted file mode 100644
index 90b1067..0000000
--- a/crunch/src/main/java/org/apache/crunch/PipelineResult.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import java.util.List;
-
-import org.apache.hadoop.mapreduce.Counter;
-import org.apache.hadoop.mapreduce.Counters;
-
-import com.google.common.collect.ImmutableList;
-
-/**
- * Container for the results of a call to {@code run} or {@code done} on the
- * Pipeline interface that includes details and statistics about the component
- * stages of the data pipeline.
- */
-public class PipelineResult {
-
- public static class StageResult {
-
- private final String stageName;
- private final Counters counters;
-
- public StageResult(String stageName, Counters counters) {
- this.stageName = stageName;
- this.counters = counters;
- }
-
- public String getStageName() {
- return stageName;
- }
-
- public Counters getCounters() {
- return counters;
- }
-
- public Counter findCounter(Enum<?> key) {
- return counters.findCounter(key);
- }
-
- public long getCounterValue(Enum<?> key) {
- return findCounter(key).getValue();
- }
- }
-
- public static final PipelineResult EMPTY = new PipelineResult(ImmutableList.<StageResult> of());
-
- private final List<StageResult> stageResults;
-
- public PipelineResult(List<StageResult> stageResults) {
- this.stageResults = ImmutableList.copyOf(stageResults);
- }
-
- public boolean succeeded() {
- return !stageResults.isEmpty();
- }
-
- public List<StageResult> getStageResults() {
- return stageResults;
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/Source.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/Source.java b/crunch/src/main/java/org/apache/crunch/Source.java
deleted file mode 100644
index f54d135..0000000
--- a/crunch/src/main/java/org/apache/crunch/Source.java
+++ /dev/null
@@ -1,52 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import java.io.IOException;
-
-import org.apache.crunch.types.PType;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.mapreduce.Job;
-
-/**
- * A {@code Source} represents an input data set that is an input to one or more
- * MapReduce jobs.
- *
- */
-public interface Source<T> {
- /**
- * Returns the {@code PType} for this source.
- */
- PType<T> getType();
-
- /**
- * Configure the given job to use this source as an input.
- *
- * @param job
- * The job to configure
- * @param inputId
- * For a multi-input job, an identifier for this input to the job
- * @throws IOException
- */
- void configureSource(Job job, int inputId) throws IOException;
-
- /**
- * Returns the number of bytes in this {@code Source}.
- */
- long getSize(Configuration configuration);
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/SourceTarget.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/SourceTarget.java b/crunch/src/main/java/org/apache/crunch/SourceTarget.java
deleted file mode 100644
index 09c03c6..0000000
--- a/crunch/src/main/java/org/apache/crunch/SourceTarget.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-/**
- * An interface for classes that implement both the {@code Source} and the
- * {@code Target} interfaces.
- *
- */
-public interface SourceTarget<T> extends Source<T>, Target {
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/TableSource.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/TableSource.java b/crunch/src/main/java/org/apache/crunch/TableSource.java
deleted file mode 100644
index ff27346..0000000
--- a/crunch/src/main/java/org/apache/crunch/TableSource.java
+++ /dev/null
@@ -1,28 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import org.apache.crunch.types.PTableType;
-
-/**
- * The interface {@code Source} implementations that return a {@link PTable}.
- *
- */
-public interface TableSource<K, V> extends Source<Pair<K, V>> {
- PTableType<K, V> getTableType();
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/TableSourceTarget.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/TableSourceTarget.java b/crunch/src/main/java/org/apache/crunch/TableSourceTarget.java
deleted file mode 100644
index 9b1ed34..0000000
--- a/crunch/src/main/java/org/apache/crunch/TableSourceTarget.java
+++ /dev/null
@@ -1,25 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-/**
- * An interface for classes that implement both the {@code TableSource} and the
- * {@code Target} interfaces.
- */
-public interface TableSourceTarget<K, V> extends TableSource<K, V>, SourceTarget<Pair<K, V>> {
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/Target.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/Target.java b/crunch/src/main/java/org/apache/crunch/Target.java
deleted file mode 100644
index 0a0c23d..0000000
--- a/crunch/src/main/java/org/apache/crunch/Target.java
+++ /dev/null
@@ -1,83 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import org.apache.crunch.io.OutputHandler;
-import org.apache.crunch.types.PType;
-import org.apache.hadoop.conf.Configuration;
-
-/**
- * A {@code Target} represents the output destination of a Crunch {@code PCollection}
- * in the context of a Crunch job.
- */
-public interface Target {
-
- /**
- * An enum to represent different options the client may specify
- * for handling the case where the output path, table, etc. referenced
- * by a {@code Target} already exists.
- */
- enum WriteMode {
- /**
- * Check to see if the output target already exists before running
- * the pipeline, and if it does, print an error and throw an exception.
- */
- DEFAULT,
-
- /**
- * Check to see if the output target already exists, and if it does,
- * delete it and overwrite it with the new output (if any).
- */
- OVERWRITE,
-
- /**
- * If the output target does not exist, create it. If it does exist,
- * add the output of this pipeline to the target. This was the
- * behavior in Crunch up to version 0.4.0.
- */
- APPEND
- }
-
- /**
- * Apply the given {@code WriteMode} to this {@code Target} instance.
- *
- * @param writeMode The strategy for handling existing outputs
- * @param conf The ever-useful {@code Configuration} instance
- */
- void handleExisting(WriteMode writeMode, Configuration conf);
-
- /**
- * Checks to see if this {@code Target} instance is compatible with the
- * given {@code PType}.
- *
- * @param handler The {@link OutputHandler} that is managing the output for the job
- * @param ptype The {@code PType} to check
- * @return True if this Target can write data in the form of the given {@code PType},
- * false otherwise
- */
- boolean accept(OutputHandler handler, PType<?> ptype);
-
- /**
- * Attempt to create the {@code SourceTarget} type that corresponds to this {@code Target}
- * for the given {@code PType}, if possible. If it is not possible, return {@code null}.
- *
- * @param ptype The {@code PType} to use in constructing the {@code SourceTarget}
- * @return A new {@code SourceTarget} or null if such a {@code SourceTarget} does not exist
- */
- <T> SourceTarget<T> asSourceTarget(PType<T> ptype);
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/Tuple.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/Tuple.java b/crunch/src/main/java/org/apache/crunch/Tuple.java
deleted file mode 100644
index 4e602ff..0000000
--- a/crunch/src/main/java/org/apache/crunch/Tuple.java
+++ /dev/null
@@ -1,36 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-/**
- * A fixed-size collection of Objects, used in Crunch for representing joins
- * between {@code PCollection}s.
- *
- */
-public interface Tuple {
-
- /**
- * Returns the Object at the given index.
- */
- Object get(int index);
-
- /**
- * Returns the number of elements in this Tuple.
- */
- int size();
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/Tuple3.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/Tuple3.java b/crunch/src/main/java/org/apache/crunch/Tuple3.java
deleted file mode 100644
index 4372811..0000000
--- a/crunch/src/main/java/org/apache/crunch/Tuple3.java
+++ /dev/null
@@ -1,96 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import org.apache.commons.lang.builder.HashCodeBuilder;
-
-/**
- * A convenience class for three-element {@link Tuple}s.
- */
-public class Tuple3<V1, V2, V3> implements Tuple {
-
- private final V1 first;
- private final V2 second;
- private final V3 third;
-
- public static <A, B, C> Tuple3<A, B, C> of(A a, B b, C c) {
- return new Tuple3<A, B, C>(a, b, c);
- }
-
- public Tuple3(V1 first, V2 second, V3 third) {
- this.first = first;
- this.second = second;
- this.third = third;
- }
-
- public V1 first() {
- return first;
- }
-
- public V2 second() {
- return second;
- }
-
- public V3 third() {
- return third;
- }
-
- public Object get(int index) {
- switch (index) {
- case 0:
- return first;
- case 1:
- return second;
- case 2:
- return third;
- default:
- throw new ArrayIndexOutOfBoundsException();
- }
- }
-
- public int size() {
- return 3;
- }
-
- @Override
- public int hashCode() {
- HashCodeBuilder hcb = new HashCodeBuilder();
- return hcb.append(first).append(second).append(third).toHashCode();
- }
-
- @Override
- public boolean equals(Object obj) {
- if (this == obj)
- return true;
- if (obj == null)
- return false;
- if (getClass() != obj.getClass())
- return false;
- Tuple3<?, ?, ?> other = (Tuple3<?, ?, ?>) obj;
- return (first == other.first || (first != null && first.equals(other.first)))
- && (second == other.second || (second != null && second.equals(other.second)))
- && (third == other.third || (third != null && third.equals(other.third)));
- }
-
- @Override
- public String toString() {
- StringBuilder sb = new StringBuilder("Tuple3[");
- sb.append(first).append(",").append(second).append(",").append(third);
- return sb.append("]").toString();
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/Tuple4.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/Tuple4.java b/crunch/src/main/java/org/apache/crunch/Tuple4.java
deleted file mode 100644
index f161371..0000000
--- a/crunch/src/main/java/org/apache/crunch/Tuple4.java
+++ /dev/null
@@ -1,105 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import org.apache.commons.lang.builder.HashCodeBuilder;
-
-/**
- * A convenience class for four-element {@link Tuple}s.
- */
-public class Tuple4<V1, V2, V3, V4> implements Tuple {
-
- private final V1 first;
- private final V2 second;
- private final V3 third;
- private final V4 fourth;
-
- public static <A, B, C, D> Tuple4<A, B, C, D> of(A a, B b, C c, D d) {
- return new Tuple4<A, B, C, D>(a, b, c, d);
- }
-
- public Tuple4(V1 first, V2 second, V3 third, V4 fourth) {
- this.first = first;
- this.second = second;
- this.third = third;
- this.fourth = fourth;
- }
-
- public V1 first() {
- return first;
- }
-
- public V2 second() {
- return second;
- }
-
- public V3 third() {
- return third;
- }
-
- public V4 fourth() {
- return fourth;
- }
-
- public Object get(int index) {
- switch (index) {
- case 0:
- return first;
- case 1:
- return second;
- case 2:
- return third;
- case 3:
- return fourth;
- default:
- throw new ArrayIndexOutOfBoundsException();
- }
- }
-
- public int size() {
- return 4;
- }
-
- @Override
- public int hashCode() {
- HashCodeBuilder hcb = new HashCodeBuilder();
- return hcb.append(first).append(second).append(third).append(fourth).toHashCode();
- }
-
- @Override
- public boolean equals(Object obj) {
- if (this == obj)
- return true;
- if (obj == null)
- return false;
- if (getClass() != obj.getClass())
- return false;
- Tuple4<?, ?, ?, ?> other = (Tuple4<?, ?, ?, ?>) obj;
- return (first == other.first || (first != null && first.equals(other.first)))
- && (second == other.second || (second != null && second.equals(other.second)))
- && (third == other.third || (third != null && third.equals(other.third)))
- && (fourth == other.fourth || (fourth != null && fourth.equals(other.fourth)));
- }
-
- @Override
- public String toString() {
- StringBuilder sb = new StringBuilder("Tuple4[");
- sb.append(first).append(",").append(second).append(",").append(third);
- return sb.append(",").append(fourth).append("]").toString();
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/TupleN.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/TupleN.java b/crunch/src/main/java/org/apache/crunch/TupleN.java
deleted file mode 100644
index e5eceb5..0000000
--- a/crunch/src/main/java/org/apache/crunch/TupleN.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch;
-
-import java.util.Arrays;
-
-import org.apache.commons.lang.builder.HashCodeBuilder;
-
-/**
- * A {@link Tuple} instance for an arbitrary number of values.
- */
-public class TupleN implements Tuple {
-
- private final Object values[];
-
- public static TupleN of(Object... values) {
- return new TupleN(values);
- }
-
- public TupleN(Object... values) {
- this.values = new Object[values.length];
- System.arraycopy(values, 0, this.values, 0, values.length);
- }
-
- public Object get(int index) {
- return values[index];
- }
-
- public int size() {
- return values.length;
- }
-
- @Override
- public int hashCode() {
- HashCodeBuilder hcb = new HashCodeBuilder();
- for (Object v : values) {
- hcb.append(v);
- }
- return hcb.toHashCode();
- }
-
- @Override
- public boolean equals(Object obj) {
- if (this == obj)
- return true;
- if (obj == null)
- return false;
- if (getClass() != obj.getClass())
- return false;
- TupleN other = (TupleN) obj;
- return Arrays.equals(this.values, other.values);
- }
-
- @Override
- public String toString() {
- return Arrays.toString(values);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/fn/Aggregators.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/fn/Aggregators.java b/crunch/src/main/java/org/apache/crunch/fn/Aggregators.java
deleted file mode 100644
index 0ac79e2..0000000
--- a/crunch/src/main/java/org/apache/crunch/fn/Aggregators.java
+++ /dev/null
@@ -1,1111 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.fn;
-
-import java.math.BigInteger;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Set;
-import java.util.SortedSet;
-
-import org.apache.crunch.Aggregator;
-import org.apache.crunch.CombineFn;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.PGroupedTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.Tuple;
-import org.apache.crunch.Tuple3;
-import org.apache.crunch.Tuple4;
-import org.apache.crunch.TupleN;
-import org.apache.crunch.util.Tuples;
-import org.apache.hadoop.conf.Configuration;
-
-import com.google.common.base.Joiner;
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Sets;
-
-
-/**
- * A collection of pre-defined {@link org.apache.crunch.Aggregator}s.
- *
- * <p>The factory methods of this class return {@link org.apache.crunch.Aggregator}
- * instances that you can use to combine the values of a {@link PGroupedTable}.
- * In most cases, they turn a multimap (multiple entries per key) into a map (one
- * entry per key).</p>
- *
- * <p><strong>Note</strong>: When using composed aggregators, like those built by the
- * {@link #pairAggregator(Aggregator, Aggregator) pairAggregator()}
- * factory method, you typically don't want to put in the same child aggregator more than once,
- * even if all child aggregators have the same type. In most cases, this is what you want:</p>
- *
- * <pre>
- * PTable<K, Long> result = groupedTable.combineValues(
- * pairAggregator(SUM_LONGS(), SUM_LONGS())
- * );
- * </pre>
- */
-public final class Aggregators {
-
- private Aggregators() {
- // utility class, not for instantiation
- }
-
- /**
- * Sum up all {@code long} values.
- * @return The newly constructed instance
- */
- public static Aggregator<Long> SUM_LONGS() {
- return new SumLongs();
- }
-
- /**
- * Sum up all {@code int} values.
- * @return The newly constructed instance
- */
- public static Aggregator<Integer> SUM_INTS() {
- return new SumInts();
- }
-
- /**
- * Sum up all {@code float} values.
- * @return The newly constructed instance
- */
- public static Aggregator<Float> SUM_FLOATS() {
- return new SumFloats();
- }
-
- /**
- * Sum up all {@code double} values.
- * @return The newly constructed instance
- */
- public static Aggregator<Double> SUM_DOUBLES() {
- return new SumDoubles();
- }
-
- /**
- * Sum up all {@link BigInteger} values.
- * @return The newly constructed instance
- */
- public static Aggregator<BigInteger> SUM_BIGINTS() {
- return new SumBigInts();
- }
-
- /**
- * Return the maximum of all given {@code long} values.
- * @return The newly constructed instance
- */
- public static Aggregator<Long> MAX_LONGS() {
- return new MaxLongs();
- }
-
- /**
- * Return the {@code n} largest {@code long} values (or fewer if there are fewer
- * values than {@code n}).
- * @param n The number of values to return
- * @return The newly constructed instance
- */
- public static Aggregator<Long> MAX_LONGS(int n) {
- return new MaxLongs();
- }
-
- /**
- * Return the maximum of all given {@code int} values.
- * @return The newly constructed instance
- */
- public static Aggregator<Integer> MAX_INTS() {
- return new MaxInts();
- }
-
- /**
- * Return the {@code n} largest {@code int} values (or fewer if there are fewer
- * values than {@code n}).
- * @param n The number of values to return
- * @return The newly constructed instance
- */
- public static Aggregator<Integer> MAX_INTS(int n) {
- return new MaxNAggregator<Integer>(n);
- }
-
- /**
- * Return the maximum of all given {@code float} values.
- * @return The newly constructed instance
- */
- public static Aggregator<Float> MAX_FLOATS() {
- return new MaxFloats();
- }
-
- /**
- * Return the {@code n} largest {@code float} values (or fewer if there are fewer
- * values than {@code n}).
- * @param n The number of values to return
- * @return The newly constructed instance
- */
- public static Aggregator<Float> MAX_FLOATS(int n) {
- return new MaxNAggregator<Float>(n);
- }
-
- /**
- * Return the maximum of all given {@code double} values.
- * @return The newly constructed instance
- */
- public static Aggregator<Double> MAX_DOUBLES() {
- return new MaxDoubles();
- }
-
- /**
- * Return the {@code n} largest {@code double} values (or fewer if there are fewer
- * values than {@code n}).
- * @param n The number of values to return
- * @return The newly constructed instance
- */
- public static Aggregator<Double> MAX_DOUBLES(int n) {
- return new MaxNAggregator<Double>(n);
- }
-
- /**
- * Return the maximum of all given {@link BigInteger} values.
- * @return The newly constructed instance
- */
- public static Aggregator<BigInteger> MAX_BIGINTS() {
- return new MaxBigInts();
- }
-
- /**
- * Return the {@code n} largest {@link BigInteger} values (or fewer if there are fewer
- * values than {@code n}).
- * @param n The number of values to return
- * @return The newly constructed instance
- */
- public static Aggregator<BigInteger> MAX_BIGINTS(int n) {
- return new MaxNAggregator<BigInteger>(n);
- }
-
- /**
- * Return the {@code n} largest values (or fewer if there are fewer
- * values than {@code n}).
- * @param n The number of values to return
- * @param cls The type of the values to aggregate (must implement {@link Comparable}!)
- * @return The newly constructed instance
- */
- public static <V extends Comparable<V>> Aggregator<V> MAX_N(int n, Class<V> cls) {
- return new MaxNAggregator<V>(n);
- }
-
- /**
- * Return the minimum of all given {@code long} values.
- * @return The newly constructed instance
- */
- public static Aggregator<Long> MIN_LONGS() {
- return new MinLongs();
- }
-
- /**
- * Return the {@code n} smallest {@code long} values (or fewer if there are fewer
- * values than {@code n}).
- * @param n The number of values to return
- * @return The newly constructed instance
- */
- public static Aggregator<Long> MIN_LONGS(int n) {
- return new MinNAggregator<Long>(n);
- }
-
- /**
- * Return the minimum of all given {@code int} values.
- * @return The newly constructed instance
- */
- public static Aggregator<Integer> MIN_INTS() {
- return new MinInts();
- }
-
- /**
- * Return the {@code n} smallest {@code int} values (or fewer if there are fewer
- * values than {@code n}).
- * @param n The number of values to return
- * @return The newly constructed instance
- */
- public static Aggregator<Integer> MIN_INTS(int n) {
- return new MinNAggregator<Integer>(n);
- }
-
- /**
- * Return the minimum of all given {@code float} values.
- * @return The newly constructed instance
- */
- public static Aggregator<Float> MIN_FLOATS() {
- return new MinFloats();
- }
-
- /**
- * Return the {@code n} smallest {@code float} values (or fewer if there are fewer
- * values than {@code n}).
- * @param n The number of values to return
- * @return The newly constructed instance
- */
- public static Aggregator<Float> MIN_FLOATS(int n) {
- return new MinNAggregator<Float>(n);
- }
-
- /**
- * Return the minimum of all given {@code double} values.
- * @return The newly constructed instance
- */
- public static Aggregator<Double> MIN_DOUBLES() {
- return new MinDoubles();
- }
-
- /**
- * Return the {@code n} smallest {@code double} values (or fewer if there are fewer
- * values than {@code n}).
- * @param n The number of values to return
- * @return The newly constructed instance
- */
- public static Aggregator<Double> MIN_DOUBLES(int n) {
- return new MinNAggregator<Double>(n);
- }
-
- /**
- * Return the minimum of all given {@link BigInteger} values.
- * @return The newly constructed instance
- */
- public static Aggregator<BigInteger> MIN_BIGINTS() {
- return new MinBigInts();
- }
-
- /**
- * Return the {@code n} smallest {@link BigInteger} values (or fewer if there are fewer
- * values than {@code n}).
- * @param n The number of values to return
- * @return The newly constructed instance
- */
- public static Aggregator<BigInteger> MIN_BIGINTS(int n) {
- return new MinNAggregator<BigInteger>(n);
- }
-
- /**
- * Return the {@code n} smallest values (or fewer if there are fewer
- * values than {@code n}).
- * @param n The number of values to return
- * @param cls The type of the values to aggregate (must implement {@link Comparable}!)
- * @return The newly constructed instance
- */
- public static <V extends Comparable<V>> Aggregator<V> MIN_N(int n, Class<V> cls) {
- return new MinNAggregator<V>(n);
- }
-
- /**
- * Return the first {@code n} values (or fewer if there are fewer values than {@code n}).
- *
- * @param n The number of values to return
- * @return The newly constructed instance
- */
- public static <V> Aggregator<V> FIRST_N(int n) {
- return new FirstNAggregator<V>(n);
- }
-
- /**
- * Return the last {@code n} values (or fewer if there are fewer values than {@code n}).
- *
- * @param n The number of values to return
- * @return The newly constructed instance
- */
- public static <V> Aggregator<V> LAST_N(int n) {
- return new LastNAggregator<V>(n);
- }
-
- /**
- * Concatenate strings, with a separator between strings. There
- * is no limits of length for the concatenated string.
- *
- * <p><em>Note: String concatenation is not commutative, which means the
- * result of the aggregation is not deterministic!</em></p>
- *
- * @param separator
- * the separator which will be appended between each string
- * @param skipNull
- * define if we should skip null values. Throw
- * NullPointerException if set to false and there is a null
- * value.
- * @return The newly constructed instance
- */
- public static Aggregator<String> STRING_CONCAT(String separator, boolean skipNull) {
- return new StringConcatAggregator(separator, skipNull);
- }
-
- /**
- * Concatenate strings, with a separator between strings. You can specify
- * the maximum length of the output string and of the input strings, if
- * they are > 0. If a value is <= 0, there is no limit.
- *
- * <p>Any too large string (or any string which would made the output too
- * large) will be silently discarded.</p>
- *
- * <p><em>Note: String concatenation is not commutative, which means the
- * result of the aggregation is not deterministic!</em></p>
- *
- * @param separator
- * the separator which will be appended between each string
- * @param skipNull
- * define if we should skip null values. Throw
- * NullPointerException if set to false and there is a null
- * value.
- * @param maxOutputLength
- * the maximum length of the output string. If it's set <= 0,
- * there is no limit. The number of characters of the output
- * string will be < maxOutputLength.
- * @param maxInputLength
- * the maximum length of the input strings. If it's set <= 0,
- * there is no limit. The number of characters of the input string
- * will be < maxInputLength to be concatenated.
- * @return The newly constructed instance
- */
- public static Aggregator<String> STRING_CONCAT(String separator, boolean skipNull,
- long maxOutputLength, long maxInputLength) {
- return new StringConcatAggregator(separator, skipNull, maxOutputLength, maxInputLength);
- }
-
- /**
- * Collect the unique elements of the input, as defined by the {@code equals} method for
- * the input objects. No guarantees are made about the order in which the final elements
- * will be returned.
- *
- * @return The newly constructed instance
- */
- public static <V> Aggregator<V> UNIQUE_ELEMENTS() {
- return new SetAggregator<V>();
- }
-
- /**
- * Collect a sample of unique elements from the input, where 'unique' is defined by
- * the {@code equals} method for the input objects. No guarantees are made about which
- * elements will be returned, simply that there will not be any more than the given sample
- * size for any key.
- *
- * @param maximumSampleSize The maximum number of unique elements to return per key
- * @return The newly constructed instance
- */
- public static <V> Aggregator<V> SAMPLE_UNIQUE_ELEMENTS(int maximumSampleSize) {
- return new SetAggregator<V>(maximumSampleSize);
- }
-
- /**
- * Apply separate aggregators to each component of a {@link Pair}.
- */
- public static <V1, V2> Aggregator<Pair<V1, V2>> pairAggregator(
- Aggregator<V1> a1, Aggregator<V2> a2) {
- return new PairAggregator<V1, V2>(a1, a2);
- }
-
- /**
- * Apply separate aggregators to each component of a {@link Tuple3}.
- */
- public static <V1, V2, V3> Aggregator<Tuple3<V1, V2, V3>> tripAggregator(
- Aggregator<V1> a1, Aggregator<V2> a2, Aggregator<V3> a3) {
- return new TripAggregator<V1, V2, V3>(a1, a2, a3);
- }
-
- /**
- * Apply separate aggregators to each component of a {@link Tuple4}.
- */
- public static <V1, V2, V3, V4> Aggregator<Tuple4<V1, V2, V3, V4>> quadAggregator(
- Aggregator<V1> a1, Aggregator<V2> a2, Aggregator<V3> a3, Aggregator<V4> a4) {
- return new QuadAggregator<V1, V2, V3, V4>(a1, a2, a3, a4);
- }
-
- /**
- * Apply separate aggregators to each component of a {@link Tuple}.
- */
- public static Aggregator<TupleN> tupleAggregator(Aggregator<?>... aggregators) {
- return new TupleNAggregator(aggregators);
- }
-
- /**
- * Wrap a {@link CombineFn} adapter around the given aggregator.
- *
- * @param aggregator The instance to wrap
- * @return A {@link CombineFn} delegating to {@code aggregator}
- */
- public static final <K, V> CombineFn<K, V> toCombineFn(Aggregator<V> aggregator) {
- return new AggregatorCombineFn<K, V>(aggregator);
- }
-
- /**
- * Base class for aggregators that do not require any initialization.
- */
- public static abstract class SimpleAggregator<T> implements Aggregator<T> {
- @Override
- public void initialize(Configuration conf) {
- // No-op
- }
- }
-
- /**
- * A {@code CombineFn} that delegates all of the actual work to an
- * {@code Aggregator} instance.
- */
- private static class AggregatorCombineFn<K, V> extends CombineFn<K, V> {
- // TODO: Has to be fully qualified until CombineFn.Aggregator can be removed.
- private final org.apache.crunch.Aggregator<V> aggregator;
-
- public AggregatorCombineFn(org.apache.crunch.Aggregator<V> aggregator) {
- this.aggregator = aggregator;
- }
-
- @Override
- public void initialize() {
- aggregator.initialize(getConfiguration());
- }
-
- @Override
- public void process(Pair<K, Iterable<V>> input, Emitter<Pair<K, V>> emitter) {
- aggregator.reset();
- for (V v : input.second()) {
- aggregator.update(v);
- }
- for (V v : aggregator.results()) {
- emitter.emit(Pair.of(input.first(), v));
- }
- }
- }
-
- private static class SumLongs extends SimpleAggregator<Long> {
- private long sum = 0;
-
- @Override
- public void reset() {
- sum = 0;
- }
-
- @Override
- public void update(Long next) {
- sum += next;
- }
-
- @Override
- public Iterable<Long> results() {
- return ImmutableList.of(sum);
- }
- }
-
- private static class SumInts extends SimpleAggregator<Integer> {
- private int sum = 0;
-
- @Override
- public void reset() {
- sum = 0;
- }
-
- @Override
- public void update(Integer next) {
- sum += next;
- }
-
- @Override
- public Iterable<Integer> results() {
- return ImmutableList.of(sum);
- }
- }
-
- private static class SumFloats extends SimpleAggregator<Float> {
- private float sum = 0;
-
- @Override
- public void reset() {
- sum = 0f;
- }
-
- @Override
- public void update(Float next) {
- sum += next;
- }
-
- @Override
- public Iterable<Float> results() {
- return ImmutableList.of(sum);
- }
- }
-
- private static class SumDoubles extends SimpleAggregator<Double> {
- private double sum = 0;
-
- @Override
- public void reset() {
- sum = 0f;
- }
-
- @Override
- public void update(Double next) {
- sum += next;
- }
-
- @Override
- public Iterable<Double> results() {
- return ImmutableList.of(sum);
- }
- }
-
- private static class SumBigInts extends SimpleAggregator<BigInteger> {
- private BigInteger sum = BigInteger.ZERO;
-
- @Override
- public void reset() {
- sum = BigInteger.ZERO;
- }
-
- @Override
- public void update(BigInteger next) {
- sum = sum.add(next);
- }
-
- @Override
- public Iterable<BigInteger> results() {
- return ImmutableList.of(sum);
- }
- }
-
- private static class MaxLongs extends SimpleAggregator<Long> {
- private Long max = null;
-
- @Override
- public void reset() {
- max = null;
- }
-
- @Override
- public void update(Long next) {
- if (max == null || max < next) {
- max = next;
- }
- }
-
- @Override
- public Iterable<Long> results() {
- return ImmutableList.of(max);
- }
- }
-
- private static class MaxInts extends SimpleAggregator<Integer> {
- private Integer max = null;
-
- @Override
- public void reset() {
- max = null;
- }
-
- @Override
- public void update(Integer next) {
- if (max == null || max < next) {
- max = next;
- }
- }
-
- @Override
- public Iterable<Integer> results() {
- return ImmutableList.of(max);
- }
- }
-
- private static class MaxFloats extends SimpleAggregator<Float> {
- private Float max = null;
-
- @Override
- public void reset() {
- max = null;
- }
-
- @Override
- public void update(Float next) {
- if (max == null || max < next) {
- max = next;
- }
- }
-
- @Override
- public Iterable<Float> results() {
- return ImmutableList.of(max);
- }
- }
-
- private static class MaxDoubles extends SimpleAggregator<Double> {
- private Double max = null;
-
- @Override
- public void reset() {
- max = null;
- }
-
- @Override
- public void update(Double next) {
- if (max == null || max < next) {
- max = next;
- }
- }
-
- @Override
- public Iterable<Double> results() {
- return ImmutableList.of(max);
- }
- }
-
- private static class MaxBigInts extends SimpleAggregator<BigInteger> {
- private BigInteger max = null;
-
- @Override
- public void reset() {
- max = null;
- }
-
- @Override
- public void update(BigInteger next) {
- if (max == null || max.compareTo(next) < 0) {
- max = next;
- }
- }
-
- @Override
- public Iterable<BigInteger> results() {
- return ImmutableList.of(max);
- }
- }
-
- private static class MinLongs extends SimpleAggregator<Long> {
- private Long min = null;
-
- @Override
- public void reset() {
- min = null;
- }
-
- @Override
- public void update(Long next) {
- if (min == null || min > next) {
- min = next;
- }
- }
-
- @Override
- public Iterable<Long> results() {
- return ImmutableList.of(min);
- }
- }
-
- private static class MinInts extends SimpleAggregator<Integer> {
- private Integer min = null;
-
- @Override
- public void reset() {
- min = null;
- }
-
- @Override
- public void update(Integer next) {
- if (min == null || min > next) {
- min = next;
- }
- }
-
- @Override
- public Iterable<Integer> results() {
- return ImmutableList.of(min);
- }
- }
-
- private static class MinFloats extends SimpleAggregator<Float> {
- private Float min = null;
-
- @Override
- public void reset() {
- min = null;
- }
-
- @Override
- public void update(Float next) {
- if (min == null || min > next) {
- min = next;
- }
- }
-
- @Override
- public Iterable<Float> results() {
- return ImmutableList.of(min);
- }
- }
-
- private static class MinDoubles extends SimpleAggregator<Double> {
- private Double min = null;
-
- @Override
- public void reset() {
- min = null;
- }
-
- @Override
- public void update(Double next) {
- if (min == null || min > next) {
- min = next;
- }
- }
-
- @Override
- public Iterable<Double> results() {
- return ImmutableList.of(min);
- }
- }
-
- private static class MinBigInts extends SimpleAggregator<BigInteger> {
- private BigInteger min = null;
-
- @Override
- public void reset() {
- min = null;
- }
-
- @Override
- public void update(BigInteger next) {
- if (min == null || min.compareTo(next) > 0) {
- min = next;
- }
- }
-
- @Override
- public Iterable<BigInteger> results() {
- return ImmutableList.of(min);
- }
- }
-
- private static class MaxNAggregator<V extends Comparable<V>> extends SimpleAggregator<V> {
- private final int arity;
- private transient SortedSet<V> elements;
-
- public MaxNAggregator(int arity) {
- this.arity = arity;
- }
-
- @Override
- public void reset() {
- if (elements == null) {
- elements = Sets.newTreeSet();
- } else {
- elements.clear();
- }
- }
-
- @Override
- public void update(V value) {
- if (elements.size() < arity) {
- elements.add(value);
- } else if (value.compareTo(elements.first()) > 0) {
- elements.remove(elements.first());
- elements.add(value);
- }
- }
-
- @Override
- public Iterable<V> results() {
- return ImmutableList.copyOf(elements);
- }
- }
-
- private static class MinNAggregator<V extends Comparable<V>> extends SimpleAggregator<V> {
- private final int arity;
- private transient SortedSet<V> elements;
-
- public MinNAggregator(int arity) {
- this.arity = arity;
- }
-
- @Override
- public void reset() {
- if (elements == null) {
- elements = Sets.newTreeSet();
- } else {
- elements.clear();
- }
- }
-
- @Override
- public void update(V value) {
- if (elements.size() < arity) {
- elements.add(value);
- } else if (value.compareTo(elements.last()) < 0) {
- elements.remove(elements.last());
- elements.add(value);
- }
- }
-
- @Override
- public Iterable<V> results() {
- return ImmutableList.copyOf(elements);
- }
- }
-
- private static class FirstNAggregator<V> extends SimpleAggregator<V> {
- private final int arity;
- private final List<V> elements;
-
- public FirstNAggregator(int arity) {
- this.arity = arity;
- this.elements = Lists.newArrayList();
- }
-
- @Override
- public void reset() {
- elements.clear();
- }
-
- @Override
- public void update(V value) {
- if (elements.size() < arity) {
- elements.add(value);
- }
- }
-
- @Override
- public Iterable<V> results() {
- return ImmutableList.copyOf(elements);
- }
- }
-
- private static class LastNAggregator<V> extends SimpleAggregator<V> {
- private final int arity;
- private final LinkedList<V> elements;
-
- public LastNAggregator(int arity) {
- this.arity = arity;
- this.elements = Lists.newLinkedList();
- }
-
- @Override
- public void reset() {
- elements.clear();
- }
-
- @Override
- public void update(V value) {
- elements.add(value);
- if (elements.size() == arity + 1) {
- elements.removeFirst();
- }
- }
-
- @Override
- public Iterable<V> results() {
- return ImmutableList.copyOf(elements);
- }
- }
-
- private static class StringConcatAggregator extends SimpleAggregator<String> {
- private final String separator;
- private final boolean skipNulls;
- private final long maxOutputLength;
- private final long maxInputLength;
- private long currentLength;
- private final LinkedList<String> list = new LinkedList<String>();
-
- private transient Joiner joiner;
-
- public StringConcatAggregator(final String separator, final boolean skipNulls) {
- this.separator = separator;
- this.skipNulls = skipNulls;
- this.maxInputLength = 0;
- this.maxOutputLength = 0;
- }
-
- public StringConcatAggregator(final String separator, final boolean skipNull, final long maxOutputLength, final long maxInputLength) {
- this.separator = separator;
- this.skipNulls = skipNull;
- this.maxOutputLength = maxOutputLength;
- this.maxInputLength = maxInputLength;
- this.currentLength = -separator.length();
- }
-
- @Override
- public void reset() {
- if (joiner == null) {
- joiner = skipNulls ? Joiner.on(separator).skipNulls() : Joiner.on(separator);
- }
- currentLength = -separator.length();
- list.clear();
- }
-
- @Override
- public void update(final String next) {
- long length = (next == null) ? 0 : next.length() + separator.length();
- if (maxOutputLength > 0 && currentLength + length > maxOutputLength || maxInputLength > 0 && next.length() > maxInputLength) {
- return;
- }
- if (maxOutputLength > 0) {
- currentLength += length;
- }
- list.add(next);
- }
-
- @Override
- public Iterable<String> results() {
- return ImmutableList.of(joiner.join(list));
- }
- }
-
-
- private static abstract class TupleAggregator<T> implements Aggregator<T> {
- private final List<Aggregator<Object>> aggregators;
-
- @SuppressWarnings("unchecked")
- public TupleAggregator(Aggregator<?>... aggregators) {
- this.aggregators = Lists.newArrayList();
- for (Aggregator<?> a : aggregators) {
- this.aggregators.add((Aggregator<Object>) a);
- }
- }
-
- @Override
- public void initialize(Configuration configuration) {
- for (Aggregator<?> a : aggregators) {
- a.initialize(configuration);
- }
- }
-
- @Override
- public void reset() {
- for (Aggregator<?> a : aggregators) {
- a.reset();
- }
- }
-
- protected void updateTuple(Tuple t) {
- for (int i = 0; i < aggregators.size(); i++) {
- aggregators.get(i).update(t.get(i));
- }
- }
-
- protected Iterable<Object> results(int index) {
- return aggregators.get(index).results();
- }
- }
-
- private static class PairAggregator<V1, V2> extends TupleAggregator<Pair<V1, V2>> {
-
- public PairAggregator(Aggregator<V1> a1, Aggregator<V2> a2) {
- super(a1, a2);
- }
-
- @Override
- public void update(Pair<V1, V2> value) {
- updateTuple(value);
- }
-
- @SuppressWarnings("unchecked")
- @Override
- public Iterable<Pair<V1, V2>> results() {
- return new Tuples.PairIterable<V1, V2>((Iterable<V1>) results(0), (Iterable<V2>) results(1));
- }
- }
-
- private static class TripAggregator<A, B, C> extends TupleAggregator<Tuple3<A, B, C>> {
-
- public TripAggregator(Aggregator<A> a1, Aggregator<B> a2, Aggregator<C> a3) {
- super(a1, a2, a3);
- }
-
- @Override
- public void update(Tuple3<A, B, C> value) {
- updateTuple(value);
- }
-
- @SuppressWarnings("unchecked")
- @Override
- public Iterable<Tuple3<A, B, C>> results() {
- return new Tuples.TripIterable<A, B, C>((Iterable<A>) results(0), (Iterable<B>) results(1),
- (Iterable<C>) results(2));
- }
- }
-
- private static class QuadAggregator<A, B, C, D> extends TupleAggregator<Tuple4<A, B, C, D>> {
-
- public QuadAggregator(Aggregator<A> a1, Aggregator<B> a2, Aggregator<C> a3, Aggregator<D> a4) {
- super(a1, a2, a3, a4);
- }
-
- @Override
- public void update(Tuple4<A, B, C, D> value) {
- updateTuple(value);
- }
-
- @SuppressWarnings("unchecked")
- @Override
- public Iterable<Tuple4<A, B, C, D>> results() {
- return new Tuples.QuadIterable<A, B, C, D>((Iterable<A>) results(0), (Iterable<B>) results(1),
- (Iterable<C>) results(2), (Iterable<D>) results(3));
- }
- }
-
- private static class TupleNAggregator extends TupleAggregator<TupleN> {
- private final int size;
-
- public TupleNAggregator(Aggregator<?>... aggregators) {
- super(aggregators);
- size = aggregators.length;
- }
-
- @Override
- public void update(TupleN value) {
- updateTuple(value);
- }
-
- @Override
- public Iterable<TupleN> results() {
- Iterable<?>[] iterables = new Iterable[size];
- for (int i = 0; i < size; i++) {
- iterables[i] = results(i);
- }
- return new Tuples.TupleNIterable(iterables);
- }
- }
-
- private static class SetAggregator<V> extends SimpleAggregator<V> {
- private final Set<V> elements;
- private final int sizeLimit;
-
- public SetAggregator() {
- this(-1);
- }
-
- public SetAggregator(int sizeLimit) {
- this.elements = Sets.newHashSet();
- this.sizeLimit = sizeLimit;
- }
-
- @Override
- public void reset() {
- elements.clear();
- }
-
- @Override
- public void update(V value) {
- if (sizeLimit == -1 || elements.size() < sizeLimit) {
- elements.add(value);
- }
- }
-
- @Override
- public Iterable<V> results() {
- return ImmutableList.copyOf(elements);
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/fn/CompositeMapFn.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/fn/CompositeMapFn.java b/crunch/src/main/java/org/apache/crunch/fn/CompositeMapFn.java
deleted file mode 100644
index 2a8e7d9..0000000
--- a/crunch/src/main/java/org/apache/crunch/fn/CompositeMapFn.java
+++ /dev/null
@@ -1,71 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.fn;
-
-import org.apache.crunch.Emitter;
-import org.apache.crunch.MapFn;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.mapreduce.TaskInputOutputContext;
-
-public class CompositeMapFn<R, S, T> extends MapFn<R, T> {
-
- private final MapFn<R, S> first;
- private final MapFn<S, T> second;
-
- public CompositeMapFn(MapFn<R, S> first, MapFn<S, T> second) {
- this.first = first;
- this.second = second;
- }
-
- @Override
- public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
- first.setContext(context);
- second.setContext(context);
- }
-
- @Override
- public void initialize() {
- first.initialize();
- second.initialize();
- }
-
- public MapFn<R, S> getFirst() {
- return first;
- }
-
- public MapFn<S, T> getSecond() {
- return second;
- }
-
- @Override
- public T map(R input) {
- return second.map(first.map(input));
- }
-
- @Override
- public void cleanup(Emitter<T> emitter) {
- first.cleanup(null);
- second.cleanup(null);
- }
-
- @Override
- public void configure(Configuration conf) {
- first.configure(conf);
- second.configure(conf);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/fn/ExtractKeyFn.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/fn/ExtractKeyFn.java b/crunch/src/main/java/org/apache/crunch/fn/ExtractKeyFn.java
deleted file mode 100644
index b8cc9df..0000000
--- a/crunch/src/main/java/org/apache/crunch/fn/ExtractKeyFn.java
+++ /dev/null
@@ -1,50 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.fn;
-
-import org.apache.crunch.MapFn;
-import org.apache.crunch.Pair;
-import org.apache.hadoop.mapreduce.TaskInputOutputContext;
-
-/**
- * Wrapper function for converting a {@code MapFn} into a key-value pair that is
- * used to convert from a {@code PCollection<V>} to a {@code PTable<K, V>}.
- */
-public class ExtractKeyFn<K, V> extends MapFn<V, Pair<K, V>> {
-
- private final MapFn<V, K> mapFn;
-
- public ExtractKeyFn(MapFn<V, K> mapFn) {
- this.mapFn = mapFn;
- }
-
- @Override
- public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
- mapFn.setContext(context);
- }
-
- @Override
- public void initialize() {
- mapFn.initialize();
- }
-
- @Override
- public Pair<K, V> map(V input) {
- return Pair.of(mapFn.map(input), input);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/fn/FilterFns.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/fn/FilterFns.java b/crunch/src/main/java/org/apache/crunch/fn/FilterFns.java
deleted file mode 100644
index 8dc4268..0000000
--- a/crunch/src/main/java/org/apache/crunch/fn/FilterFns.java
+++ /dev/null
@@ -1,112 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.fn;
-
-import org.apache.crunch.FilterFn;
-import org.apache.crunch.FilterFn.AndFn;
-import org.apache.crunch.FilterFn.NotFn;
-import org.apache.crunch.FilterFn.OrFn;
-
-
-/**
- * A collection of pre-defined {@link FilterFn} implementations.
- */
-public final class FilterFns {
- // Note: We delegate to the deprecated implementation classes in FilterFn. When their
- // time is up, we just move them here.
-
- private FilterFns() {
- // utility class, not for instantiation
- }
-
- /**
- * Accept an entry if all of the given filters accept it, using short-circuit evaluation.
- * @param fn1 The first functions to delegate to
- * @param fn2 The second functions to delegate to
- * @return The composed filter function
- */
- public static <S> FilterFn<S> and(FilterFn<S> fn1, FilterFn<S> fn2) {
- return new AndFn<S>(fn1, fn2);
- }
-
- /**
- * Accept an entry if all of the given filters accept it, using short-circuit evaluation.
- * @param fns The functions to delegate to (in the given order)
- * @return The composed filter function
- */
- public static <S> FilterFn<S> and(FilterFn<S>... fns) {
- return new AndFn<S>(fns);
- }
-
- /**
- * Accept an entry if at least one of the given filters accept it, using short-circuit evaluation.
- * @param fn1 The first functions to delegate to
- * @param fn2 The second functions to delegate to
- * @return The composed filter function
- */
- public static <S> FilterFn<S> or(FilterFn<S> fn1, FilterFn<S> fn2) {
- return new OrFn<S>(fn1, fn2);
- }
-
- /**
- * Accept an entry if at least one of the given filters accept it, using short-circuit evaluation.
- * @param fns The functions to delegate to (in the given order)
- * @return The composed filter function
- */
- public static <S> FilterFn<S> or(FilterFn<S>... fns) {
- return new OrFn<S>(fns);
- }
-
- /**
- * Accept an entry if the given filter <em>does not</em> accept it.
- * @param fn The function to delegate to
- * @return The composed filter function
- */
- public static <S> FilterFn<S> not(FilterFn<S> fn) {
- return new NotFn<S>(fn);
- }
-
- /**
- * Accept everything.
- * @return A filter function that accepts everything.
- */
- public static <S> FilterFn<S> ACCEPT_ALL() {
- return new AcceptAllFn<S>();
- }
-
- /**
- * Reject everything.
- * @return A filter function that rejects everything.
- */
- public static <S> FilterFn<S> REJECT_ALL() {
- return not(new AcceptAllFn<S>());
- }
-
- private static class AcceptAllFn<S> extends FilterFn<S> {
- @Override
- public boolean accept(S input) {
- return true;
- }
-
- @Override
- public float scaleFactor() {
- return 1.0f;
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/fn/IdentityFn.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/fn/IdentityFn.java b/crunch/src/main/java/org/apache/crunch/fn/IdentityFn.java
deleted file mode 100644
index 0eadb06..0000000
--- a/crunch/src/main/java/org/apache/crunch/fn/IdentityFn.java
+++ /dev/null
@@ -1,39 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.fn;
-
-import org.apache.crunch.MapFn;
-
-public class IdentityFn<T> extends MapFn<T, T> {
-
- private static final IdentityFn<Object> INSTANCE = new IdentityFn<Object>();
-
- @SuppressWarnings("unchecked")
- public static <T> IdentityFn<T> getInstance() {
- return (IdentityFn<T>) INSTANCE;
- }
-
- // Non-instantiable
- private IdentityFn() {
- }
-
- @Override
- public T map(T input) {
- return input;
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/fn/MapKeysFn.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/fn/MapKeysFn.java b/crunch/src/main/java/org/apache/crunch/fn/MapKeysFn.java
deleted file mode 100644
index cbaf24d..0000000
--- a/crunch/src/main/java/org/apache/crunch/fn/MapKeysFn.java
+++ /dev/null
@@ -1,32 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.fn;
-
-import org.apache.crunch.DoFn;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.Pair;
-
-public abstract class MapKeysFn<K1, K2, V> extends DoFn<Pair<K1, V>, Pair<K2, V>> {
-
- @Override
- public void process(Pair<K1, V> input, Emitter<Pair<K2, V>> emitter) {
- emitter.emit(Pair.of(map(input.first()), input.second()));
- }
-
- public abstract K2 map(K1 k1);
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/fn/MapValuesFn.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/fn/MapValuesFn.java b/crunch/src/main/java/org/apache/crunch/fn/MapValuesFn.java
deleted file mode 100644
index b90f5ff..0000000
--- a/crunch/src/main/java/org/apache/crunch/fn/MapValuesFn.java
+++ /dev/null
@@ -1,32 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.fn;
-
-import org.apache.crunch.DoFn;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.Pair;
-
-public abstract class MapValuesFn<K, V1, V2> extends DoFn<Pair<K, V1>, Pair<K, V2>> {
-
- @Override
- public void process(Pair<K, V1> input, Emitter<Pair<K, V2>> emitter) {
- emitter.emit(Pair.of(input.first(), map(input.second())));
- }
-
- public abstract V2 map(V1 v);
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/fn/PairMapFn.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/fn/PairMapFn.java b/crunch/src/main/java/org/apache/crunch/fn/PairMapFn.java
deleted file mode 100644
index 9ee4336..0000000
--- a/crunch/src/main/java/org/apache/crunch/fn/PairMapFn.java
+++ /dev/null
@@ -1,65 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.fn;
-
-import org.apache.crunch.Emitter;
-import org.apache.crunch.MapFn;
-import org.apache.crunch.Pair;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.mapreduce.TaskInputOutputContext;
-
-public class PairMapFn<K, V, S, T> extends MapFn<Pair<K, V>, Pair<S, T>> {
-
- private MapFn<K, S> keys;
- private MapFn<V, T> values;
-
- public PairMapFn(MapFn<K, S> keys, MapFn<V, T> values) {
- this.keys = keys;
- this.values = values;
- }
-
- @Override
- public void configure(Configuration conf) {
- keys.configure(conf);
- values.configure(conf);
- }
-
- @Override
- public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
- keys.setContext(context);
- values.setContext(context);
- }
-
- @Override
- public void initialize() {
- keys.initialize();
- values.initialize();
- }
-
- @Override
- public Pair<S, T> map(Pair<K, V> input) {
- return Pair.of(keys.map(input.first()), values.map(input.second()));
- }
-
- @Override
- public void cleanup(Emitter<Pair<S, T>> emitter) {
- keys.cleanup(null);
- values.cleanup(null);
- }
-
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/fn/package-info.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/fn/package-info.java b/crunch/src/main/java/org/apache/crunch/fn/package-info.java
deleted file mode 100644
index acefdff..0000000
--- a/crunch/src/main/java/org/apache/crunch/fn/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Commonly used functions for manipulating collections.
- */
-package org.apache.crunch.fn;
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/hadoop/mapreduce/TaskAttemptContextFactory.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/hadoop/mapreduce/TaskAttemptContextFactory.java b/crunch/src/main/java/org/apache/crunch/hadoop/mapreduce/TaskAttemptContextFactory.java
deleted file mode 100644
index 887c051..0000000
--- a/crunch/src/main/java/org/apache/crunch/hadoop/mapreduce/TaskAttemptContextFactory.java
+++ /dev/null
@@ -1,70 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.hadoop.mapreduce;
-
-import java.lang.reflect.Constructor;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.TaskAttemptID;
-
-/**
- * A factory class that allows us to hide the fact that {@code TaskAttemptContext} is a class in
- * Hadoop 1.x.x and an interface in Hadoop 2.x.x.
- */
-@SuppressWarnings("unchecked")
-public class TaskAttemptContextFactory {
-
- private static final Log LOG = LogFactory.getLog(TaskAttemptContextFactory.class);
-
- private static final TaskAttemptContextFactory INSTANCE = new TaskAttemptContextFactory();
-
- public static TaskAttemptContext create(Configuration conf, TaskAttemptID taskAttemptId) {
- return INSTANCE.createInternal(conf, taskAttemptId);
- }
-
- private Constructor<TaskAttemptContext> taskAttemptConstructor;
-
- private TaskAttemptContextFactory() {
- Class<TaskAttemptContext> implClass = TaskAttemptContext.class;
- if (implClass.isInterface()) {
- try {
- implClass = (Class<TaskAttemptContext>) Class.forName(
- "org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl");
- } catch (ClassNotFoundException e) {
- LOG.fatal("Could not find TaskAttemptContextImpl class, exiting", e);
- }
- }
- try {
- this.taskAttemptConstructor = implClass.getConstructor(Configuration.class, TaskAttemptID.class);
- } catch (Exception e) {
- LOG.fatal("Could not access TaskAttemptContext constructor, exiting", e);
- }
- }
-
- private TaskAttemptContext createInternal(Configuration conf, TaskAttemptID taskAttemptId) {
- try {
- return (TaskAttemptContext) taskAttemptConstructor.newInstance(conf, taskAttemptId);
- } catch (Exception e) {
- LOG.error("Could not construct a TaskAttemptContext instance", e);
- return null;
- }
- }
-}
[30/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/impl/SourceTargetImpl.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/impl/SourceTargetImpl.java b/crunch-core/src/main/java/org/apache/crunch/io/impl/SourceTargetImpl.java
new file mode 100644
index 0000000..4d2b88a
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/impl/SourceTargetImpl.java
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.impl;
+
+import java.io.IOException;
+
+import org.apache.commons.lang.builder.HashCodeBuilder;
+import org.apache.crunch.Source;
+import org.apache.crunch.SourceTarget;
+import org.apache.crunch.Target;
+import org.apache.crunch.io.OutputHandler;
+import org.apache.crunch.types.PType;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.Job;
+
+class SourceTargetImpl<T> implements SourceTarget<T> {
+
+ protected final Source<T> source;
+ protected final Target target;
+
+ public SourceTargetImpl(Source<T> source, Target target) {
+ this.source = source;
+ this.target = target;
+ }
+
+ @Override
+ public PType<T> getType() {
+ return source.getType();
+ }
+
+ @Override
+ public void configureSource(Job job, int inputId) throws IOException {
+ source.configureSource(job, inputId);
+ }
+
+ @Override
+ public long getSize(Configuration configuration) {
+ return source.getSize(configuration);
+ }
+
+ @Override
+ public boolean accept(OutputHandler handler, PType<?> ptype) {
+ return target.accept(handler, ptype);
+ }
+
+ @Override
+ public <S> SourceTarget<S> asSourceTarget(PType<S> ptype) {
+ return target.asSourceTarget(ptype);
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (other == null || !(other.getClass().equals(getClass()))) {
+ return false;
+ }
+ SourceTargetImpl sti = (SourceTargetImpl) other;
+ return source.equals(sti.source) && target.equals(sti.target);
+ }
+
+ @Override
+ public int hashCode() {
+ return new HashCodeBuilder().append(source).append(target).toHashCode();
+ }
+
+ @Override
+ public String toString() {
+ return source.toString();
+ }
+
+ @Override
+ public void handleExisting(WriteMode strategy, Configuration conf) {
+ target.handleExisting(strategy, conf);
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/impl/TableSourcePathTargetImpl.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/impl/TableSourcePathTargetImpl.java b/crunch-core/src/main/java/org/apache/crunch/io/impl/TableSourcePathTargetImpl.java
new file mode 100644
index 0000000..a8ff639
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/impl/TableSourcePathTargetImpl.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.impl;
+
+import org.apache.crunch.Pair;
+import org.apache.crunch.TableSource;
+import org.apache.crunch.io.FileNamingScheme;
+import org.apache.crunch.io.PathTarget;
+import org.apache.crunch.io.SequentialFileNamingScheme;
+import org.apache.crunch.types.PTableType;
+
+public class TableSourcePathTargetImpl<K, V> extends SourcePathTargetImpl<Pair<K, V>> implements TableSource<K, V> {
+
+ public TableSourcePathTargetImpl(TableSource<K, V> source, PathTarget target) {
+ this(source, target, new SequentialFileNamingScheme());
+ }
+
+ public TableSourcePathTargetImpl(TableSource<K, V> source, PathTarget target, FileNamingScheme fileNamingScheme) {
+ super(source, target, fileNamingScheme);
+ }
+
+ @Override
+ public PTableType<K, V> getTableType() {
+ return ((TableSource<K, V>) source).getTableType();
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/impl/TableSourceTargetImpl.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/impl/TableSourceTargetImpl.java b/crunch-core/src/main/java/org/apache/crunch/io/impl/TableSourceTargetImpl.java
new file mode 100644
index 0000000..965b0f9
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/impl/TableSourceTargetImpl.java
@@ -0,0 +1,35 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.impl;
+
+import org.apache.crunch.Pair;
+import org.apache.crunch.TableSource;
+import org.apache.crunch.Target;
+import org.apache.crunch.types.PTableType;
+
+public class TableSourceTargetImpl<K, V> extends SourceTargetImpl<Pair<K, V>> implements TableSource<K, V> {
+
+ public TableSourceTargetImpl(TableSource<K, V> source, Target target) {
+ super(source, target);
+ }
+
+ @Override
+ public PTableType<K, V> getTableType() {
+ return ((TableSource<K, V>) source).getTableType();
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/package-info.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/package-info.java b/crunch-core/src/main/java/org/apache/crunch/io/package-info.java
new file mode 100644
index 0000000..022bc99
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/package-info.java
@@ -0,0 +1,22 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Data input and output for Pipelines.
+ */
+package org.apache.crunch.io;
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileHelper.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileHelper.java b/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileHelper.java
new file mode 100644
index 0000000..ba07506
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileHelper.java
@@ -0,0 +1,35 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.seq;
+
+import org.apache.crunch.MapFn;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.writable.WritableType;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.util.ReflectionUtils;
+
+class SeqFileHelper {
+ static <T> Writable newInstance(PType<T> ptype, Configuration conf) {
+ return (Writable) ReflectionUtils.newInstance(((WritableType) ptype).getSerializationClass(), conf);
+ }
+
+ static <T> MapFn<Object, T> getInputMapFn(PType<T> ptype) {
+ return ptype.getInputMapFn();
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileReaderFactory.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileReaderFactory.java b/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileReaderFactory.java
new file mode 100644
index 0000000..3f45644
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileReaderFactory.java
@@ -0,0 +1,112 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.seq;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.fn.IdentityFn;
+import org.apache.crunch.io.FileReaderFactory;
+import org.apache.crunch.io.impl.AutoClosingIterator;
+import org.apache.crunch.types.Converter;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.writable.Writables;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.util.ReflectionUtils;
+
+import com.google.common.collect.Iterators;
+import com.google.common.collect.UnmodifiableIterator;
+
+/**
+ * A {@link FileReaderFactory} that iterates over the records of a Hadoop
+ * {@link SequenceFile}, converting each key/value pair with the {@link Converter} and
+ * input {@link MapFn} of the configured {@link PType}.
+ *
+ * <p>The {@code key} and {@code value} holders are created once per factory and are
+ * overwritten by every {@code SequenceFile.Reader.next} call, so all iterators handed
+ * out by one factory share them.
+ */
+public class SeqFileReaderFactory<T> implements FileReaderFactory<T> {
+
+  private static final Log LOG = LogFactory.getLog(SeqFileReaderFactory.class);
+
+  private final Converter converter;
+  private final MapFn<Object, T> mapFn;
+  private final Writable key;
+  private final Writable value;
+
+  /**
+   * Builds a factory for the given type. For a {@link PTableType} the key and value
+   * holders come from the table's key and value types; otherwise the key is a
+   * {@link NullWritable} and only the value holder is instantiated from {@code ptype}.
+   */
+  public SeqFileReaderFactory(PType<T> ptype) {
+    this.converter = ptype.getConverter();
+    this.mapFn = ptype.getInputMapFn();
+    if (ptype instanceof PTableType) {
+      PTableType ptt = (PTableType) ptype;
+      this.key = SeqFileHelper.newInstance(ptt.getKeyType(), null);
+      this.value = SeqFileHelper.newInstance(ptt.getValueType(), null);
+    } else {
+      this.key = NullWritable.get();
+      this.value = SeqFileHelper.newInstance(ptype, null);
+    }
+  }
+
+  /**
+   * Builds a factory for values of the given writable class, read with a
+   * {@link NullWritable} key.
+   */
+  public SeqFileReaderFactory(Class clazz) {
+    PType<T> ptype = Writables.writables(clazz);
+    this.converter = ptype.getConverter();
+    this.mapFn = ptype.getInputMapFn();
+    this.key = NullWritable.get();
+    this.value = (Writable) ReflectionUtils.newInstance(clazz, null);
+  }
+
+  /**
+   * Opens the sequence file at {@code path} and returns an iterator over its records.
+   * The reader is handed to an {@link AutoClosingIterator} together with the iterator.
+   *
+   * <p>If the file cannot be opened, the failure is logged and an empty iterator is
+   * returned. A read failure while iterating is logged and reported as "no more
+   * elements".
+   */
+  @Override
+  public Iterator<T> read(FileSystem fs, final Path path) {
+    mapFn.initialize();
+    try {
+      final SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());
+      return new AutoClosingIterator<T>(reader, new UnmodifiableIterator<T>() {
+        // true when hasNext holds the answer for the record currently in key/value
+        boolean nextChecked = false;
+        boolean hasNext = false;
+
+        @Override
+        public boolean hasNext() {
+          if (nextChecked) {
+            return hasNext;
+          }
+          try {
+            hasNext = reader.next(key, value);
+            nextChecked = true;
+            return hasNext;
+          } catch (IOException e) {
+            LOG.info("Error reading from path: " + path, e);
+            return false;
+          }
+        }
+
+        @Override
+        public T next() {
+          // Always consult hasNext(): it returns the cached answer when one exists, so a
+          // call made after hasNext() already reported exhaustion is rejected instead of
+          // re-converting the stale key/value left over from the last record.
+          if (!hasNext()) {
+            throw new java.util.NoSuchElementException("No more records in " + path);
+          }
+          nextChecked = false;
+          return mapFn.map(converter.convertInput(key, value));
+        }
+      });
+    } catch (IOException e) {
+      LOG.info("Could not read seqfile at path: " + path, e);
+      return Iterators.emptyIterator();
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileSource.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileSource.java b/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileSource.java
new file mode 100644
index 0000000..8fac4ae
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileSource.java
@@ -0,0 +1,47 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.seq;
+
+import java.io.IOException;
+
+import org.apache.crunch.io.CompositePathIterable;
+import org.apache.crunch.io.ReadableSource;
+import org.apache.crunch.io.impl.FileSourceImpl;
+import org.apache.crunch.types.PType;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+
+/**
+ * A file source that uses {@link SequenceFileInputFormat} as its input format and can
+ * also be read directly through a {@link SeqFileReaderFactory}.
+ */
+public class SeqFileSource<T> extends FileSourceImpl<T> implements ReadableSource<T> {
+
+  /**
+   * @param path the location of the sequence file(s)
+   * @param ptype the type of the records stored at {@code path}
+   */
+  public SeqFileSource(Path path, PType<T> ptype) {
+    super(path, ptype, SequenceFileInputFormat.class);
+  }
+
+  /**
+   * Resolves the file system of this source's path from {@code conf} and returns an
+   * iterable over the records found there.
+   *
+   * @throws IOException if the file system for the path cannot be obtained
+   */
+  @Override
+  public Iterable<T> read(Configuration conf) throws IOException {
+    // Arguments are evaluated left to right, so the file system is resolved first.
+    return CompositePathIterable.create(path.getFileSystem(conf), path, new SeqFileReaderFactory<T>(ptype));
+  }
+
+  @Override
+  public String toString() {
+    String location = path.toString();
+    return "SeqFile(" + location + ")";
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileSourceTarget.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileSourceTarget.java b/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileSourceTarget.java
new file mode 100644
index 0000000..adc739f
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileSourceTarget.java
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.seq;
+
+import org.apache.crunch.io.FileNamingScheme;
+import org.apache.crunch.io.SequentialFileNamingScheme;
+import org.apache.crunch.io.impl.ReadableSourcePathTargetImpl;
+import org.apache.crunch.types.PType;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * Pairs a {@link SeqFileSource} with a {@link SeqFileTarget} that both refer to the
+ * same path.
+ */
+public class SeqFileSourceTarget<T> extends ReadableSourcePathTargetImpl<T> {
+
+ /** Same as the {@code Path} variant, with {@code path} wrapped in a {@link Path}. */
+ public SeqFileSourceTarget(String path, PType<T> ptype) {
+ this(new Path(path), ptype);
+ }
+
+ /** Uses a new {@link SequentialFileNamingScheme} as the file naming scheme. */
+ public SeqFileSourceTarget(Path path, PType<T> ptype) {
+ this(path, ptype, new SequentialFileNamingScheme());
+ }
+
+ /**
+ * Creates the source and the target for {@code path} and passes them, together with
+ * {@code fileNamingScheme}, to the superclass constructor.
+ */
+ public SeqFileSourceTarget(Path path, PType<T> ptype, FileNamingScheme fileNamingScheme) {
+ super(new SeqFileSource<T>(path, ptype), new SeqFileTarget(path), fileNamingScheme);
+ }
+
+ /** Delegates to the {@code toString()} of the inherited {@code target}. */
+ @Override
+ public String toString() {
+ return target.toString();
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileTableSource.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileTableSource.java b/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileTableSource.java
new file mode 100644
index 0000000..7a63272
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileTableSource.java
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.seq;
+
+import java.io.IOException;
+
+import org.apache.crunch.Pair;
+import org.apache.crunch.io.CompositePathIterable;
+import org.apache.crunch.io.ReadableSource;
+import org.apache.crunch.io.impl.FileTableSourceImpl;
+import org.apache.crunch.types.PTableType;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+
+/**
+ * A {@code TableSource} that uses {@code SequenceFileInputFormat} to read the input
+ * file, and that can also be read directly as an iterable of key/value {@link Pair}s.
+ */
+public class SeqFileTableSource<K, V> extends FileTableSourceImpl<K, V> implements ReadableSource<Pair<K, V>> {
+
+  /** Same as the {@code Path} variant, with {@code path} wrapped in a {@link Path}. */
+  public SeqFileTableSource(String path, PTableType<K, V> ptype) {
+    this(new Path(path), ptype);
+  }
+
+  /**
+   * @param path the location of the sequence file(s)
+   * @param ptype the table type of the stored key/value pairs
+   */
+  public SeqFileTableSource(Path path, PTableType<K, V> ptype) {
+    super(path, ptype, SequenceFileInputFormat.class);
+  }
+
+  /**
+   * Resolves the file system of this source's path from {@code conf} and returns an
+   * iterable over the key/value pairs found there.
+   *
+   * @throws IOException if the file system for the path cannot be obtained
+   */
+  @Override
+  public Iterable<Pair<K, V>> read(Configuration conf) throws IOException {
+    FileSystem fileSystem = path.getFileSystem(conf);
+    SeqFileReaderFactory<Pair<K, V>> readerFactory = new SeqFileReaderFactory<Pair<K, V>>(getTableType());
+    return CompositePathIterable.create(fileSystem, path, readerFactory);
+  }
+
+  @Override
+  public String toString() {
+    String location = path.toString();
+    return "SeqFile(" + location + ")";
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileTableSourceTarget.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileTableSourceTarget.java b/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileTableSourceTarget.java
new file mode 100644
index 0000000..ebdf319
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileTableSourceTarget.java
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.seq;
+
+import org.apache.crunch.Pair;
+import org.apache.crunch.TableSourceTarget;
+import org.apache.crunch.io.FileNamingScheme;
+import org.apache.crunch.io.SequentialFileNamingScheme;
+import org.apache.crunch.io.impl.ReadableSourcePathTargetImpl;
+import org.apache.crunch.types.PTableType;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * Pairs a {@link SeqFileTableSource} with a {@link SeqFileTarget} that both refer to
+ * the same path, and remembers the table type it was created with.
+ */
+public class SeqFileTableSourceTarget<K, V> extends ReadableSourcePathTargetImpl<Pair<K, V>> implements
+ TableSourceTarget<K, V> {
+ // Table type supplied at construction; returned as-is by getTableType().
+ private final PTableType<K, V> tableType;
+
+ /** Same as the {@code Path} variant, with {@code path} wrapped in a {@link Path}. */
+ public SeqFileTableSourceTarget(String path, PTableType<K, V> tableType) {
+ this(new Path(path), tableType);
+ }
+
+ /** Uses a new {@link SequentialFileNamingScheme} as the file naming scheme. */
+ public SeqFileTableSourceTarget(Path path, PTableType<K, V> tableType) {
+ this(path, tableType, new SequentialFileNamingScheme());
+ }
+
+ /**
+ * Creates the table source and the target for {@code path} and passes them, together
+ * with {@code fileNamingScheme}, to the superclass constructor.
+ */
+ public SeqFileTableSourceTarget(Path path, PTableType<K, V> tableType, FileNamingScheme fileNamingScheme) {
+ super(new SeqFileTableSource<K, V>(path, tableType), new SeqFileTarget(path), fileNamingScheme);
+ this.tableType = tableType;
+ }
+
+ /** Returns the table type passed to the constructor. */
+ @Override
+ public PTableType<K, V> getTableType() {
+ return tableType;
+ }
+
+ /** Delegates to the {@code toString()} of the inherited {@code target}. */
+ @Override
+ public String toString() {
+ return target.toString();
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileTarget.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileTarget.java b/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileTarget.java
new file mode 100644
index 0000000..60e4739
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/seq/SeqFileTarget.java
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.seq;
+
+import org.apache.crunch.SourceTarget;
+import org.apache.crunch.io.FileNamingScheme;
+import org.apache.crunch.io.SequentialFileNamingScheme;
+import org.apache.crunch.io.impl.FileTargetImpl;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+
+public class SeqFileTarget extends FileTargetImpl {
+ public SeqFileTarget(String path) {
+ this(new Path(path));
+ }
+
+ public SeqFileTarget(Path path) {
+ this(path, new SequentialFileNamingScheme());
+ }
+
+ public SeqFileTarget(Path path, FileNamingScheme fileNamingScheme) {
+ super(path, SequenceFileOutputFormat.class, fileNamingScheme);
+ }
+
+ @Override
+ public String toString() {
+ return "SeqFile(" + path.toString() + ")";
+ }
+
+ @Override
+ public <T> SourceTarget<T> asSourceTarget(PType<T> ptype) {
+ if (ptype instanceof PTableType) {
+ return new SeqFileTableSourceTarget(path, (PTableType) ptype);
+ } else {
+ return new SeqFileSourceTarget(path, ptype);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/text/BZip2TextInputFormat.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/text/BZip2TextInputFormat.java b/crunch-core/src/main/java/org/apache/crunch/io/text/BZip2TextInputFormat.java
new file mode 100644
index 0000000..67a8870
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/text/BZip2TextInputFormat.java
@@ -0,0 +1,235 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.text;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+
+/**
+ * A {@link FileInputFormat} that reads bzip2-compressed text line by line, producing
+ * {@link LongWritable} keys and {@link Text} values. Files are reported as splittable.
+ */
+class BZip2TextInputFormat extends FileInputFormat<LongWritable, Text> {
+  /**
+   * Treats keys as offset in file and value as line. Since the input file is
+   * compressed, the offset for a particular line is not well-defined. This
+   * implementation returns the starting position of a compressed block as the
+   * key for every line in that block.
+   */
+  private static class BZip2LineRecordReader extends RecordReader<LongWritable, Text> {
+
+    private long start;
+
+    private long end;
+
+    private long pos;
+
+    private CBZip2InputStream in;
+
+    private ByteArrayOutputStream buffer = new ByteArrayOutputStream(256);
+
+    // flag to indicate if previous character read was Carriage Return ('\r')
+    // and the next character was not Line Feed ('\n')
+    private boolean CRFollowedByNonLF = false;
+
+    // in the case where a Carriage Return ('\r') was not followed by a
+    // Line Feed ('\n'), this variable will hold that non Line Feed character
+    // that was read from the underlying stream. It is kept as the raw int
+    // returned by read() so that a 0xFF data byte (255) stays distinct from
+    // the end-of-stream marker (-1); narrowing it to a byte made the two
+    // indistinguishable and ended the split early.
+    private int nonLFChar;
+
+    /**
+     * Provide a bridge to get the bytes from the ByteArrayOutputStream without
+     * creating a new byte array.
+     */
+    private static class TextStuffer extends OutputStream {
+      public Text target;
+
+      @Override
+      public void write(int b) {
+        throw new UnsupportedOperationException("write(byte) not supported");
+      }
+
+      @Override
+      public void write(byte[] data, int offset, int len) throws IOException {
+        target.clear();
+        target.set(data, offset, len);
+      }
+    }
+
+    private TextStuffer bridge = new TextStuffer();
+
+    private LongWritable key = new LongWritable();
+    private Text value = new Text();
+
+    /**
+     * Opens the split's file, seeks to the split start and wraps the stream in a
+     * {@link CBZip2InputStream}. When the split does not begin at offset 0, the first
+     * (partial) line is skipped and {@code start} is moved to the stream position
+     * after it.
+     *
+     * @throws IOException if the file cannot be opened, positioned or read
+     */
+    public BZip2LineRecordReader(Configuration job, FileSplit split) throws IOException {
+      start = split.getStart();
+      end = start + split.getLength();
+      final Path file = split.getPath();
+
+      // open the file and seek to the start of the split
+      FileSystem fs = file.getFileSystem(job);
+      FSDataInputStream fileIn = fs.open(split.getPath());
+      fileIn.seek(start);
+
+      in = new CBZip2InputStream(fileIn, 9, end);
+      if (start != 0) {
+        // skip first line and re-establish "start".
+        readLine(this.in, null);
+        start = in.getPos();
+      }
+      pos = in.getPos();
+    }
+
+    /*
+     * LineRecordReader.readLine() is deprecated in Hadoop 0.17, so it is added
+     * here locally.
+     *
+     * Reads up to and including the next line terminator ("\n", "\r" or "\r\n"),
+     * writing the line's bytes (without the terminator) to out when out is not
+     * null. Returns the number of bytes consumed for this line, 0 at end of stream.
+     */
+    private long readLine(InputStream in, OutputStream out) throws IOException {
+      long bytes = 0;
+      while (true) {
+        int b = -1;
+        if (CRFollowedByNonLF) {
+          // In the previous call, a Carriage Return ('\r') was followed
+          // by a non Line Feed ('\n') character - in that call we would
+          // have not returned the non Line Feed character but would have
+          // read it from the stream - lets use that already read character
+          // now
+          b = nonLFChar;
+          CRFollowedByNonLF = false;
+        } else {
+          b = in.read();
+        }
+        if (b == -1) {
+          break;
+        }
+        bytes += 1;
+
+        byte c = (byte) b;
+        if (c == '\n') {
+          break;
+        }
+
+        if (c == '\r') {
+          // Look one character ahead without narrowing it: -1 (end of stream) is
+          // remembered like any other non-LF value and ends the next call.
+          int nextC = in.read();
+          if (nextC != '\n') {
+            CRFollowedByNonLF = true;
+            nonLFChar = nextC;
+          } else {
+            bytes += 1;
+          }
+          break;
+        }
+
+        if (out != null) {
+          out.write(c);
+        }
+      }
+      return bytes;
+    }
+
+    /**
+     * Read a line.
+     *
+     * @return false once the position has passed the end of the split or no more
+     *         bytes could be read; true when {@code key} and {@code value} were filled
+     */
+    public boolean next(LongWritable key, Text value) throws IOException {
+      if (pos > end) {
+        return false;
+      }
+
+      key.set(pos); // key is position
+      buffer.reset();
+      long bytesRead = readLine(in, buffer);
+      if (bytesRead == 0) {
+        return false;
+      }
+      pos = in.getPos();
+      // if we have read ahead because we encountered a carriage return
+      // char followed by a non line feed char, decrement the pos
+      if (CRFollowedByNonLF) {
+        pos--;
+      }
+
+      bridge.target = value;
+      buffer.writeTo(bridge);
+      return true;
+    }
+
+    /**
+     * Get the progress within the split
+     */
+    @Override
+    public float getProgress() {
+      if (start == end) {
+        return 0.0f;
+      } else {
+        return Math.min(1.0f, (pos - start) / (float) (end - start));
+      }
+    }
+
+    @Override
+    public void close() throws IOException {
+      in.close();
+    }
+
+    @Override
+    public LongWritable getCurrentKey() throws IOException, InterruptedException {
+      return key;
+    }
+
+    @Override
+    public Text getCurrentValue() throws IOException, InterruptedException {
+      return value;
+    }
+
+    @Override
+    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
+      // no op: all set-up happens in the constructor
+    }
+
+    @Override
+    public boolean nextKeyValue() throws IOException, InterruptedException {
+      return next(key, value);
+    }
+
+  }
+
+  @Override
+  protected boolean isSplitable(JobContext context, Path file) {
+    return true;
+  }
+
+  /**
+   * Creates a line reader for the given split. An {@link IOException} raised while
+   * opening the split is rethrown wrapped in a {@link RuntimeException}.
+   */
+  @Override
+  public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
+    try {
+      return new BZip2LineRecordReader(context.getConfiguration(), (FileSplit) split);
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/text/CBZip2InputStream.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/text/CBZip2InputStream.java b/crunch-core/src/main/java/org/apache/crunch/io/text/CBZip2InputStream.java
new file mode 100644
index 0000000..92bb787
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/text/CBZip2InputStream.java
@@ -0,0 +1,980 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.text;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.io.compress.bzip2.BZip2Constants;
+import org.apache.hadoop.mapreduce.InputSplit;
+
+/**
+ * An input stream that decompresses from the BZip2 format (without the file
+ * header chars) to be read as any other stream.
+ *
+ * @author <a href="mailto:keiron@aftexsw.com">Keiron Liddle</a>
+ */
+class CBZip2InputStream extends InputStream implements BZip2Constants {
+ /**
+  * Always throws: reports a decoding failure as an {@link IOException}
+  * carrying {@code reason} as its message. Never returns normally.
+  */
+ private static void cadvise(String reason) throws IOException {
+ throw new IOException(reason);
+ }
+
+ /**
+  * Always throws an {@link IOException} with the message
+  * "compressedStream EOF" (via {@code cadvise}).
+  */
+ private static void compressedStreamEOF() throws IOException {
+ cadvise("compressedStream EOF");
+ }
+
+ /**
+  * Rebuilds the byte-value translation tables from {@code inUse}.
+  * Every byte value flagged in {@code inUse} receives the next free sequence
+  * number; {@code seqToUnseq} maps sequence number to byte value,
+  * {@code unseqToSeq} maps byte value to sequence number, and {@code nInUse}
+  * ends up holding the number of flagged byte values.
+  */
+ private void makeMaps() {
+   nInUse = 0;
+   for (int byteValue = 0; byteValue < 256; byteValue++) {
+     if (!inUse[byteValue]) {
+       continue;
+     }
+     seqToUnseq[nInUse] = (char) byteValue;
+     unseqToSeq[byteValue] = (char) nInUse;
+     nInUse++;
+   }
+ }
+
+ /*
+ * index of the last char in the block, so the block size == last + 1.
+ */
+ private int last;
+
+ /*
+ * index in zptr[] of original string after sorting.
+ */
+ private int origPtr;
+
+ /*
+ * always: in the range 0 .. 9. The current block size is 100000 * this
+ * number.
+ */
+ private int blockSize100k;
+
+ private boolean blockRandomised;
+
+ // a buffer to keep the read byte
+ private int bsBuff;
+
+ // since bzip is bit-aligned at block boundaries there can be a case wherein
+ // only few bits out of a read byte are consumed and the remaining bits
+ // need to be consumed while processing the next block.
+ // indicate how many bits in bsBuff have not been processed yet
+ private int bsLive;
+ private CRC mCrc = new CRC();
+
+ private boolean[] inUse = new boolean[256];
+ private int nInUse;
+
+ private char[] seqToUnseq = new char[256];
+ private char[] unseqToSeq = new char[256];
+
+ private char[] selector = new char[MAX_SELECTORS];
+ private char[] selectorMtf = new char[MAX_SELECTORS];
+
+ private int[] tt;
+ private char[] ll8;
+
+ /*
+ * freq table collected to save a pass over the data during decompression.
+ */
+ private int[] unzftab = new int[256];
+
+ private int[][] limit = new int[N_GROUPS][MAX_ALPHA_SIZE];
+ private int[][] base = new int[N_GROUPS][MAX_ALPHA_SIZE];
+ private int[][] perm = new int[N_GROUPS][MAX_ALPHA_SIZE];
+ private int[] minLens = new int[N_GROUPS];
+
+ private FSDataInputStream innerBsStream;
+ long readLimit = Long.MAX_VALUE;
+
+ /**
+  * Returns the current read limit: {@code initBlock} ends the stream instead
+  * of starting another block once {@code readCount} has reached this value.
+  */
+ public long getReadLimit() {
+ return readLimit;
+ }
+
+ /**
+  * Sets the read limit compared against {@code readCount} at the start of
+  * {@code initBlock}; the default is {@code Long.MAX_VALUE}.
+  */
+ public void setReadLimit(long readLimit) {
+ this.readLimit = readLimit;
+ }
+
+ long readCount;
+
+ /**
+  * Returns how many times {@code readBs} has been invoked, i.e. the number of
+  * single-byte reads attempted on the underlying stream (a read that hit
+  * end-of-stream is counted too).
+  */
+ public long getReadCount() {
+ return readCount;
+ }
+
+ private boolean streamEnd = false;
+
+ private int currentChar = -1;
+
+ private static final int START_BLOCK_STATE = 1;
+ private static final int RAND_PART_A_STATE = 2;
+ private static final int RAND_PART_B_STATE = 3;
+ private static final int RAND_PART_C_STATE = 4;
+ private static final int NO_RAND_PART_A_STATE = 5;
+ private static final int NO_RAND_PART_B_STATE = 6;
+ private static final int NO_RAND_PART_C_STATE = 7;
+
+ private int currentState = START_BLOCK_STATE;
+
+ private int storedBlockCRC, storedCombinedCRC;
+ private int computedBlockCRC, computedCombinedCRC;
+ private boolean checkComputedCombinedCRC = true;
+
+ // Output-stage state shared by the setup*Part* methods:
+ // i2 - number of positions of the current block consumed so far (compared with last)
+ // count - length of the current run of equal bytes (a fourth equal byte triggers run expansion)
+ // chPrev/ch2 - previously emitted byte and byte currently being emitted
+ int i2, count, chPrev, ch2;
+ // i - loop counter used by setupBlock(); tPos - current index into tt[] / ll8[]
+ int i, tPos;
+ // rNToGo/rTPos - countdown and index into rNums[] used only for randomised blocks
+ int rNToGo = 0;
+ int rTPos = 0;
+ // j2 - how many repeats of the current run (of length z) have been emitted
+ int j2;
+ // z - repeat count read from the block after four equal bytes
+ char z;
+
+ // see comment in getPos()
+ private long retPos = -1;
+ // the position offset which corresponds to the end of the InputSplit that
+ // will be processed by this instance
+ private long endOffsetOfSplit;
+
+ private boolean signalToStopReading;
+
+ /**
+  * Creates a decompressing stream over {@code zStream} and decodes its first
+  * block so that {@link #read()} can start returning bytes.
+  *
+  * @param zStream   the compressed input; its current position is recorded as
+  *                  the initial {@link #getPos()} value
+  * @param blockSize {@code -1} to read and validate the {@code BZh<n>} file
+  *                  header (the combined CRC is then verified at end of
+  *                  stream and the first block header is expected right
+  *                  after the file header); any other value is used as the
+  *                  block size in units of 100k, no file header is read, and
+  *                  the first block header is located by scanning the input
+  *                  bit by bit
+  * @param end       offset of the end of the split handled by this instance
+  * @throws IOException if the underlying stream fails or the compressed data
+  *                     is malformed
+  */
+ public CBZip2InputStream(FSDataInputStream zStream, int blockSize, long end) throws IOException {
+   endOffsetOfSplit = end;
+   // initialize retPos to the beginning of the current InputSplit
+   // see comments in getPos() to understand how this is used.
+   retPos = zStream.getPos();
+   ll8 = null;
+   tt = null;
+   checkComputedCombinedCRC = blockSize == -1;
+   bsSetStream(zStream);
+   initialize(blockSize);
+   if (streamEnd) {
+     // initialize() rejected the file header: it has already released the
+     // underlying stream and marked this stream as finished. Decoding a block
+     // now would dereference the released stream (NullPointerException), so
+     // stop here and let read() report end-of-stream.
+     return;
+   }
+   initBlock(blockSize != -1);
+   setupBlock();
+ }
+
+ /**
+  * Returns the next decompressed byte, or {@code -1} once the stream has
+  * ended. The byte handed back is the one prepared by the previous step of
+  * the decoder; this call then advances the decoder so that the following
+  * byte is ready.
+  */
+ @Override
+ public int read() throws IOException {
+   if (streamEnd) {
+     return -1;
+   }
+
+   // Once a block starting at or beyond the end of the split has been seen,
+   // push the reported position past the split end. The caller then finishes
+   // the record it is reading and stops; the remainder of that block is left
+   // to the task that owns the next split.
+   if (signalToStopReading) {
+     retPos = endOffsetOfSplit + 1;
+   }
+
+   final int result = currentChar;
+   // Only the B and C states have follow-up work; every other state leaves
+   // the decoder untouched.
+   switch (currentState) {
+   case RAND_PART_B_STATE:
+     setupRandPartB();
+     break;
+   case RAND_PART_C_STATE:
+     setupRandPartC();
+     break;
+   case NO_RAND_PART_B_STATE:
+     setupNoRandPartB();
+     break;
+   case NO_RAND_PART_C_STATE:
+     setupNoRandPartC();
+     break;
+   default:
+     break;
+   }
+   return result;
+ }
+
+ /**
+  * getPos is used by the caller to know when the processing of the current
+  * {@link InputSplit} is complete. In this method, as we read each bzip block,
+  * we keep returning the beginning of the {@link InputSplit} as the return
+  * value until we hit a block which starts at a position >= end of current
+  * split. At that point we should set up retpos such that after a record is
+  * read, future getPos() calls will get a value > end of current split - this
+  * way we will read only one record out of that bzip block - the rest of the
+  * records from that bzip block should be read by the next map task while
+  * processing the next split
+  *
+  * @return the stream position captured by the constructor, or
+  *         {@code endOffsetOfSplit + 1} once {@link #read()} has been called
+  *         after a block starting at or past the split end was encountered
+  * @throws IOException never thrown by this implementation; declared only
+  */
+ public long getPos() throws IOException {
+ return retPos;
+ }
+
+ /**
+  * Prepares the decoder for the given block size.
+  * When {@code blockSize} is {@code -1} the four-byte file header is read
+  * first: it must be {@code 'B' 'Z' 'h'} followed by a digit {@code '1'..'9'},
+  * and that digit becomes the block size. An invalid header releases the
+  * stream and marks this stream as ended without allocating anything.
+  *
+  * @param blockSize block size in units of 100k, or {@code -1} to take it
+  *                  from the file header
+  */
+ private void initialize(int blockSize) throws IOException {
+   if (blockSize == -1) {
+     final char m1 = bsGetUChar();
+     final char m2 = bsGetUChar();
+     final char m3 = bsGetUChar();
+     final char m4 = bsGetUChar();
+     final boolean headerOk = m1 == 'B' && m2 == 'Z' && m3 == 'h' && m4 >= '1' && m4 <= '9';
+     if (!headerOk) {
+       bsFinishedWithStream();
+       streamEnd = true;
+       return;
+     }
+     blockSize = m4 - '0';
+   }
+
+   setDecompressStructureSizes(blockSize);
+   computedCombinedCRC = 0;
+ }
+
+ // 48-bit window mask used while scanning the input bit by bit for a magic
+ private final static long mask = 0xffffffffffffL;
+ // six-byte magic that starts a block (same bytes initBlock checks one by one)
+ private final static long eob = 0x314159265359L & mask;
+ // six-byte magic that marks the end of the stream
+ private final static long eos = 0x177245385090L & mask;
+
+ /**
+  * Locates and decodes the next compressed block.
+  * <p>
+  * If {@code readCount} has reached {@code readLimit} the stream is released
+  * and marked as ended. Otherwise the next six-byte magic is obtained: read
+  * directly when {@code searchForMagic} is false, or found by sliding a
+  * 48-bit window over the input one bit at a time when it is true. An
+  * end-of-stream magic hands over to {@link #complete()}. For a block magic
+  * the method records whether the block starts at or beyond the end of the
+  * split, reads the stored block CRC and the randomised flag, decodes the
+  * block contents and resets the running CRC and the output state.
+  *
+  * @param searchForMagic {@code true} to scan bit by bit for the next magic,
+  *                       {@code false} to require it at the current position
+  * @throws IOException on a bad block header, premature end of input or a
+  *                     failure of the underlying stream
+  */
+ private void initBlock(boolean searchForMagic) throws IOException {
+ if (readCount >= readLimit) {
+ bsFinishedWithStream();
+ streamEnd = true;
+ return;
+ }
+
+ // position before beginning of bzip block header
+ long pos = innerBsStream.getPos();
+ if (!searchForMagic) {
+ char magic1, magic2, magic3, magic4;
+ char magic5, magic6;
+ magic1 = bsGetUChar();
+ magic2 = bsGetUChar();
+ magic3 = bsGetUChar();
+ magic4 = bsGetUChar();
+ magic5 = bsGetUChar();
+ magic6 = bsGetUChar();
+ // end-of-stream magic: verify the combined CRC and finish
+ if (magic1 == 0x17 && magic2 == 0x72 && magic3 == 0x45 && magic4 == 0x38 && magic5 == 0x50 && magic6 == 0x90) {
+ complete();
+ return;
+ }
+
+ if (magic1 != 0x31 || magic2 != 0x41 || magic3 != 0x59 || magic4 != 0x26 || magic5 != 0x53 || magic6 != 0x59) {
+ // badBlockHeader() always throws, so the two statements after it are
+ // never reached
+ badBlockHeader();
+ streamEnd = true;
+ return;
+ }
+ } else {
+ long magic = 0;
+ for (int i = 0; i < 6; i++) {
+ magic <<= 8;
+ magic |= bsGetUChar();
+ }
+ while (magic != eos && magic != eob) {
+ magic <<= 1;
+ magic &= mask;
+ magic |= bsR(1);
+ // if we just found the block header, the beginning of the bzip
+ // header would be 6 bytes before the current stream position
+ // when we eventually break from this while(), if it is because
+ // we found a block header then pos will have the correct start
+ // of header position
+ pos = innerBsStream.getPos() - 6;
+ }
+ if (magic == eos) {
+ complete();
+ return;
+ }
+
+ }
+ // if the previous block finished a few bits into the previous byte,
+ // then we will first be reading the remaining bits from the previous
+ // byte - so logically pos needs to be one behind
+ if (bsLive > 0) {
+ pos--;
+ }
+
+ if (pos >= endOffsetOfSplit) {
+ // we have reached a block which begins exactly at the next InputSplit
+ // or >1 byte into the next InputSplit - lets record this fact
+ signalToStopReading = true;
+ }
+ storedBlockCRC = bsGetInt32();
+
+ if (bsR(1) == 1) {
+ blockRandomised = true;
+ } else {
+ blockRandomised = false;
+ }
+
+ // currBlockNo++;
+ getAndMoveToFrontDecode();
+
+ mCrc.initialiseCRC();
+ currentState = START_BLOCK_STATE;
+ }
+
+ /**
+  * Finishes the current block: checks the CRC computed over its decoded
+  * bytes against the CRC stored in the block header, then folds the block
+  * CRC into the running combined CRC (rotate left by one bit, then XOR).
+  *
+  * @throws IOException with message "CRC error" when the block CRC differs
+  */
+ private void endBlock() throws IOException {
+   computedBlockCRC = mCrc.getFinalCRC();
+   /* A bad CRC is considered a fatal error. */
+   if (computedBlockCRC != storedBlockCRC) {
+     crcError();
+   }
+
+   computedCombinedCRC = Integer.rotateLeft(computedCombinedCRC, 1) ^ computedBlockCRC;
+ }
+
+ /**
+  * Handles the end-of-stream trailer: reads the stored combined CRC,
+  * verifies it when combined-CRC checking is enabled, rejects input that
+  * ends before the end of the split, and finally releases the stream and
+  * marks this stream as ended.
+  *
+  * @throws IOException on a combined-CRC mismatch, or when the trailer ends
+  *                     before the end of the split
+  */
+ private void complete() throws IOException {
+   storedCombinedCRC = bsGetInt32();
+   final boolean combinedCrcMismatch = storedCombinedCRC != computedCombinedCRC;
+   if (checkComputedCombinedCRC && combinedCrcMismatch) {
+     crcError();
+   }
+   final long posAfterTrailer = innerBsStream.getPos();
+   if (posAfterTrailer < endOffsetOfSplit) {
+     throw new IOException("Encountered additional bytes in the filesplit past the crc block. "
+         + "Loading of concatenated bz2 files is not supported");
+   }
+   bsFinishedWithStream();
+   streamEnd = true;
+ }
+
+ /** Always throws an {@link IOException} with the message "block overrun". */
+ private static void blockOverrun() throws IOException {
+ cadvise("block overrun");
+ }
+
+ /** Always throws an {@link IOException} with the message "bad block header". */
+ private static void badBlockHeader() throws IOException {
+ cadvise("bad block header");
+ }
+
+ /** Always throws an {@link IOException} with the message "CRC error". */
+ private static void crcError() throws IOException {
+ cadvise("CRC error");
+ }
+
+ /**
+  * Drops the reference to the underlying stream (unless it is
+  * {@code System.in}). The stream itself is not closed here.
+  */
+ private void bsFinishedWithStream() {
+   final boolean releasable = this.innerBsStream != null && this.innerBsStream != System.in;
+   if (releasable) {
+     this.innerBsStream = null;
+   }
+ }
+
+ /**
+  * Attaches the compressed input and empties the bit buffer.
+  *
+  * @param f the stream that subsequent bit reads pull bytes from
+  */
+ private void bsSetStream(FSDataInputStream f) {
+   bsBuff = 0;
+   bsLive = 0;
+   innerBsStream = f;
+ }
+
+ /**
+  * Reads one byte from the underlying stream, counting the attempt in
+  * {@code readCount} first.
+  *
+  * @return the value of {@code innerBsStream.read()} ({@code -1} is treated
+  *         as end of input by the callers)
+  */
+ final private int readBs() throws IOException {
+ readCount++;
+ return innerBsStream.read();
+ }
+
+ /**
+  * Reads the next {@code n} bits of the compressed input, most significant
+  * bit first, refilling the bit buffer one byte at a time as needed.
+  *
+  * @param n number of bits to read
+  * @return the bits as an unsigned value in the low {@code n} bits
+  * @throws IOException "compressedStream EOF" when the input ends first
+  */
+ private int bsR(int n) throws IOException {
+   while (bsLive < n) {
+     final int nextByte = readBs();
+     if (nextByte == -1) {
+       compressedStreamEOF();
+     }
+     bsBuff = (bsBuff << 8) | (nextByte & 0xff);
+     bsLive += 8;
+   }
+
+   // consume the n topmost unread bits
+   bsLive -= n;
+   return (bsBuff >> bsLive) & ((1 << n) - 1);
+ }
+
+ /** Reads the next 8 bits of the compressed input as an unsigned value. */
+ private char bsGetUChar() throws IOException {
+ return (char) bsR(8);
+ }
+
+ /**
+  * Reads the next 32 bits of the compressed input as four successive 8-bit
+  * groups, first group most significant.
+  */
+ private int bsGetint() throws IOException {
+   int assembled = 0;
+   for (int k = 0; k < 4; k++) {
+     assembled = (assembled << 8) | bsR(8);
+   }
+   return assembled;
+ }
+
+ /** Reads a {@code numBits}-bit value; a plain alias for {@code bsR(numBits)}. */
+ private int bsGetIntVS(int numBits) throws IOException {
+ return bsR(numBits);
+ }
+
+ /** Reads a 32-bit value; a plain alias for {@code bsGetint()}. */
+ private int bsGetInt32() throws IOException {
+ return bsGetint();
+ }
+
+ /**
+  * Fills the three decoding tables of one coding group from the per-symbol
+  * code lengths.
+  *
+  * @param limit     output: for each code length between {@code minLen} and
+  *                  {@code maxLen}, the threshold the decoder compares the
+  *                  bits read so far against
+  * @param base      output: per code length, the offset subtracted from the
+  *                  code value to obtain an index into {@code perm}
+  * @param perm      output: symbols ordered by increasing code length (and by
+  *                  symbol number within one length)
+  * @param length    input: code length of each of the {@code alphaSize} symbols
+  * @param minLen    shortest code length present in {@code length}
+  * @param maxLen    longest code length present in {@code length}
+  * @param alphaSize number of symbols in the alphabet
+  */
+ private void hbCreateDecodeTables(int[] limit, int[] base, int[] perm, char[] length, int minLen, int maxLen,
+ int alphaSize) {
+ int pp, i, j, vec;
+
+ // perm: symbols sorted by code length
+ pp = 0;
+ for (i = minLen; i <= maxLen; i++) {
+ for (j = 0; j < alphaSize; j++) {
+ if (length[j] == i) {
+ perm[pp] = j;
+ pp++;
+ }
+ }
+ }
+
+ // base[len + 1] first counts the symbols whose code has length len ...
+ for (i = 0; i < MAX_CODE_LEN; i++) {
+ base[i] = 0;
+ }
+ for (i = 0; i < alphaSize; i++) {
+ base[length[i] + 1]++;
+ }
+
+ // ... and is then turned into a running total
+ for (i = 1; i < MAX_CODE_LEN; i++) {
+ base[i] += base[i - 1];
+ }
+
+ for (i = 0; i < MAX_CODE_LEN; i++) {
+ limit[i] = 0;
+ }
+ vec = 0;
+
+ // limit[len]: vec accumulates the number of codes of each length and is
+ // doubled when moving on to the next (one bit longer) length
+ for (i = minLen; i <= maxLen; i++) {
+ vec += (base[i + 1] - base[i]);
+ limit[i] = vec - 1;
+ vec <<= 1;
+ }
+ // rebase so that (code value - base[len]) indexes perm directly
+ for (i = minLen + 1; i <= maxLen; i++) {
+ base[i] = ((limit[i - 1] + 1) << 1) - base[i];
+ }
+ }
+
+ /**
+  * Reads the per-block decoding metadata from the compressed input, in this
+  * order: the map of byte values in use (a 16-bit coarse map followed by one
+  * 16-bit fine map per set coarse bit), the number of coding groups and
+  * selectors, the unary-coded selector list (which is then un-MTF'd into
+  * {@code selector}), and the delta-coded code lengths of every group. The
+  * {@code limit}/{@code base}/{@code perm}/{@code minLens} tables are then
+  * built for each group.
+  */
+ private void recvDecodingTables() throws IOException {
+   final char[][] codeLengths = new char[N_GROUPS][MAX_ALPHA_SIZE];
+   final boolean[] inUse16 = new boolean[16];
+
+   /* Receive the mapping table */
+   for (int range = 0; range < 16; range++) {
+     inUse16[range] = (bsR(1) == 1);
+   }
+
+   for (int byteValue = 0; byteValue < 256; byteValue++) {
+     inUse[byteValue] = false;
+   }
+
+   for (int range = 0; range < 16; range++) {
+     if (!inUse16[range]) {
+       continue;
+     }
+     for (int offset = 0; offset < 16; offset++) {
+       if (bsR(1) == 1) {
+         inUse[range * 16 + offset] = true;
+       }
+     }
+   }
+
+   makeMaps();
+   final int alphaSize = nInUse + 2;
+
+   /* Now the selectors */
+   final int nGroups = bsR(3);
+   final int nSelectors = bsR(15);
+   for (int s = 0; s < nSelectors; s++) {
+     // each selector is unary coded: count the 1 bits before the closing 0
+     int ones = 0;
+     while (bsR(1) == 1) {
+       ones++;
+     }
+     selectorMtf[s] = (char) ones;
+   }
+
+   /* Undo the MTF values for the selectors. */
+   final char[] mtfOrder = new char[N_GROUPS];
+   for (char g = 0; g < nGroups; g++) {
+     mtfOrder[g] = g;
+   }
+   for (int s = 0; s < nSelectors; s++) {
+     char idx = selectorMtf[s];
+     final char picked = mtfOrder[idx];
+     while (idx > 0) {
+       mtfOrder[idx] = mtfOrder[idx - 1];
+       idx--;
+     }
+     mtfOrder[0] = picked;
+     selector[s] = picked;
+   }
+
+   /* Now the coding tables */
+   for (int t = 0; t < nGroups; t++) {
+     int curr = bsR(5);
+     for (int sym = 0; sym < alphaSize; sym++) {
+       // a 1 bit announces an adjustment; the bit after it gives the direction
+       while (bsR(1) == 1) {
+         if (bsR(1) == 0) {
+           curr++;
+         } else {
+           curr--;
+         }
+       }
+       codeLengths[t][sym] = (char) curr;
+     }
+   }
+
+   /* Create the Huffman decoding tables */
+   for (int t = 0; t < nGroups; t++) {
+     int minLen = 32;
+     int maxLen = 0;
+     for (int sym = 0; sym < alphaSize; sym++) {
+       final char len = codeLengths[t][sym];
+       if (len > maxLen) {
+         maxLen = len;
+       }
+       if (len < minLen) {
+         minLen = len;
+       }
+     }
+     hbCreateDecodeTables(limit[t], base[t], perm[t], codeLengths[t], minLen, maxLen, alphaSize);
+     minLens[t] = minLen;
+   }
+ }
+
+ /**
+  * Decodes the body of the current block into {@code ll8}.
+  * <p>
+  * Reads the 24-bit {@code origPtr} and the decoding tables, then decodes
+  * symbols until the end-of-block symbol: RUNA/RUNB sequences expand into a
+  * run of the byte currently at the front of the move-to-front list, any
+  * other symbol selects a byte from that list and moves it to the front. On
+  * return {@code last} is the index of the final byte stored in {@code ll8}
+  * and {@code unzftab} holds the count of each byte value.
+  *
+  * @throws IOException "block overrun" if the decoded data would not fit the
+  *                     block buffer, "compressedStream EOF" if the input ends
+  *                     inside the block, or whatever the underlying stream
+  *                     throws
+  */
+ private void getAndMoveToFrontDecode() throws IOException {
+   final char[] yy = new char[256];
+   final int limitLast = baseBlockSize * blockSize100k;
+   origPtr = bsGetIntVS(24);
+
+   recvDecodingTables();
+   final int endOfBlock = nInUse + 1;
+   int groupNo = -1;
+   int groupPos = 0;
+
+   /*
+    * Setting up the unzftab entries here is not strictly necessary, but it
+    * does save having to do it later in a separate pass, and so saves a
+    * block's worth of cache misses.
+    */
+   for (int i = 0; i <= 255; i++) {
+     unzftab[i] = 0;
+   }
+
+   for (int i = 0; i <= 255; i++) {
+     yy[i] = (char) i;
+   }
+
+   last = -1;
+
+   // The coding table changes every G_SIZE symbols; selector[] names the
+   // table for each such group.
+   if (groupPos == 0) {
+     groupNo++;
+     groupPos = G_SIZE;
+   }
+   groupPos--;
+   int nextSym = decodeSymbol(selector[groupNo]);
+
+   while (nextSym != endOfBlock) {
+     if (nextSym == RUNA || nextSym == RUNB) {
+       // Run length: RUNA contributes 1 and RUNB contributes 2, each weighted
+       // by a factor that doubles with every further run symbol.
+       int s = -1;
+       int weight = 1;
+       do {
+         if (nextSym == RUNA) {
+           s = s + (0 + 1) * weight;
+         } else if (nextSym == RUNB) {
+           s = s + (1 + 1) * weight;
+         }
+         weight = weight * 2;
+
+         if (groupPos == 0) {
+           groupNo++;
+           groupPos = G_SIZE;
+         }
+         groupPos--;
+         nextSym = decodeSymbol(selector[groupNo]);
+       } while (nextSym == RUNA || nextSym == RUNB);
+
+       s++;
+       // Check before writing: the run must fit into the block buffer. Doing
+       // this after the copy (as before) let corrupt input fail with an
+       // ArrayIndexOutOfBoundsException instead of the intended IOException.
+       if ((long) last + s >= limitLast) {
+         blockOverrun();
+       }
+       final char ch = seqToUnseq[yy[0]];
+       unzftab[ch] += s;
+
+       while (s > 0) {
+         last++;
+         ll8[last] = ch;
+         s--;
+       }
+     } else {
+       last++;
+       if (last >= limitLast) {
+         blockOverrun();
+       }
+
+       final char tmp = yy[nextSym - 1];
+       unzftab[seqToUnseq[tmp]]++;
+       ll8[last] = seqToUnseq[tmp];
+
+       /*
+        * This loop is hammered during decompression, hence the unrolling.
+        *
+        * for (j = nextSym-1; j > 0; j--) yy[j] = yy[j-1];
+        */
+       int j = nextSym - 1;
+       for (; j > 3; j -= 4) {
+         yy[j] = yy[j - 1];
+         yy[j - 1] = yy[j - 2];
+         yy[j - 2] = yy[j - 3];
+         yy[j - 3] = yy[j - 4];
+       }
+       for (; j > 0; j--) {
+         yy[j] = yy[j - 1];
+       }
+       yy[0] = tmp;
+
+       if (groupPos == 0) {
+         groupNo++;
+         groupPos = G_SIZE;
+       }
+       groupPos--;
+       nextSym = decodeSymbol(selector[groupNo]);
+     }
+   }
+ }
+
+ /**
+  * Decodes one symbol with coding table {@code zt}: starts with the table's
+  * minimum code length and appends one bit at a time until the value falls
+  * within the limit for the current length.
+  * <p>
+  * All bits are read through {@link #bsR(int)}, so end of input is detected
+  * on every read and I/O failures propagate with their original cause.
+  *
+  * @param zt index of the coding table to use
+  * @return the decoded symbol
+  */
+ private int decodeSymbol(int zt) throws IOException {
+   int zn = minLens[zt];
+   int zvec = bsR(zn);
+   while (zvec > limit[zt][zn]) {
+     zn++;
+     zvec = (zvec << 1) | bsR(1);
+   }
+   return perm[zt][zvec - base[zt][zn]];
+ }
+
+ /**
+  * Prepares the freshly decoded block for output: turns the byte counts in
+  * {@code unzftab} into cumulative start offsets, fills {@code tt} from
+  * {@code ll8} using those offsets, positions {@code tPos} via
+  * {@code origPtr}, resets the run-tracking state and produces the first
+  * output byte through the randomised or non-randomised path.
+  * <p>
+  * Note that the loops use the instance field {@code i} as their counter.
+  */
+ private void setupBlock() throws IOException {
+ int[] cftab = new int[257];
+ char ch;
+
+ // cftab[b] = number of bytes in the block with a value smaller than b
+ cftab[0] = 0;
+ for (i = 1; i <= 256; i++) {
+ cftab[i] = unzftab[i - 1];
+ }
+ for (i = 1; i <= 256; i++) {
+ cftab[i] += cftab[i - 1];
+ }
+
+ // each position i is stored in the next free slot reserved for its byte value
+ for (i = 0; i <= last; i++) {
+ ch = ll8[i];
+ tt[cftab[ch]] = i;
+ cftab[ch]++;
+ }
+ cftab = null;
+
+ tPos = tt[origPtr];
+
+ count = 0;
+ i2 = 0;
+ ch2 = 256; /* not a char and not EOF */
+
+ if (blockRandomised) {
+ rNToGo = 0;
+ rTPos = 0;
+ setupRandPartA();
+ } else {
+ setupNoRandPartA();
+ }
+ }
+
+ /**
+  * Randomised blocks, step A: fetches the next byte of the block, applies the
+  * de-randomisation XOR driven by {@code rNums}, and makes it the current
+  * output byte. When the block is exhausted, the block CRC is verified and
+  * the next block is decoded and set up instead.
+  */
+ private void setupRandPartA() throws IOException {
+   if (i2 > last) {
+     // current block fully consumed
+     endBlock();
+     initBlock(false);
+     setupBlock();
+     return;
+   }
+
+   chPrev = ch2;
+   ch2 = ll8[tPos];
+   tPos = tt[tPos];
+   if (rNToGo == 0) {
+     rNToGo = rNums[rTPos];
+     rTPos++;
+     if (rTPos == 512) {
+       rTPos = 0;
+     }
+   }
+   rNToGo--;
+   ch2 ^= ((rNToGo == 1) ? 1 : 0);
+   i2++;
+
+   currentChar = ch2;
+   currentState = RAND_PART_B_STATE;
+   mCrc.updateCRC(ch2);
+ }
+
+ /**
+  * Non-randomised blocks, step A: fetches the next byte of the block and
+  * makes it the current output byte. When the block is exhausted, the block
+  * CRC is verified and the next block is decoded and set up instead.
+  */
+ private void setupNoRandPartA() throws IOException {
+   if (i2 > last) {
+     // current block fully consumed
+     endBlock();
+     initBlock(false);
+     setupBlock();
+     return;
+   }
+
+   chPrev = ch2;
+   ch2 = ll8[tPos];
+   tPos = tt[tPos];
+   i2++;
+
+   currentChar = ch2;
+   currentState = NO_RAND_PART_B_STATE;
+   mCrc.updateCRC(ch2);
+ }
+
+ /**
+  * Randomised blocks, step B: tracks runs of equal bytes. A byte that differs
+  * from its predecessor restarts the run counter; the fourth equal byte in a
+  * row means the next (de-randomised) byte of the block is a repeat count
+  * {@code z}, which step C then expands. Otherwise decoding continues with
+  * step A.
+  */
+ private void setupRandPartB() throws IOException {
+   if (ch2 != chPrev) {
+     currentState = RAND_PART_A_STATE;
+     count = 1;
+     setupRandPartA();
+     return;
+   }
+
+   count++;
+   if (count < 4) {
+     currentState = RAND_PART_A_STATE;
+     setupRandPartA();
+     return;
+   }
+
+   // four equal bytes seen: the next byte is the repeat count
+   z = ll8[tPos];
+   tPos = tt[tPos];
+   if (rNToGo == 0) {
+     rNToGo = rNums[rTPos];
+     rTPos++;
+     if (rTPos == 512) {
+       rTPos = 0;
+     }
+   }
+   rNToGo--;
+   z ^= ((rNToGo == 1) ? 1 : 0);
+   j2 = 0;
+   currentState = RAND_PART_C_STATE;
+   setupRandPartC();
+ }
+
+ /**
+  * Randomised blocks, step C: emits the current byte again until {@code z}
+  * repeats have been produced, then resumes with step A.
+  */
+ private void setupRandPartC() throws IOException {
+   if (j2 < (int) z) {
+     currentChar = ch2;
+     mCrc.updateCRC(ch2);
+     j2++;
+     return;
+   }
+
+   currentState = RAND_PART_A_STATE;
+   i2++;
+   count = 0;
+   setupRandPartA();
+ }
+
+ /**
+  * Non-randomised blocks, step B: tracks runs of equal bytes. A byte that
+  * differs from its predecessor restarts the run counter; the fourth equal
+  * byte in a row means the next byte of the block is a repeat count
+  * {@code z}, which step C then expands. Otherwise decoding continues with
+  * step A.
+  */
+ private void setupNoRandPartB() throws IOException {
+   if (ch2 != chPrev) {
+     currentState = NO_RAND_PART_A_STATE;
+     count = 1;
+     setupNoRandPartA();
+     return;
+   }
+
+   count++;
+   if (count < 4) {
+     currentState = NO_RAND_PART_A_STATE;
+     setupNoRandPartA();
+     return;
+   }
+
+   // four equal bytes seen: the next byte is the repeat count
+   z = ll8[tPos];
+   tPos = tt[tPos];
+   currentState = NO_RAND_PART_C_STATE;
+   j2 = 0;
+   setupNoRandPartC();
+ }
+
+ /**
+  * Non-randomised blocks, step C: emits the current byte again until
+  * {@code z} repeats have been produced, then resumes with step A.
+  */
+ private void setupNoRandPartC() throws IOException {
+   if (j2 < (int) z) {
+     currentChar = ch2;
+     mCrc.updateCRC(ch2);
+     j2++;
+     return;
+   }
+
+   currentState = NO_RAND_PART_A_STATE;
+   i2++;
+   count = 0;
+   setupNoRandPartA();
+ }
+
+ /**
+  * Records the block size and allocates the block buffers {@code ll8} and
+  * {@code tt} with {@code baseBlockSize * newSize100k} entries each. A size
+  * of {@code 0} leaves the buffers untouched.
+  *
+  * @param newSize100k block size in units of 100k
+  */
+ private void setDecompressStructureSizes(int newSize100k) {
+   // NOTE(review): the 0..9 range of the block size is not enforced here; the
+   // original range test had its "Invalid block size" exception disabled.
+   blockSize100k = newSize100k;
+
+   if (newSize100k != 0) {
+     final int n = baseBlockSize * newSize100k;
+     ll8 = new char[n];
+     tt = new int[n];
+   }
+ }
+
+ /**
+  * Running 32-bit CRC over the decoded bytes of a block, processed one byte
+  * at a time, most significant bit first, using a 256-entry lookup table. The
+  * register starts at all ones and the final value is its bitwise complement.
+  */
+ private static class CRC {
+   /** Lookup table indexed by (top byte of the register XOR input byte). */
+   private static final int[] CRC32_TABLE = { 0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9, 0x130476dc, 0x17c56b6b,
+       0x1a864db2, 0x1e475005, 0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61, 0x350c9b64, 0x31cd86d3, 0x3c8ea00a,
+       0x384fbdbd, 0x4c11db70, 0x48d0c6c7, 0x4593e01e, 0x4152fda9, 0x5f15adac, 0x5bd4b01b, 0x569796c2, 0x52568b75,
+       0x6a1936c8, 0x6ed82b7f, 0x639b0da6, 0x675a1011, 0x791d4014, 0x7ddc5da3, 0x709f7b7a, 0x745e66cd, 0x9823b6e0,
+       0x9ce2ab57, 0x91a18d8e, 0x95609039, 0x8b27c03c, 0x8fe6dd8b, 0x82a5fb52, 0x8664e6e5, 0xbe2b5b58, 0xbaea46ef,
+       0xb7a96036, 0xb3687d81, 0xad2f2d84, 0xa9ee3033, 0xa4ad16ea, 0xa06c0b5d, 0xd4326d90, 0xd0f37027, 0xddb056fe,
+       0xd9714b49, 0xc7361b4c, 0xc3f706fb, 0xceb42022, 0xca753d95, 0xf23a8028, 0xf6fb9d9f, 0xfbb8bb46, 0xff79a6f1,
+       0xe13ef6f4, 0xe5ffeb43, 0xe8bccd9a, 0xec7dd02d, 0x34867077, 0x30476dc0, 0x3d044b19, 0x39c556ae, 0x278206ab,
+       0x23431b1c, 0x2e003dc5, 0x2ac12072, 0x128e9dcf, 0x164f8078, 0x1b0ca6a1, 0x1fcdbb16, 0x018aeb13, 0x054bf6a4,
+       0x0808d07d, 0x0cc9cdca, 0x7897ab07, 0x7c56b6b0, 0x71159069, 0x75d48dde, 0x6b93dddb, 0x6f52c06c, 0x6211e6b5,
+       0x66d0fb02, 0x5e9f46bf, 0x5a5e5b08, 0x571d7dd1, 0x53dc6066, 0x4d9b3063, 0x495a2dd4, 0x44190b0d, 0x40d816ba,
+       0xaca5c697, 0xa864db20, 0xa527fdf9, 0xa1e6e04e, 0xbfa1b04b, 0xbb60adfc, 0xb6238b25, 0xb2e29692, 0x8aad2b2f,
+       0x8e6c3698, 0x832f1041, 0x87ee0df6, 0x99a95df3, 0x9d684044, 0x902b669d, 0x94ea7b2a, 0xe0b41de7, 0xe4750050,
+       0xe9362689, 0xedf73b3e, 0xf3b06b3b, 0xf771768c, 0xfa325055, 0xfef34de2, 0xc6bcf05f, 0xc27dede8, 0xcf3ecb31,
+       0xcbffd686, 0xd5b88683, 0xd1799b34, 0xdc3abded, 0xd8fba05a, 0x690ce0ee, 0x6dcdfd59, 0x608edb80, 0x644fc637,
+       0x7a089632, 0x7ec98b85, 0x738aad5c, 0x774bb0eb, 0x4f040d56, 0x4bc510e1, 0x46863638, 0x42472b8f, 0x5c007b8a,
+       0x58c1663d, 0x558240e4, 0x51435d53, 0x251d3b9e, 0x21dc2629, 0x2c9f00f0, 0x285e1d47, 0x36194d42, 0x32d850f5,
+       0x3f9b762c, 0x3b5a6b9b, 0x0315d626, 0x07d4cb91, 0x0a97ed48, 0x0e56f0ff, 0x1011a0fa, 0x14d0bd4d, 0x19939b94,
+       0x1d528623, 0xf12f560e, 0xf5ee4bb9, 0xf8ad6d60, 0xfc6c70d7, 0xe22b20d2, 0xe6ea3d65, 0xeba91bbc, 0xef68060b,
+       0xd727bbb6, 0xd3e6a601, 0xdea580d8, 0xda649d6f, 0xc423cd6a, 0xc0e2d0dd, 0xcda1f604, 0xc960ebb3, 0xbd3e8d7e,
+       0xb9ff90c9, 0xb4bcb610, 0xb07daba7, 0xae3afba2, 0xaafbe615, 0xa7b8c0cc, 0xa379dd7b, 0x9b3660c6, 0x9ff77d71,
+       0x92b45ba8, 0x9675461f, 0x8832161a, 0x8cf30bad, 0x81b02d74, 0x857130c3, 0x5d8a9099, 0x594b8d2e, 0x5408abf7,
+       0x50c9b640, 0x4e8ee645, 0x4a4ffbf2, 0x470cdd2b, 0x43cdc09c, 0x7b827d21, 0x7f436096, 0x7200464f, 0x76c15bf8,
+       0x68860bfd, 0x6c47164a, 0x61043093, 0x65c52d24, 0x119b4be9, 0x155a565e, 0x18197087, 0x1cd86d30, 0x029f3d35,
+       0x065e2082, 0x0b1d065b, 0x0fdc1bec, 0x3793a651, 0x3352bbe6, 0x3e119d3f, 0x3ad08088, 0x2497d08d, 0x2056cd3a,
+       0x2d15ebe3, 0x29d4f654, 0xc5a92679, 0xc1683bce, 0xcc2b1d17, 0xc8ea00a0, 0xd6ad50a5, 0xd26c4d12, 0xdf2f6bcb,
+       0xdbee767c, 0xe3a1cbc1, 0xe760d676, 0xea23f0af, 0xeee2ed18, 0xf0a5bd1d, 0xf464a0aa, 0xf9278673, 0xfde69bc4,
+       0x89b8fd09, 0x8d79e0be, 0x803ac667, 0x84fbdbd0, 0x9abc8bd5, 0x9e7d9662, 0x933eb0bb, 0x97ffad0c, 0xafb010b1,
+       0xab710d06, 0xa6322bdf, 0xa2f33668, 0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4 };
+
+   /** The running CRC register. */
+   int globalCrc;
+
+   public CRC() {
+     initialiseCRC();
+   }
+
+   /** Resets the register to its all-ones start value. */
+   void initialiseCRC() {
+     globalCrc = 0xffffffff;
+   }
+
+   /** Returns the CRC of everything fed in since the last reset. */
+   int getFinalCRC() {
+     return ~globalCrc;
+   }
+
+   /**
+    * Feeds one byte into the CRC.
+    *
+    * @param inCh the byte value; only its low 8 bits are used
+    */
+   void updateCRC(int inCh) {
+     // Unsigned shift plus mask yields the table index in 0..255 directly,
+     // instead of a signed shift followed by a negative-value patch-up.
+     final int index = ((globalCrc >>> 24) ^ inCh) & 0xff;
+     globalCrc = (globalCrc << 8) ^ CRC32_TABLE[index];
+   }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/text/LineParser.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/text/LineParser.java b/crunch-core/src/main/java/org/apache/crunch/io/text/LineParser.java
new file mode 100644
index 0000000..9438014
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/text/LineParser.java
@@ -0,0 +1,125 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.text;
+
+import java.util.Iterator;
+import java.util.List;
+import java.util.StringTokenizer;
+
+import org.apache.crunch.MapFn;
+import org.apache.crunch.Pair;
+import org.apache.crunch.fn.CompositeMapFn;
+import org.apache.crunch.fn.IdentityFn;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+
+import com.google.common.base.Splitter;
+import com.google.common.collect.ImmutableList;
+
+/**
+ * An abstraction for parsing the lines of a text file using a {@code PType<T>} to
+ * convert the lines of text into a given data type.
+ *
+ * @param <T> The type returned by the text parsing
+ */
+abstract class LineParser<T> {
+
+ public static <S> LineParser<S> forType(PType<S> ptype) {
+ return new SimpleLineParser<S>(ptype);
+ }
+
+ public static <K, V> LineParser<Pair<K, V>> forTableType(PTableType<K, V> ptt, String sep) {
+ return new KeyValueLineParser<K, V>(ptt, sep);
+ }
+
+ private MapFn<String, T> mapFn;
+
+ public void initialize() {
+ mapFn = getMapFn();
+ mapFn.initialize();
+ }
+
+ public T parse(String line) {
+ return mapFn.map(line);
+ }
+
+ protected abstract MapFn<String, T> getMapFn();
+
+ private static <T> MapFn<String, T> getMapFnForPType(PType<T> ptype) {
+ MapFn ret = null;
+ if (String.class.equals(ptype.getTypeClass())) {
+ ret = (MapFn) IdentityFn.getInstance();
+ } else {
+ // Check for a composite MapFn for the PType.
+ // Note that this won't work for Avro-- need to solve that.
+ ret = ptype.getInputMapFn();
+ if (ret instanceof CompositeMapFn) {
+ ret = ((CompositeMapFn) ret).getSecond();
+ }
+ }
+ return ret;
+ }
+
+ private static class SimpleLineParser<S> extends LineParser<S> {
+
+ private final PType<S> ptype;
+
+ public SimpleLineParser(PType<S> ptype) {
+ this.ptype = ptype;
+ }
+
+ @Override
+ protected MapFn<String, S> getMapFn() {
+ return getMapFnForPType(ptype);
+ }
+ }
+
+ private static class KeyValueLineParser<K, V> extends LineParser<Pair<K, V>> {
+
+ private final PTableType<K, V> ptt;
+ private final String sep;
+
+ public KeyValueLineParser(PTableType<K, V> ptt, String sep) {
+ this.ptt = ptt;
+ this.sep = sep;
+ }
+
+ @Override
+ protected MapFn<String, Pair<K, V>> getMapFn() {
+ final MapFn<String, K> keyMapFn = getMapFnForPType(ptt.getKeyType());
+ final MapFn<String, V> valueMapFn = getMapFnForPType(ptt.getValueType());
+
+ return new MapFn<String, Pair<K, V>>() {
+ @Override
+ public void initialize() {
+ keyMapFn.initialize();
+ valueMapFn.initialize();
+ }
+
+ @Override
+ public Pair<K, V> map(String input) {
+ List<String> kv = ImmutableList.copyOf(Splitter.on(sep).limit(1).split(input));
+ if (kv.size() != 2) {
+ throw new RuntimeException("Invalid input string: " + input);
+ }
+ return Pair.of(keyMapFn.map(kv.get(0)), valueMapFn.map(kv.get(1)));
+ }
+ };
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/text/NLineFileSource.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/text/NLineFileSource.java b/crunch-core/src/main/java/org/apache/crunch/io/text/NLineFileSource.java
new file mode 100644
index 0000000..40e2dbd
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/text/NLineFileSource.java
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.text;
+
+import java.io.IOException;
+
+import org.apache.crunch.io.CompositePathIterable;
+import org.apache.crunch.io.FormatBundle;
+import org.apache.crunch.io.ReadableSource;
+import org.apache.crunch.io.impl.FileSourceImpl;
+import org.apache.crunch.types.PType;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
+
+/**
+ * A {@code Source} instance that uses the {@code NLineInputFormat}, which gives each map
+ * task a fraction of the lines in a text file as input. Most useful when running simulations
+ * on Hadoop, where each line represents configuration information about each simulation
+ * run.
+ */
+public class NLineFileSource<T> extends FileSourceImpl<T> implements ReadableSource<T> {
+
+ private static FormatBundle getBundle(int linesPerTask) {
+ FormatBundle bundle = FormatBundle.forInput(NLineInputFormat.class);
+ bundle.set(NLineInputFormat.LINES_PER_MAP, String.valueOf(linesPerTask));
+ return bundle;
+ }
+
+ /**
+ * Create a new {@code NLineFileSource} instance.
+ *
+ * @param path The path to the input data, as a String
+ * @param ptype The PType to use for processing the data
+ * @param linesPerTask The number of lines from the input each map task will process
+ */
+ public NLineFileSource(String path, PType<T> ptype, int linesPerTask) {
+ this(new Path(path), ptype, linesPerTask);
+ }
+
+ /**
+ * Create a new {@code NLineFileSource} instance.
+ *
+ * @param path The {@code Path} to the input data
+ * @param ptype The PType to use for processing the data
+ * @param linesPerTask The number of lines from the input each map task will process
+ */
+ public NLineFileSource(Path path, PType<T> ptype, int linesPerTask) {
+ super(path, ptype, getBundle(linesPerTask));
+ }
+
+ @Override
+ public String toString() {
+ return "NLine(" + path + ")";
+ }
+
+ @Override
+ public Iterable<T> read(Configuration conf) throws IOException {
+ return CompositePathIterable.create(path.getFileSystem(conf), path,
+ new TextFileReaderFactory<T>(LineParser.forType(ptype)));
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileReaderFactory.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileReaderFactory.java b/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileReaderFactory.java
new file mode 100644
index 0000000..e1fea6e
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileReaderFactory.java
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.text;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.Iterator;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.crunch.io.FileReaderFactory;
+import org.apache.crunch.io.impl.AutoClosingIterator;
+import org.apache.crunch.types.PType;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+import com.google.common.collect.Iterators;
+import com.google.common.collect.UnmodifiableIterator;
+
+/**
+ * A {@code FileReaderFactory} that reads a text file line by line and converts every
+ * line into a value of type {@code T} with a {@link LineParser}.
+ */
+public class TextFileReaderFactory<T> implements FileReaderFactory<T> {
+
+  private static final Log LOG = LogFactory.getLog(TextFileReaderFactory.class);
+
+  private final LineParser<T> parser;
+
+  /**
+   * Creates a factory whose line parser is derived from the given PType.
+   *
+   * @param ptype The PType describing the values to produce
+   */
+  public TextFileReaderFactory(PType<T> ptype) {
+    this(LineParser.forType(ptype));
+  }
+
+  /**
+   * Creates a factory that uses the given parser for every line.
+   *
+   * @param parser The parser applied to each line of text
+   */
+  public TextFileReaderFactory(LineParser<T> parser) {
+    this.parser = parser;
+  }
+
+  /**
+   * Opens {@code path} on {@code fs} and returns an iterator over its parsed lines.
+   * If the file cannot be opened, the failure is logged and an empty iterator is
+   * returned. If a read fails part-way through, the failure is logged and the
+   * iteration ends at that point.
+   *
+   * @param fs The file system that holds the file
+   * @param path The file to read
+   * @return an iterator with one parsed value per line of the file
+   */
+  @Override
+  public Iterator<T> read(FileSystem fs, Path path) {
+    parser.initialize();
+
+    FSDataInputStream is;
+    try {
+      is = fs.open(path);
+    } catch (IOException e) {
+      LOG.info("Could not read path: " + path, e);
+      return Iterators.emptyIterator();
+    }
+
+    // Decode explicitly as UTF-8 instead of the platform default charset, so the
+    // result does not depend on the JVM's locale settings.
+    final BufferedReader reader = new BufferedReader(
+        new InputStreamReader(is, java.nio.charset.Charset.forName("UTF-8")));
+    // NOTE(review): AutoClosingIterator presumably closes the reader once the
+    // wrapped iterator is exhausted -- confirm against its implementation.
+    return new AutoClosingIterator<T>(reader, new UnmodifiableIterator<T>() {
+      // True while nextLine holds a look-ahead result that next() has not consumed.
+      private boolean lookedAhead;
+      private String nextLine;
+
+      @Override
+      public boolean hasNext() {
+        // Read at most one line per consumed element, so that repeated hasNext()
+        // calls do not silently skip records.
+        if (!lookedAhead) {
+          try {
+            nextLine = reader.readLine();
+          } catch (IOException e) {
+            LOG.info("Exception reading text file stream", e);
+            nextLine = null;
+          }
+          lookedAhead = true;
+        }
+        return nextLine != null;
+      }
+
+      @Override
+      public T next() {
+        // Also covers callers that invoke next() without calling hasNext() first.
+        if (!hasNext()) {
+          throw new java.util.NoSuchElementException();
+        }
+        lookedAhead = false;
+        return parser.parse(nextLine);
+      }
+    });
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileSource.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileSource.java b/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileSource.java
new file mode 100644
index 0000000..026fca9
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileSource.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.text;
+
+import java.io.IOException;
+
+import org.apache.crunch.io.CompositePathIterable;
+import org.apache.crunch.io.ReadableSource;
+import org.apache.crunch.io.impl.FileSourceImpl;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.avro.AvroTypeFamily;
+import org.apache.crunch.types.avro.AvroUtf8InputFormat;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+
+/**
+ * A readable file source for text data. The input format is chosen from the PType's
+ * type family and from the file name suffix of the path.
+ */
+public class TextFileSource<T> extends FileSourceImpl<T> implements ReadableSource<T> {
+
+  /** Reports whether the string form of the path ends in {@code .bz} or {@code .bz2}. */
+  private static boolean isBZip2(Path path) {
+    String location = path.toString();
+    if (location.endsWith(".bz")) {
+      return true;
+    }
+    return location.endsWith(".bz2");
+  }
+
+  /**
+   * Picks the input format: the Avro UTF-8 format for Avro-family PTypes, otherwise the
+   * BZip2 text format for bzip2-suffixed paths, otherwise the plain text format.
+   */
+  private static <S> Class<? extends FileInputFormat<?, ?>> getInputFormat(Path path, PType<S> ptype) {
+    if (ptype.getFamily().equals(AvroTypeFamily.getInstance())) {
+      return AvroUtf8InputFormat.class;
+    }
+    if (isBZip2(path)) {
+      return BZip2TextInputFormat.class;
+    }
+    return TextInputFormat.class;
+  }
+
+  public TextFileSource(Path path, PType<T> ptype) {
+    super(path, ptype, getInputFormat(path, ptype));
+  }
+
+  /**
+   * Returns the size reported by the superclass, multiplied by ten when the path has a
+   * bzip2 suffix.
+   */
+  @Override
+  public long getSize(Configuration conf) {
+    long reportedSize = super.getSize(conf);
+    // Arbitrary compression factor
+    return isBZip2(path) ? reportedSize * 10 : reportedSize;
+  }
+
+  /** Renders this source as {@code Text(<path>)}. */
+  @Override
+  public String toString() {
+    return new StringBuilder("Text(").append(path).append(")").toString();
+  }
+
+  /**
+   * Reads the data under this source's path directly through the path's file system,
+   * parsing each line of text with a {@code LineParser} derived from this source's PType.
+   */
+  @Override
+  public Iterable<T> read(Configuration conf) throws IOException {
+    // The PType constructor of TextFileReaderFactory delegates to LineParser.forType(ptype).
+    return CompositePathIterable.create(
+        path.getFileSystem(conf),
+        path,
+        new TextFileReaderFactory<T>(ptype));
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileSourceTarget.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileSourceTarget.java b/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileSourceTarget.java
new file mode 100644
index 0000000..1d1211e
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileSourceTarget.java
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.text;
+
+import org.apache.crunch.io.FileNamingScheme;
+import org.apache.crunch.io.SequentialFileNamingScheme;
+import org.apache.crunch.io.impl.ReadableSourcePathTargetImpl;
+import org.apache.crunch.types.PType;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * Pairs a {@link TextFileSource} and a {@link TextFileTarget} that share one path, so
+ * the same location can be used both as pipeline input and as pipeline output.
+ */
+public class TextFileSourceTarget<T> extends ReadableSourcePathTargetImpl<T> {
+
+  /**
+   * Main constructor; the other constructors delegate here.
+   *
+   * @param path The location shared by the source and the target
+   * @param ptype The PType of the values read from the source
+   * @param fileNamingScheme The naming scheme handed to the superclass
+   */
+  public TextFileSourceTarget(Path path, PType<T> ptype, FileNamingScheme fileNamingScheme) {
+    super(
+        new TextFileSource<T>(path, ptype),
+        new TextFileTarget(path),
+        fileNamingScheme);
+  }
+
+  /** Same as the main constructor, using a new {@code SequentialFileNamingScheme}. */
+  public TextFileSourceTarget(Path path, PType<T> ptype) {
+    this(path, ptype, new SequentialFileNamingScheme());
+  }
+
+  /** Same as the {@code Path} variant, with the location given as a String. */
+  public TextFileSourceTarget(String path, PType<T> ptype) {
+    this(new Path(path), ptype);
+  }
+
+  /** Uses the string form of the wrapped target. */
+  @Override
+  public String toString() {
+    return target.toString();
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileTableSource.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileTableSource.java b/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileTableSource.java
new file mode 100644
index 0000000..94fc5fd
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileTableSource.java
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.text;
+
+import java.io.IOException;
+
+import org.apache.crunch.Pair;
+import org.apache.crunch.io.CompositePathIterable;
+import org.apache.crunch.io.FormatBundle;
+import org.apache.crunch.io.ReadableSource;
+import org.apache.crunch.io.impl.FileTableSourceImpl;
+import org.apache.crunch.types.PTableType;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
+
+/**
+ * A table {@code Source} backed by the {@code KeyValueTextInputFormat}. Each line of
+ * input text is split into a key and a value at a separator; when no separator is
+ * given, a tab character is used.
+ */
+public class TextFileTableSource<K, V> extends FileTableSourceImpl<K, V>
+    implements ReadableSource<Pair<K, V>> {
+
+  // CRUNCH-125: the separator is written under both the old and the new
+  // KeyValueTextInputFormat configuration keys so that either version picks it up.
+  private static final String OLD_KV_SEP = "key.value.separator.in.input.line";
+  private static final String NEW_KV_SEP = "mapreduce.input.keyvaluelinerecordreader.key.value.separator";
+
+  private final String separator;
+
+  /** Builds the input bundle, registering {@code sep} under both separator keys. */
+  private static FormatBundle getBundle(String sep) {
+    FormatBundle kvBundle = FormatBundle.forInput(KeyValueTextInputFormat.class);
+    kvBundle.set(OLD_KV_SEP, sep);
+    kvBundle.set(NEW_KV_SEP, sep);
+    return kvBundle;
+  }
+
+  /** Tab-separated source, with the location given as a String. */
+  public TextFileTableSource(String path, PTableType<K, V> tableType) {
+    this(new Path(path), tableType);
+  }
+
+  /** Tab-separated source at the given {@code Path}. */
+  public TextFileTableSource(Path path, PTableType<K, V> tableType) {
+    this(path, tableType, "\t");
+  }
+
+  /** Source with an explicit separator, with the location given as a String. */
+  public TextFileTableSource(String path, PTableType<K, V> tableType, String separator) {
+    this(new Path(path), tableType, separator);
+  }
+
+  /**
+   * Main constructor; the other constructors delegate here.
+   *
+   * @param path The {@code Path} to the input data
+   * @param tableType The table type of the key/value pairs to produce
+   * @param separator The text separating the key from the value on each line
+   */
+  public TextFileTableSource(Path path, PTableType<K, V> tableType, String separator) {
+    super(path, tableType, getBundle(separator));
+    this.separator = separator;
+  }
+
+  /** Renders this source as {@code KeyValueText(<path>)}. */
+  @Override
+  public String toString() {
+    return new StringBuilder("KeyValueText(").append(path).append(")").toString();
+  }
+
+  /**
+   * Reads the data under this source's path directly through the path's file system,
+   * parsing each line with a table-type {@code LineParser} built from this source's
+   * table type and separator.
+   */
+  @Override
+  public Iterable<Pair<K, V>> read(Configuration conf) throws IOException {
+    return CompositePathIterable.create(
+        path.getFileSystem(conf),
+        path,
+        new TextFileReaderFactory<Pair<K, V>>(
+            LineParser.forTableType(getTableType(), separator)));
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileTableSourceTarget.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileTableSourceTarget.java b/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileTableSourceTarget.java
new file mode 100644
index 0000000..dec97e5
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/text/TextFileTableSourceTarget.java
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io.text;
+
+import org.apache.crunch.Pair;
+import org.apache.crunch.TableSourceTarget;
+import org.apache.crunch.io.FileNamingScheme;
+import org.apache.crunch.io.SequentialFileNamingScheme;
+import org.apache.crunch.io.impl.ReadableSourcePathTargetImpl;
+import org.apache.crunch.types.PTableType;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * A {@code TableSource} and {@code SourceTarget} implementation that uses the
+ * {@code KeyValueTextInputFormat} and {@code TextOutputFormat} to support reading
+ * and writing text files as {@code PTable} instances using a tab separator for
+ * the keys and the values.
+ */
+public class TextFileTableSourceTarget<K, V> extends ReadableSourcePathTargetImpl<Pair<K, V>> implements
+ TableSourceTarget<K, V> {
+
+ private final PTableType<K, V> tableType;
+
+ public TextFileTableSourceTarget(String path, PTableType<K, V> tableType) {
+ this(new Path(path), tableType);
+ }
+
+ public TextFileTableSourceTarget(Path path, PTableType<K, V> tableType) {
+ this(path, tableType, new SequentialFileNamingScheme());
+ }
+
+ public TextFileTableSourceTarget(Path path, PTableType<K, V> tableType,
+ FileNamingScheme fileNamingScheme) {
+ super(new TextFileTableSource<K, V>(path, tableType), new TextFileTarget(path),
+ fileNamingScheme);
+ this.tableType = tableType;
+ }
+
+ @Override
+ public PTableType<K, V> getTableType() {
+ return tableType;
+ }
+
+ @Override
+ public String toString() {
+ return target.toString();
+ }
+}
[08/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/io/text/TextFileTarget.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/io/text/TextFileTarget.java b/crunch/src/main/java/org/apache/crunch/io/text/TextFileTarget.java
deleted file mode 100644
index 0c3e6a4..0000000
--- a/crunch/src/main/java/org/apache/crunch/io/text/TextFileTarget.java
+++ /dev/null
@@ -1,109 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.io.text;
-
-import org.apache.avro.Schema;
-import org.apache.crunch.SourceTarget;
-import org.apache.crunch.io.FileNamingScheme;
-import org.apache.crunch.io.SequentialFileNamingScheme;
-import org.apache.crunch.io.impl.FileTargetImpl;
-import org.apache.crunch.types.Converter;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.avro.AvroTextOutputFormat;
-import org.apache.crunch.types.avro.AvroType;
-import org.apache.crunch.types.avro.AvroTypeFamily;
-import org.apache.crunch.types.writable.WritableType;
-import org.apache.crunch.types.writable.WritableTypeFamily;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
-import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
-
-public class TextFileTarget extends FileTargetImpl {
- private static Class<? extends FileOutputFormat> getOutputFormat(PType<?> ptype) {
- if (ptype.getFamily().equals(AvroTypeFamily.getInstance())) {
- return AvroTextOutputFormat.class;
- } else {
- return TextOutputFormat.class;
- }
- }
-
- public <T> TextFileTarget(String path) {
- this(new Path(path));
- }
-
- public <T> TextFileTarget(Path path) {
- this(path, new SequentialFileNamingScheme());
- }
-
- public <T> TextFileTarget(Path path, FileNamingScheme fileNamingScheme) {
- super(path, null, fileNamingScheme);
- }
-
- @Override
- public Path getPath() {
- return path;
- }
-
- @Override
- public String toString() {
- return "Text(" + path + ")";
- }
-
- @Override
- public void configureForMapReduce(Job job, PType<?> ptype, Path outputPath, String name) {
- Converter converter = ptype.getConverter();
- Class keyClass = converter.getKeyClass();
- Class valueClass = converter.getValueClass();
- configureForMapReduce(job, keyClass, valueClass, getOutputFormat(ptype), outputPath, name);
- }
-
- @Override
- public <T> SourceTarget<T> asSourceTarget(PType<T> ptype) {
- if (!isTextCompatible(ptype)) {
- return null;
- }
- if (ptype instanceof PTableType) {
- return new TextFileTableSourceTarget(path, (PTableType) ptype);
- }
- return new TextFileSourceTarget<T>(path, ptype);
- }
-
- private <T> boolean isTextCompatible(PType<T> ptype) {
- if (AvroTypeFamily.getInstance().equals(ptype.getFamily())) {
- AvroType<T> at = (AvroType<T>) ptype;
- if (at.getSchema().equals(Schema.create(Schema.Type.STRING))) {
- return true;
- }
- } else if (WritableTypeFamily.getInstance().equals(ptype.getFamily())) {
- if (ptype instanceof PTableType) {
- PTableType ptt = (PTableType) ptype;
- return isText(ptt.getKeyType()) && isText(ptt.getValueType());
- } else {
- return isText(ptype);
- }
- }
- return false;
- }
-
- private <T> boolean isText(PType<T> wtype) {
- return Text.class.equals(((WritableType) wtype).getSerializationClass());
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/lib/Aggregate.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/lib/Aggregate.java b/crunch/src/main/java/org/apache/crunch/lib/Aggregate.java
deleted file mode 100644
index d4109cc..0000000
--- a/crunch/src/main/java/org/apache/crunch/lib/Aggregate.java
+++ /dev/null
@@ -1,272 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib;
-
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.List;
-import java.util.PriorityQueue;
-
-import org.apache.crunch.CombineFn;
-import org.apache.crunch.DoFn;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.GroupingOptions;
-import org.apache.crunch.MapFn;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PObject;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.fn.Aggregators;
-import org.apache.crunch.fn.MapValuesFn;
-import org.apache.crunch.materialize.pobject.FirstElementPObject;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.PTypeFamily;
-
-import com.google.common.collect.Lists;
-
-/**
- * Methods for performing various types of aggregations over {@link PCollection} instances.
- *
- */
-public class Aggregate {
-
- /**
- * Returns a {@code PTable} that contains the unique elements of this collection mapped to a count
- * of their occurrences.
- */
- public static <S> PTable<S, Long> count(PCollection<S> collect) {
- PTypeFamily tf = collect.getTypeFamily();
- return collect.parallelDo("Aggregate.count", new MapFn<S, Pair<S, Long>>() {
- public Pair<S, Long> map(S input) {
- return Pair.of(input, 1L);
- }
- }, tf.tableOf(collect.getPType(), tf.longs())).groupByKey()
- .combineValues(Aggregators.SUM_LONGS());
- }
-
- /**
- * Returns the number of elements in the provided PCollection.
- *
- * @param collect The PCollection whose elements should be counted.
- * @param <S> The type of the PCollection.
- * @return A {@code PObject} containing the number of elements in the {@code PCollection}.
- */
- public static <S> PObject<Long> length(PCollection<S> collect) {
- PTypeFamily tf = collect.getTypeFamily();
- PTable<Integer, Long> countTable = collect
- .parallelDo("Aggregate.count", new MapFn<S, Pair<Integer, Long>>() {
- public Pair<Integer, Long> map(S input) {
- return Pair.of(1, 1L);
- }
- }, tf.tableOf(tf.ints(), tf.longs()))
- .groupByKey(GroupingOptions.builder().numReducers(1).build())
- .combineValues(Aggregators.SUM_LONGS());
- PCollection<Long> count = countTable.values();
- return new FirstElementPObject<Long>(count);
- }
-
- public static class PairValueComparator<K, V> implements Comparator<Pair<K, V>> {
- private final boolean ascending;
-
- public PairValueComparator(boolean ascending) {
- this.ascending = ascending;
- }
-
- @Override
- public int compare(Pair<K, V> left, Pair<K, V> right) {
- int cmp = ((Comparable<V>) left.second()).compareTo(right.second());
- return ascending ? cmp : -cmp;
- }
- }
-
- public static class TopKFn<K, V> extends DoFn<Pair<K, V>, Pair<Integer, Pair<K, V>>> {
-
- private final int limit;
- private final boolean maximize;
- private transient PriorityQueue<Pair<K, V>> values;
-
- public TopKFn(int limit, boolean ascending) {
- this.limit = limit;
- this.maximize = ascending;
- }
-
- public void initialize() {
- this.values = new PriorityQueue<Pair<K, V>>(limit, new PairValueComparator<K, V>(maximize));
- }
-
- public void process(Pair<K, V> input, Emitter<Pair<Integer, Pair<K, V>>> emitter) {
- values.add(input);
- if (values.size() > limit) {
- values.poll();
- }
- }
-
- public void cleanup(Emitter<Pair<Integer, Pair<K, V>>> emitter) {
- for (Pair<K, V> p : values) {
- emitter.emit(Pair.of(0, p));
- }
- }
- }
-
- public static class TopKCombineFn<K, V> extends CombineFn<Integer, Pair<K, V>> {
-
- private final int limit;
- private final boolean maximize;
-
- public TopKCombineFn(int limit, boolean maximize) {
- this.limit = limit;
- this.maximize = maximize;
- }
-
- @Override
- public void process(Pair<Integer, Iterable<Pair<K, V>>> input,
- Emitter<Pair<Integer, Pair<K, V>>> emitter) {
- Comparator<Pair<K, V>> cmp = new PairValueComparator<K, V>(maximize);
- PriorityQueue<Pair<K, V>> queue = new PriorityQueue<Pair<K, V>>(limit, cmp);
- for (Pair<K, V> pair : input.second()) {
- queue.add(pair);
- if (queue.size() > limit) {
- queue.poll();
- }
- }
-
- List<Pair<K, V>> values = Lists.newArrayList(queue);
- Collections.sort(values, cmp);
- for (int i = values.size() - 1; i >= 0; i--) {
- emitter.emit(Pair.of(0, values.get(i)));
- }
- }
- }
-
- public static <K, V> PTable<K, V> top(PTable<K, V> ptable, int limit, boolean maximize) {
- PTypeFamily ptf = ptable.getTypeFamily();
- PTableType<K, V> base = ptable.getPTableType();
- PType<Pair<K, V>> pairType = ptf.pairs(base.getKeyType(), base.getValueType());
- PTableType<Integer, Pair<K, V>> inter = ptf.tableOf(ptf.ints(), pairType);
- return ptable.parallelDo("top" + limit + "map", new TopKFn<K, V>(limit, maximize), inter)
- .groupByKey(1).combineValues(new TopKCombineFn<K, V>(limit, maximize))
- .parallelDo("top" + limit + "reduce", new DoFn<Pair<Integer, Pair<K, V>>, Pair<K, V>>() {
- public void process(Pair<Integer, Pair<K, V>> input, Emitter<Pair<K, V>> emitter) {
- emitter.emit(input.second());
- }
- }, base);
- }
-
- /**
- * Returns the largest numerical element from the input collection.
- */
- public static <S> PObject<S> max(PCollection<S> collect) {
- Class<S> clazz = collect.getPType().getTypeClass();
- if (!clazz.isPrimitive() && !Comparable.class.isAssignableFrom(clazz)) {
- throw new IllegalArgumentException("Can only get max for Comparable elements, not for: "
- + collect.getPType().getTypeClass());
- }
- PTypeFamily tf = collect.getTypeFamily();
- PCollection<S> maxCollect = PTables.values(collect
- .parallelDo("max", new DoFn<S, Pair<Boolean, S>>() {
- private transient S max = null;
-
- public void process(S input, Emitter<Pair<Boolean, S>> emitter) {
- if (max == null || ((Comparable<S>) max).compareTo(input) < 0) {
- max = input;
- }
- }
-
- public void cleanup(Emitter<Pair<Boolean, S>> emitter) {
- if (max != null) {
- emitter.emit(Pair.of(true, max));
- }
- }
- }, tf.tableOf(tf.booleans(), collect.getPType())).groupByKey(1)
- .combineValues(new CombineFn<Boolean, S>() {
- public void process(Pair<Boolean, Iterable<S>> input, Emitter<Pair<Boolean, S>> emitter) {
- S max = null;
- for (S v : input.second()) {
- if (max == null || ((Comparable<S>) max).compareTo(v) < 0) {
- max = v;
- }
- }
- emitter.emit(Pair.of(input.first(), max));
- }
- }));
- return new FirstElementPObject<S>(maxCollect);
- }
-
- /**
- * Returns the smallest numerical element from the input collection.
- */
- public static <S> PObject<S> min(PCollection<S> collect) {
- Class<S> clazz = collect.getPType().getTypeClass();
- if (!clazz.isPrimitive() && !Comparable.class.isAssignableFrom(clazz)) {
- throw new IllegalArgumentException("Can only get min for Comparable elements, not for: "
- + collect.getPType().getTypeClass());
- }
- PTypeFamily tf = collect.getTypeFamily();
- PCollection<S> minCollect = PTables.values(collect
- .parallelDo("min", new DoFn<S, Pair<Boolean, S>>() {
- private transient S min = null;
-
- public void process(S input, Emitter<Pair<Boolean, S>> emitter) {
- if (min == null || ((Comparable<S>) min).compareTo(input) > 0) {
- min = input;
- }
- }
-
- public void cleanup(Emitter<Pair<Boolean, S>> emitter) {
- if (min != null) {
- emitter.emit(Pair.of(false, min));
- }
- }
- }, tf.tableOf(tf.booleans(), collect.getPType())).groupByKey(1)
- .combineValues(new CombineFn<Boolean, S>() {
- public void process(Pair<Boolean, Iterable<S>> input, Emitter<Pair<Boolean, S>> emitter) {
- S min = null;
- for (S v : input.second()) {
- if (min == null || ((Comparable<S>) min).compareTo(v) > 0) {
- min = v;
- }
- }
- emitter.emit(Pair.of(input.first(), min));
- }
- }));
- return new FirstElementPObject<S>(minCollect);
- }
-
- public static <K, V> PTable<K, Collection<V>> collectValues(PTable<K, V> collect) {
- PTypeFamily tf = collect.getTypeFamily();
- final PType<V> valueType = collect.getValueType();
- return collect.groupByKey().parallelDo("collect",
- new MapValuesFn<K, Iterable<V>, Collection<V>>() {
-
- @Override
- public void initialize() {
- valueType.initialize(getConfiguration());
- }
-
- public Collection<V> map(Iterable<V> values) {
- List<V> collected = Lists.newArrayList();
- for (V value : values) {
- collected.add(valueType.getDetachedValue(value));
- }
- return collected;
- }
- }, tf.tableOf(collect.getKeyType(), tf.collections(collect.getValueType())));
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/lib/Cartesian.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/lib/Cartesian.java b/crunch/src/main/java/org/apache/crunch/lib/Cartesian.java
deleted file mode 100644
index 08327dd..0000000
--- a/crunch/src/main/java/org/apache/crunch/lib/Cartesian.java
+++ /dev/null
@@ -1,216 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib;
-
-import java.util.Random;
-
-import org.apache.crunch.DoFn;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.MapFn;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PTypeFamily;
-
-/**
- * Utilities for Cartesian products of two {@code PTable} or {@code PCollection}
- * instances.
- */
-@SuppressWarnings("serial")
-public class Cartesian {
-
- /**
- * Helper for building the artificial cross keys. This technique was taken
- * from Pig's CROSS.
- */
- private static class GFCross<V> extends DoFn<V, Pair<Pair<Integer, Integer>, V>> {
-
- private final int constantField;
- private final int parallelism;
- private final Random r;
-
- public GFCross(int constantField, int parallelism) {
- this.constantField = constantField;
- this.parallelism = parallelism;
- this.r = new Random();
- }
-
- public void process(V input, Emitter<Pair<Pair<Integer, Integer>, V>> emitter) {
- int c = r.nextInt(parallelism);
- if (constantField == 0) {
- for (int i = 0; i < parallelism; i++) {
- emitter.emit(Pair.of(Pair.of(c, i), input));
- }
- } else {
- for (int i = 0; i < parallelism; i++) {
- emitter.emit(Pair.of(Pair.of(i, c), input));
- }
- }
- }
- }
-
- static final int DEFAULT_PARALLELISM = 6;
-
- /**
- * Performs a full cross join on the specified {@link PTable}s (using the same
- * strategy as Pig's CROSS operator). Delegates to
- * {@link #cross(PTable, PTable, int)} with {@code DEFAULT_PARALLELISM}.
- *
- * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Cross_join">Cross
- * Join</a>
- * @param left
- * The left-hand PTable of the cross join.
- * @param right
- * The right-hand PTable of the cross join.
- * @param <K1>
- * Type of left PTable's keys.
- * @param <K2>
- * Type of right PTable's keys.
- * @param <U>
- * Type of the first {@link PTable}'s values
- * @param <V>
- * Type of the second {@link PTable}'s values
- * @return The joined result as tuples of ((K1,K2), (U,V)).
- */
- public static <K1, K2, U, V> PTable<Pair<K1, K2>, Pair<U, V>> cross(PTable<K1, U> left, PTable<K2, V> right) {
- return cross(left, right, DEFAULT_PARALLELISM);
- }
-
- /**
- * Performs a full cross join on the specified {@link PTable}s (using the same
- * strategy as Pig's CROSS operator).
- *
- * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Cross_join">Cross
- * Join</a>
- * @param left
- * The left-hand PTable of the cross join.
- * @param right
- * The right-hand PTable of the cross join.
- * @param parallelism
- * The square root of the number of reducers to use. Increasing
- * parallelism also increases copied data.
- * @param <K1>
- * Type of left PTable's keys.
- * @param <K2>
- * Type of right PTable's keys.
- * @param <U>
- * Type of the first {@link PTable}'s values
- * @param <V>
- * Type of the second {@link PTable}'s values
- * @return The joined result as tuples of ((K1,K2), (U,V)).
- */
- public static <K1, K2, U, V> PTable<Pair<K1, K2>, Pair<U, V>> cross(PTable<K1, U> left, PTable<K2, V> right,
- int parallelism) {
-
- /*
- * Emulates the following PigLatin:
- * A = foreach table1 generate flatten(GFCross(0, 2)), flatten(*);
- * B = foreach table2 generate flatten(GFCross(1, 2)), flatten(*);
- * C = cogroup A by ($0, $1), B by ($0, $1);
- * result = foreach C generate flatten(A), flatten(B);
- */
-
- PTypeFamily ltf = left.getTypeFamily();
- PTypeFamily rtf = right.getTypeFamily();
-
- // Tag every (key, value) entry of each side with its artificial cross key.
- PTableType<Pair<Integer, Integer>, Pair<K1, U>> leftCrossType = ltf.tableOf(
- ltf.pairs(ltf.ints(), ltf.ints()), ltf.pairs(left.getKeyType(), left.getValueType()));
- PTable<Pair<Integer, Integer>, Pair<K1, U>> leftCross = left.parallelDo(
- new GFCross<Pair<K1, U>>(0, parallelism), leftCrossType);
-
- PTableType<Pair<Integer, Integer>, Pair<K2, V>> rightCrossType = rtf.tableOf(
- rtf.pairs(rtf.ints(), rtf.ints()), rtf.pairs(right.getKeyType(), right.getValueType()));
- PTable<Pair<Integer, Integer>, Pair<K2, V>> rightCross = right.parallelDo(
- new GFCross<Pair<K2, V>>(1, parallelism), rightCrossType);
-
- PTable<Pair<Integer, Integer>, Pair<Pair<K1, U>, Pair<K2, V>>> cg = leftCross.join(rightCross);
-
- PTypeFamily ctf = cg.getTypeFamily();
-
- // Drops the artificial key and regroups ((k1, u), (k2, v)) as ((k1, k2), (u, v)).
- MapFn<Pair<Pair<Integer, Integer>, Pair<Pair<K1, U>, Pair<K2, V>>>, Pair<Pair<K1, K2>, Pair<U, V>>> regroup =
- new MapFn<Pair<Pair<Integer, Integer>, Pair<Pair<K1, U>, Pair<K2, V>>>, Pair<Pair<K1, K2>, Pair<U, V>>>() {
-
- @Override
- public Pair<Pair<K1, K2>, Pair<U, V>> map(Pair<Pair<Integer, Integer>, Pair<Pair<K1, U>, Pair<K2, V>>> input) {
- Pair<K1, U> leftEntry = input.second().first();
- Pair<K2, V> rightEntry = input.second().second();
- Pair<K1, K2> keys = Pair.of(leftEntry.first(), rightEntry.first());
- Pair<U, V> values = Pair.of(leftEntry.second(), rightEntry.second());
- return Pair.of(keys, values);
- }
- };
-
- PTableType<Pair<K1, K2>, Pair<U, V>> resultType = ctf.tableOf(
- ctf.pairs(left.getKeyType(), right.getKeyType()),
- ctf.pairs(left.getValueType(), right.getValueType()));
-
- return cg.parallelDo(regroup, resultType);
- }
-
- /**
- * Performs a full cross join on the specified {@link PCollection}s (using the
- * same strategy as Pig's CROSS operator). Delegates to
- * {@link #cross(PCollection, PCollection, int)} with
- * {@code DEFAULT_PARALLELISM}.
- *
- * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Cross_join">Cross
- * Join</a>
- * @param left
- * The left-hand PCollection of the cross join.
- * @param right
- * The right-hand PCollection of the cross join.
- * @param <U>
- * Type of the first {@link PCollection}'s values
- * @param <V>
- * Type of the second {@link PCollection}'s values
- * @return The joined result as tuples of (U,V).
- */
- public static <U, V> PCollection<Pair<U, V>> cross(PCollection<U> left, PCollection<V> right) {
- return cross(left, right, DEFAULT_PARALLELISM);
- }
-
- /**
- * Performs a full cross join on the specified {@link PCollection}s (using the
- * same strategy as Pig's CROSS operator).
- *
- * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Cross_join">Cross
- * Join</a>
- * @param left
- * The left-hand PCollection of the cross join.
- * @param right
- * The right-hand PCollection of the cross join.
- * @param parallelism
- * The square root of the number of reducers to use. Increasing
- * parallelism also increases copied data.
- * @param <U>
- * Type of the first {@link PCollection}'s values
- * @param <V>
- * Type of the second {@link PCollection}'s values
- * @return The joined result as tuples of (U,V).
- */
- public static <U, V> PCollection<Pair<U, V>> cross(PCollection<U> left, PCollection<V> right, int parallelism) {
-
- PTypeFamily ltf = left.getTypeFamily();
- PTypeFamily rtf = right.getTypeFamily();
-
- // Build each side's table type exactly once and hand it to parallelDo; the
- // previous version built the left type twice and guarded an unused copy
- // with a raw java.lang.Error.
- PTableType<Pair<Integer, Integer>, U> leftCrossType = ltf.tableOf(ltf.pairs(ltf.ints(), ltf.ints()),
- left.getPType());
- PTableType<Pair<Integer, Integer>, V> rightCrossType = rtf.tableOf(rtf.pairs(rtf.ints(), rtf.ints()),
- right.getPType());
-
- // Tag every element of each side with its artificial cross key.
- PTable<Pair<Integer, Integer>, U> leftCross = left.parallelDo(new GFCross<U>(0, parallelism), leftCrossType);
- PTable<Pair<Integer, Integer>, V> rightCross = right.parallelDo(new GFCross<V>(1, parallelism), rightCrossType);
-
- PTable<Pair<Integer, Integer>, Pair<U, V>> cg = leftCross.join(rightCross);
-
- PTypeFamily ctf = cg.getTypeFamily();
-
- // Discard the artificial key; only the joined (U, V) pair is returned.
- return cg.parallelDo(new MapFn<Pair<Pair<Integer, Integer>, Pair<U, V>>, Pair<U, V>>() {
- @Override
- public Pair<U, V> map(Pair<Pair<Integer, Integer>, Pair<U, V>> input) {
- return input.second();
- }
- }, ctf.pairs(left.getPType(), right.getPType()));
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/lib/Cogroup.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/lib/Cogroup.java b/crunch/src/main/java/org/apache/crunch/lib/Cogroup.java
deleted file mode 100644
index 07d873c..0000000
--- a/crunch/src/main/java/org/apache/crunch/lib/Cogroup.java
+++ /dev/null
@@ -1,106 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib;
-
-import java.util.Collection;
-
-import org.apache.crunch.DoFn;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.fn.MapValuesFn;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.PTypeFamily;
-
-import com.google.common.collect.Lists;
-
-public class Cogroup {
-
- /**
- * Co-groups the two {@link PTable} arguments: values from both tables are
- * tagged, unioned, grouped by key, and then split back into one collection
- * per side.
- *
- * @param left
- * The table whose values fill the first collection of each result pair.
- * @param right
- * The table whose values fill the second collection of each result pair.
- * @return a {@code PTable} representing the co-grouped tables.
- */
- public static <K, U, V> PTable<K, Pair<Collection<U>, Collection<V>>> cogroup(PTable<K, U> left, PTable<K, V> right) {
- PTypeFamily family = left.getTypeFamily();
- PType<K> kType = left.getPTableType().getKeyType();
- PType<U> uType = left.getPTableType().getValueType();
- PType<V> vType = right.getPTableType().getValueType();
- PType<Pair<U, V>> taggedType = family.pairs(uType, vType);
-
- // Each side becomes a Pair with only its own slot populated.
- PTable<K, Pair<U, V>> taggedLeft = left.parallelDo("coGroupTag1", new CogroupFn1<K, U, V>(),
- family.tableOf(kType, taggedType));
- PTable<K, Pair<U, V>> taggedRight = right.parallelDo("coGroupTag2", new CogroupFn2<K, U, V>(),
- family.tableOf(kType, taggedType));
-
- PTable<K, Pair<U, V>> merged = taggedLeft.union(taggedRight);
-
- PType<Pair<Collection<U>, Collection<V>>> groupedType = family.pairs(family.collections(uType),
- family.collections(vType));
- return merged.groupByKey().parallelDo("cogroup",
- new PostGroupFn<K, U, V>(uType, vType), family.tableOf(kType, groupedType));
- }
-
- /**
- * Tags a value as coming from the first table: it is placed in the first slot
- * of the pair and the second slot is left {@code null}.
- */
- private static class CogroupFn1<K, V, U> extends MapValuesFn<K, V, Pair<V, U>> {
- @Override
- public Pair<V, U> map(V v) {
- return Pair.of(v, null);
- }
- }
-
- /**
- * Tags a value as coming from the second table: it is placed in the second
- * slot of the pair and the first slot is left {@code null}.
- */
- private static class CogroupFn2<K, V, U> extends MapValuesFn<K, U, Pair<V, U>> {
- @Override
- public Pair<V, U> map(U u) {
- return Pair.of(null, u);
- }
- }
-
- /**
- * Splits the grouped, tagged pairs of one key back into two collections: one
- * holding the non-null first elements and one holding the second elements of
- * pairs whose first element is {@code null}. Values are detached through
- * their {@link PType} before being stored.
- */
- private static class PostGroupFn<K, V, U> extends
- DoFn<Pair<K, Iterable<Pair<V, U>>>, Pair<K, Pair<Collection<V>, Collection<U>>>> {
-
- private PType<V> ptypeV;
- private PType<U> ptypeU;
-
- public PostGroupFn(PType<V> ptypeV, PType<U> ptypeU) {
- this.ptypeV = ptypeV;
- this.ptypeU = ptypeU;
- }
-
- @Override
- public void initialize() {
- super.initialize();
- ptypeV.initialize(getConfiguration());
- ptypeU.initialize(getConfiguration());
- }
-
- @Override
- public void process(Pair<K, Iterable<Pair<V, U>>> input,
- Emitter<Pair<K, Pair<Collection<V>, Collection<U>>>> emitter) {
- Collection<V> firsts = Lists.newArrayList();
- Collection<U> seconds = Lists.newArrayList();
- for (Pair<V, U> tagged : input.second()) {
- V first = tagged.first();
- if (first != null) {
- firsts.add(ptypeV.getDetachedValue(first));
- continue;
- }
- // The first slot is empty, so only the second slot can carry a value.
- U second = tagged.second();
- if (second != null) {
- seconds.add(ptypeU.getDetachedValue(second));
- }
- }
- Pair<Collection<V>, Collection<U>> grouped = Pair.of(firsts, seconds);
- emitter.emit(Pair.of(input.first(), grouped));
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/lib/Distinct.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/lib/Distinct.java b/crunch/src/main/java/org/apache/crunch/lib/Distinct.java
deleted file mode 100644
index 994830d..0000000
--- a/crunch/src/main/java/org/apache/crunch/lib/Distinct.java
+++ /dev/null
@@ -1,126 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib;
-
-import java.util.Set;
-
-import org.apache.crunch.DoFn;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.PTypeFamily;
-
-import com.google.common.base.Preconditions;
-import com.google.common.collect.Sets;
-
-/**
- * Functions for computing the distinct elements of a {@code PCollection}.
- */
-public final class Distinct {
-
- private static final int DEFAULT_FLUSH_EVERY = 50000;
-
- /**
- * Construct a new {@code PCollection} that contains the unique elements of a
- * given input {@code PCollection}. Delegates to
- * {@link #distinct(PCollection, int)} with {@code DEFAULT_FLUSH_EVERY}.
- *
- * @param input The input {@code PCollection}
- * @return A new {@code PCollection} that contains the unique elements of the input
- */
- public static <S> PCollection<S> distinct(PCollection<S> input) {
- return distinct(input, DEFAULT_FLUSH_EVERY);
- }
-
- /**
- * A {@code PTable<K, V>} analogue of the {@code distinct} function.
- *
- * @param input The input {@code PTable}
- * @return A new {@code PTable} built by {@code PTables.asPTable} from the distinct
- * {@code Pair<K, V>} entries of the input
- */
- public static <K, V> PTable<K, V> distinct(PTable<K, V> input) {
- // The cast selects the PCollection overload instead of recursing into this method.
- return PTables.asPTable(distinct((PCollection<Pair<K, V>>) input));
- }
-
- /**
- * A {@code distinct} operation that gives the client more control over how frequently
- * elements are flushed to disk in order to allow control over performance or
- * memory consumption.
- * <p>
- * The input is first mapped to a table keyed by element (with null values), then
- * grouped by key, and finally reduced to one emitted key per group.
- *
- * @param input The input {@code PCollection}
- * @param flushEvery Flush the elements to disk whenever we encounter this many unique values
- * @return A new {@code PCollection} that contains the unique elements of the input
- * @throws IllegalArgumentException if {@code flushEvery} is not positive
- */
- public static <S> PCollection<S> distinct(PCollection<S> input, int flushEvery) {
- Preconditions.checkArgument(flushEvery > 0);
- PType<S> elementType = input.getPType();
- PTypeFamily family = elementType.getFamily();
- PTable<S, Void> keyedByElement = input.parallelDo("pre-distinct",
- new PreDistinctFn<S>(flushEvery, elementType), family.tableOf(elementType, family.nulls()));
- return keyedByElement.groupByKey().parallelDo("post-distinct", new PostDistinctFn<S>(), elementType);
- }
-
- /**
- * A {@code PTable<K, V>} analogue of the {@code distinct} function.
- *
- * @param input The input {@code PTable}
- * @param flushEvery Passed through unchanged to {@link #distinct(PCollection, int)}
- * @return A new {@code PTable} built by {@code PTables.asPTable} from the distinct
- * {@code Pair<K, V>} entries of the input
- */
- public static <K, V> PTable<K, V> distinct(PTable<K, V> input, int flushEvery) {
- // The cast selects the PCollection overload instead of recursing into this method.
- return PTables.asPTable(distinct((PCollection<Pair<K, V>>) input, flushEvery));
- }
-
- /**
- * Buffers detached copies of the inputs in a set, which drops duplicates seen
- * within the current buffer, and emits each buffered value paired with
- * {@code null}. The buffer is flushed once it holds {@code flushEvery} values
- * and again at cleanup.
- */
- private static class PreDistinctFn<S> extends DoFn<S, Pair<S, Void>> {
- private final Set<S> values = Sets.newHashSet();
- private final int flushEvery;
- private final PType<S> ptype;
-
- public PreDistinctFn(int flushEvery, PType<S> ptype) {
- this.flushEvery = flushEvery;
- this.ptype = ptype;
- }
-
- @Override
- public void initialize() {
- super.initialize();
- ptype.initialize(getConfiguration());
- }
-
- @Override
- public void process(S input, Emitter<Pair<S, Void>> emitter) {
- values.add(ptype.getDetachedValue(input));
- // Flush as soon as flushEvery unique values are buffered; the earlier '>'
- // comparison let the buffer grow to flushEvery + 1 before flushing.
- if (values.size() >= flushEvery) {
- flush(emitter);
- }
- }
-
- @Override
- public void cleanup(Emitter<Pair<S, Void>> emitter) {
- // Emit whatever is still buffered when processing ends.
- flush(emitter);
- }
-
- /** Emits every buffered value with a null payload, then empties the buffer. */
- private void flush(Emitter<Pair<S, Void>> emitter) {
- for (S in : values) {
- emitter.emit(Pair.<S, Void>of(in, null));
- }
- values.clear();
- }
- }
-
- /**
- * Emits the key of each grouped entry once and ignores the grouped
- * {@code Void} values.
- */
- private static class PostDistinctFn<S> extends DoFn<Pair<S, Iterable<Void>>, S> {
- @Override
- public void process(Pair<S, Iterable<Void>> input, Emitter<S> emitter) {
- emitter.emit(input.first());
- }
- }
-
- // Static utility holder: the private constructor prevents instantiation.
- private Distinct() {}
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/lib/Join.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/lib/Join.java b/crunch/src/main/java/org/apache/crunch/lib/Join.java
deleted file mode 100644
index c0c4a6b..0000000
--- a/crunch/src/main/java/org/apache/crunch/lib/Join.java
+++ /dev/null
@@ -1,181 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib;
-
-import org.apache.crunch.GroupingOptions;
-import org.apache.crunch.MapFn;
-import org.apache.crunch.PGroupedTable;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.lib.join.FullOuterJoinFn;
-import org.apache.crunch.lib.join.InnerJoinFn;
-import org.apache.crunch.lib.join.JoinFn;
-import org.apache.crunch.lib.join.JoinUtils;
-import org.apache.crunch.lib.join.LeftOuterJoinFn;
-import org.apache.crunch.lib.join.RightOuterJoinFn;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PTypeFamily;
-
-/**
- * Utilities for joining multiple {@code PTable} instances based on a common
- * key.
- */
-public class Join {
- /**
- * Performs an inner join on the specified {@link PTable}s. This is a
- * convenience alias for {@link #innerJoin(PTable, PTable)}.
- *
- * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Inner_join">Inner
- * Join</a>
- * @param left
- * The left-hand PTable of the inner join.
- * @param right
- * The right-hand PTable of the inner join.
- * @param <K>
- * Type of the keys.
- * @param <U>
- * Type of the first {@link PTable}'s values
- * @param <V>
- * Type of the second {@link PTable}'s values
- * @return The joined result.
- */
- public static <K, U, V> PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right) {
- return innerJoin(left, right);
- }
-
- /**
- * Performs an inner join on the specified {@link PTable}s by running
- * {@link #join(PTable, PTable, JoinFn)} with an {@link InnerJoinFn}.
- *
- * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Inner_join">Inner
- * Join</a>
- * @param left
- * The left-hand PTable of the inner join.
- * @param right
- * The right-hand PTable of the inner join.
- * @param <K>
- * Type of the keys.
- * @param <U>
- * Type of the first {@link PTable}'s values
- * @param <V>
- * Type of the second {@link PTable}'s values
- * @return The joined result.
- */
- public static <K, U, V> PTable<K, Pair<U, V>> innerJoin(PTable<K, U> left, PTable<K, V> right) {
- // The join function is built from the left table's key and value types.
- return join(left, right, new InnerJoinFn<K, U, V>(left.getKeyType(), left.getValueType()));
- }
-
- /**
- * Performs a left outer join on the specified {@link PTable}s by running
- * {@link #join(PTable, PTable, JoinFn)} with a {@link LeftOuterJoinFn}.
- *
- * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Left_outer_join">Left
- * Join</a>
- * @param left
- * The left-hand PTable of the left join. All of this PTable's entries
- * will appear in the resulting PTable.
- * @param right
- * The right-hand PTable of the left join.
- * @param <K>
- * Type of the keys.
- * @param <U>
- * Type of the first {@link PTable}'s values
- * @param <V>
- * Type of the second {@link PTable}'s values
- * @return The joined result.
- */
- public static <K, U, V> PTable<K, Pair<U, V>> leftJoin(PTable<K, U> left, PTable<K, V> right) {
- // The join function is built from the left table's key and value types.
- return join(left, right, new LeftOuterJoinFn<K, U, V>(left.getKeyType(), left.getValueType()));
- }
-
- /**
- * Performs a right outer join on the specified {@link PTable}s by running
- * {@link #join(PTable, PTable, JoinFn)} with a {@link RightOuterJoinFn}.
- *
- * @see <a
- * href="http://en.wikipedia.org/wiki/Join_(SQL)#Right_outer_join">Right
- * Join</a>
- * @param left
- * The left-hand PTable of the right join.
- * @param right
- * The right-hand PTable of the right join. All of this PTable's entries
- * will appear in the resulting PTable.
- * @param <K>
- * Type of the keys.
- * @param <U>
- * Type of the first {@link PTable}'s values
- * @param <V>
- * Type of the second {@link PTable}'s values
- * @return The joined result.
- */
- public static <K, U, V> PTable<K, Pair<U, V>> rightJoin(PTable<K, U> left, PTable<K, V> right) {
- // NOTE(review): the left table's value type is passed here too, as in the other
- // joins; presumably every JoinFn constructor expects the left value type - confirm
- // against RightOuterJoinFn.
- return join(left, right, new RightOuterJoinFn<K, U, V>(left.getKeyType(), left.getValueType()));
- }
-
- /**
- * Performs a full outer join on the specified {@link PTable}s by running
- * {@link #join(PTable, PTable, JoinFn)} with a {@link FullOuterJoinFn}.
- *
- * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Full_outer_join">Full
- * Join</a>
- * @param left
- * The left-hand PTable of the full join.
- * @param right
- * The right-hand PTable of the full join.
- * @param <K>
- * Type of the keys.
- * @param <U>
- * Type of the first {@link PTable}'s values
- * @param <V>
- * Type of the second {@link PTable}'s values
- * @return The joined result.
- */
- public static <K, U, V> PTable<K, Pair<U, V>> fullJoin(PTable<K, U> left, PTable<K, V> right) {
- // The join function is built from the left table's key and value types.
- return join(left, right, new FullOuterJoinFn<K, U, V>(left.getKeyType(), left.getValueType()));
- }
-
- /**
- * Joins the two tables with a caller-supplied {@link JoinFn}. Both inputs are
- * tagged, unioned and grouped first; the join function is then applied
- * to the grouped table under a stage name made of the join type followed by
- * the grouped table's name.
- *
- * @param left
- * The left-hand PTable of the join.
- * @param right
- * The right-hand PTable of the join.
- * @param joinFn
- * The function that produces the joined entries from the grouped input.
- * @return The joined result.
- */
- public static <K, U, V> PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right, JoinFn<K, U, V> joinFn) {
- PTypeFamily family = left.getTypeFamily();
- PGroupedTable<Pair<K, Integer>, Pair<U, V>> tagged = preJoin(left, right);
- PTableType<K, Pair<U, V>> joinedType = family.tableOf(left.getKeyType(),
- family.pairs(left.getValueType(), right.getValueType()));
-
- String stageName = joinFn.getJoinType() + tagged.getName();
- return tagged.parallelDo(stageName, joinFn, joinedType);
- }
-
- private static <K, U, V> PGroupedTable<Pair<K, Integer>, Pair<U, V>> preJoin(PTable<K, U> left, PTable<K, V> right) {
- PTypeFamily ptf = left.getTypeFamily();
- PTableType<Pair<K, Integer>, Pair<U, V>> ptt = ptf.tableOf(ptf.pairs(left.getKeyType(), ptf.ints()),
- ptf.pairs(left.getValueType(), right.getValueType()));
-
- PTable<Pair<K, Integer>, Pair<U, V>> tag1 = left.parallelDo("joinTagLeft",
- new MapFn<Pair<K, U>, Pair<Pair<K, Integer>, Pair<U, V>>>() {
- @Override
- public Pair<Pair<K, Integer>, Pair<U, V>> map(Pair<K, U> input) {
- return Pair.of(Pair.of(input.first(), 0), Pair.of(input.second(), (V) null));
- }
- }, ptt);
- PTable<Pair<K, Integer>, Pair<U, V>> tag2 = right.parallelDo("joinTagRight",
- new MapFn<Pair<K, V>, Pair<Pair<K, Integer>, Pair<U, V>>>() {
- @Override
- public Pair<Pair<K, Integer>, Pair<U, V>> map(Pair<K, V> input) {
- return Pair.of(Pair.of(input.first(), 1), Pair.of((U) null, input.second()));
- }
- }, ptt);
-
- GroupingOptions.Builder optionsBuilder = GroupingOptions.builder();
- optionsBuilder.partitionerClass(JoinUtils.getPartitionerClass(ptf));
-
- return (tag1.union(tag2)).groupByKey(optionsBuilder.build());
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/lib/PTables.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/lib/PTables.java b/crunch/src/main/java/org/apache/crunch/lib/PTables.java
deleted file mode 100644
index e907680..0000000
--- a/crunch/src/main/java/org/apache/crunch/lib/PTables.java
+++ /dev/null
@@ -1,117 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib;
-
-import java.util.List;
-
-import org.apache.crunch.DoFn;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PGroupedTable;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.fn.IdentityFn;
-import org.apache.crunch.types.PGroupedTableType;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.PTypeFamily;
-
-import com.google.common.collect.Lists;
-
-/**
- * Methods for performing common operations on PTables.
- *
- */
-public class PTables {
-
- /**
- * Convert the given {@code PCollection<Pair<K, V>>} to a {@code PTable<K, V>}.
- * The table type is built from the first two sub-types of the collection's
- * {@code PType}, and the data is passed through an identity function.
- *
- * @param pcollect The {@code PCollection} to convert
- * @return A {@code PTable} that contains the same data as the input {@code PCollection}
- */
- public static <K, V> PTable<K, V> asPTable(PCollection<Pair<K, V>> pcollect) {
- PType<Pair<K, V>> pt = pcollect.getPType();
- PTypeFamily ptf = pt.getFamily();
- // Sub-type 0 is used as the key type and sub-type 1 as the value type.
- PTableType<K, V> ptt = ptf.tableOf(pt.getSubTypes().get(0), pt.getSubTypes().get(1));
- DoFn<Pair<K, V>, Pair<K, V>> id = IdentityFn.getInstance();
- return pcollect.parallelDo("asPTable", id, ptt);
- }
-
- /**
- * Extract the keys from the given {@code PTable<K, V>} as a {@code PCollection<K>}.
- * @param ptable The {@code PTable}
- * @return A {@code PCollection<K>}
- */
- public static <K, V> PCollection<K> keys(PTable<K, V> ptable) {
- return ptable.parallelDo("PTables.keys", new DoFn<Pair<K, V>, K>() {
- @Override
- public void process(Pair<K, V> input, Emitter<K> emitter) {
- emitter.emit(input.first());
- }
- }, ptable.getKeyType());
- }
-
- /**
- * Extract the values from the given {@code PTable<K, V>} as a {@code PCollection<V>}.
- * @param ptable The {@code PTable}
- * @return A {@code PCollection<V>}
- */
- public static <K, V> PCollection<V> values(PTable<K, V> ptable) {
- return ptable.parallelDo("PTables.values", new DoFn<Pair<K, V>, V>() {
- @Override
- public void process(Pair<K, V> input, Emitter<V> emitter) {
- emitter.emit(input.second());
- }
- }, ptable.getValueType());
- }
-
- /**
- * Create a detached value for a table {@link Pair} by detaching the key with
- * the table's key type and the value with the table's value type.
- *
- * @param tableType The table type
- * @param value The pair from which a detached pair is to be created
- * @return The detached pair
- * @see PType#getDetachedValue(Object)
- */
- public static <K, V> Pair<K, V> getDetachedValue(PTableType<K, V> tableType, Pair<K, V> value) {
- K detachedKey = tableType.getKeyType().getDetachedValue(value.first());
- V detachedValue = tableType.getValueType().getDetachedValue(value.second());
- return Pair.of(detachedKey, detachedValue);
- }
-
- /**
- * Create a detached value for a {@link PGroupedTable} entry. Every grouped
- * value is detached and copied into a new list, and the key is detached as
- * well.
- *
- * @param groupedTableType The grouped table type
- * @param value The grouped entry from which a detached entry is to be created
- * @return The detached entry
- * @see PType#getDetachedValue(Object)
- */
- public static <K, V> Pair<K, Iterable<V>> getGroupedDetachedValue(
- PGroupedTableType<K, V> groupedTableType, Pair<K, Iterable<V>> value) {
-
- PTableType<K, V> entryType = groupedTableType.getTableType();
- List<V> copies = Lists.newArrayList();
- PType<V> elementType = entryType.getValueType();
- for (V element : value.second()) {
- copies.add(elementType.getDetachedValue(element));
- }
- K detachedKey = entryType.getKeyType().getDetachedValue(value.first());
- // Widen to Iterable so Pair.of infers the declared return type.
- Iterable<V> detachedValues = copies;
- return Pair.of(detachedKey, detachedValues);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/lib/Sample.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/lib/Sample.java b/crunch/src/main/java/org/apache/crunch/lib/Sample.java
deleted file mode 100644
index 5a66101..0000000
--- a/crunch/src/main/java/org/apache/crunch/lib/Sample.java
+++ /dev/null
@@ -1,217 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib;
-
-
-import org.apache.crunch.MapFn;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.lib.SampleUtils.ReservoirSampleFn;
-import org.apache.crunch.lib.SampleUtils.SampleFn;
-import org.apache.crunch.lib.SampleUtils.WRSCombineFn;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.PTypeFamily;
-
-/**
- * Methods for performing random sampling in a distributed fashion, either by accepting each
- * record in a {@code PCollection} with an independent probability in order to sample some
- * fraction of the overall data set, or by using reservoir sampling in order to pull a uniform
- * or weighted sample of fixed size from a {@code PCollection} of an unknown size. For more details
- * on the reservoir sampling algorithms used by this library, see the A-ES algorithm described in
- * <a href="http://arxiv.org/pdf/1012.0256.pdf">Efraimidis (2012)</a>.
- */
-public class Sample {
-
- /**
- * Output records from the given {@code PCollection} with the given probability.
- *
- * @param input The {@code PCollection} to sample from
- * @param probability The probability (0.0 < p < 1.0)
- * @return The output {@code PCollection} created from sampling
- */
- public static <S> PCollection<S> sample(PCollection<S> input, double probability) {
- return sample(input, null, probability);
- }
-
- /**
- * Output records from the given {@code PCollection} using a given seed. Useful for unit
- * testing.
- *
- * @param input The {@code PCollection} to sample from
- * @param seed The seed for the random number generator
- * @param probability The probability (0.0 < p < 1.0)
- * @return The output {@code PCollection} created from sampling
- */
- public static <S> PCollection<S> sample(PCollection<S> input, Long seed, double probability) {
- String stageName = String.format("sample(%.2f)", probability);
- return input.parallelDo(stageName, new SampleFn<S>(probability, seed), input.getPType());
- }
-
- /**
- * A {@code PTable<K, V>} analogue of the {@code sample} function.
- *
- * @param input The {@code PTable} to sample from
- * @param probability The probability (0.0 < p < 1.0)
- * @return The output {@code PTable} created from sampling
- */
- public static <K, V> PTable<K, V> sample(PTable<K, V> input, double probability) {
- return PTables.asPTable(sample((PCollection<Pair<K, V>>) input, probability));
- }
-
- /**
- * A {@code PTable<K, V>} analogue of the {@code sample} function, with the seed argument
- * exposed for testing purposes.
- *
- * @param input The {@code PTable} to sample from
- * @param seed The seed for the random number generator
- * @param probability The probability (0.0 < p < 1.0)
- * @return The output {@code PTable} created from sampling
- */
- public static <K, V> PTable<K, V> sample(PTable<K, V> input, Long seed, double probability) {
- return PTables.asPTable(sample((PCollection<Pair<K, V>>) input, seed, probability));
- }
-
- /**
- * Select a fixed number of elements from the given {@code PCollection} with each element
- * equally likely to be included in the sample.
- *
- * @param input The input data
- * @param sampleSize The number of elements to select
- * @return A {@code PCollection} made up of the sampled elements
- */
- public static <T> PCollection<T> reservoirSample(
- PCollection<T> input,
- int sampleSize) {
- return reservorSample(input, sampleSize, null);
- }
-
- /**
- * A version of the reservoir sampling algorithm that uses a given seed, primarily for
- * testing purposes.
- *
- * @param input The input data
- * @param sampleSize The number of elements to select
- * @param seed The test seed
- * @return A {@code PCollection} made up of the sampled elements
-
- */
- public static <T> PCollection<T> reservorSample(
- PCollection<T> input,
- int sampleSize,
- Long seed) {
- PTypeFamily ptf = input.getTypeFamily();
- PType<Pair<T, Integer>> ptype = ptf.pairs(input.getPType(), ptf.ints());
- return weightedReservoirSample(
- input.parallelDo(new MapFn<T, Pair<T, Integer>>() {
- public Pair<T, Integer> map(T t) { return Pair.of(t, 1); }
- }, ptype),
- sampleSize,
- seed);
- }
-
- /**
- * Selects a weighted sample of the elements of the given {@code PCollection}, where the second term in
- * the input {@code Pair} is a numerical weight.
- *
- * @param input the weighted observations
- * @param sampleSize The number of elements to select
- * @return A random sample of the given size that respects the weighting values
- */
- public static <T, N extends Number> PCollection<T> weightedReservoirSample(
- PCollection<Pair<T, N>> input,
- int sampleSize) {
- return weightedReservoirSample(input, sampleSize, null);
- }
-
- /**
- * The weighted reservoir sampling function with the seed term exposed for testing purposes.
- *
- * @param input the weighted observations
- * @param sampleSize The number of elements to select
- * @param seed The test seed
- * @return A random sample of the given size that respects the weighting values
- */
- public static <T, N extends Number> PCollection<T> weightedReservoirSample(
- PCollection<Pair<T, N>> input,
- int sampleSize,
- Long seed) {
- PTypeFamily ptf = input.getTypeFamily();
- PTable<Integer, Pair<T, N>> groupedIn = input.parallelDo(
- new MapFn<Pair<T, N>, Pair<Integer, Pair<T, N>>>() {
- @Override
- public Pair<Integer, Pair<T, N>> map(Pair<T, N> p) {
- return Pair.of(0, p);
- }
- }, ptf.tableOf(ptf.ints(), input.getPType()));
- int[] ss = new int[] { sampleSize };
- return groupedWeightedReservoirSample(groupedIn, ss, seed)
- .parallelDo(new MapFn<Pair<Integer, T>, T>() {
- @Override
- public T map(Pair<Integer, T> p) {
- return p.second();
- }
- }, (PType<T>) input.getPType().getSubTypes().get(0));
- }
-
- /**
- * The most general-purpose of the weighted reservoir sampling patterns, which allows us to choose
- * a random sample of elements for each of N input groups.
- *
- * @param input A {@code PTable} with the key a group ID and the value a weighted observation in that group
- * @param sampleSizes An array of length N, where each entry is the number of elements to include in that group
- * @return A {@code PCollection} of the sampled elements for each of the groups
- */
-
- public static <T, N extends Number> PCollection<Pair<Integer, T>> groupedWeightedReservoirSample(
- PTable<Integer, Pair<T, N>> input,
- int[] sampleSizes) {
- return groupedWeightedReservoirSample(input, sampleSizes, null);
- }
-
- /**
- * Same as the other groupedWeightedReservoirSample method, but includes a seed for testing
- * purposes.
- *
- * @param input A {@code PTable} with the key a group ID and the value a weighted observation in that group
- * @param sampleSizes An array of length N, where each entry is the number of elements to include in that group
- * @param seed The test seed
- * @return A {@code PCollection} of the sampled elements for each of the groups
- */
- public static <T, N extends Number> PCollection<Pair<Integer, T>> groupedWeightedReservoirSample(
- PTable<Integer, Pair<T, N>> input,
- int[] sampleSizes,
- Long seed) {
- PTypeFamily ptf = input.getTypeFamily();
- PType<T> ttype = (PType<T>) input.getPTableType().getValueType().getSubTypes().get(0);
- PTableType<Integer, Pair<Double, T>> ptt = ptf.tableOf(ptf.ints(),
- ptf.pairs(ptf.doubles(), ttype));
-
- return input.parallelDo(new ReservoirSampleFn<T, N>(sampleSizes, seed, ttype), ptt)
- .groupByKey(1)
- .combineValues(new WRSCombineFn<T>(sampleSizes, ttype))
- .parallelDo(new MapFn<Pair<Integer, Pair<Double, T>>, Pair<Integer, T>>() {
- @Override
- public Pair<Integer, T> map(Pair<Integer, Pair<Double, T>> p) {
- return Pair.of(p.first(), p.second().second());
- }
- }, ptf.pairs(ptf.ints(), ttype));
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/lib/SampleUtils.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/lib/SampleUtils.java b/crunch/src/main/java/org/apache/crunch/lib/SampleUtils.java
deleted file mode 100644
index 8769eed..0000000
--- a/crunch/src/main/java/org/apache/crunch/lib/SampleUtils.java
+++ /dev/null
@@ -1,168 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib;
-
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-import java.util.SortedMap;
-
-import org.apache.crunch.CombineFn;
-import org.apache.crunch.DoFn;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.FilterFn;
-import org.apache.crunch.Pair;
-import org.apache.crunch.types.PType;
-
-import com.google.common.base.Preconditions;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-
-class SampleUtils {
-
- static class SampleFn<S> extends FilterFn<S> {
-
- private final Long seed;
- private final double acceptanceProbability;
- private transient Random r;
-
- public SampleFn(double acceptanceProbability, Long seed) {
- Preconditions.checkArgument(0.0 < acceptanceProbability && acceptanceProbability < 1.0);
- this.seed = seed == null ? System.currentTimeMillis() : seed;
- this.acceptanceProbability = acceptanceProbability;
- }
-
- @Override
- public void initialize() {
- if (r == null) {
- r = new Random(seed);
- }
- }
-
- @Override
- public boolean accept(S input) {
- return r.nextDouble() < acceptanceProbability;
- }
- }
-
-
- static class ReservoirSampleFn<T, N extends Number>
- extends DoFn<Pair<Integer, Pair<T, N>>, Pair<Integer, Pair<Double, T>>> {
-
- private int[] sampleSizes;
- private Long seed;
- private PType<T> valueType;
- private transient List<SortedMap<Double, T>> reservoirs;
- private transient Random random;
-
- public ReservoirSampleFn(int[] sampleSizes, Long seed, PType<T> valueType) {
- this.sampleSizes = sampleSizes;
- this.seed = seed;
- this.valueType = valueType;
- }
-
- @Override
- public void initialize() {
- this.reservoirs = Lists.newArrayList();
- this.valueType.initialize(getConfiguration());
- for (int i = 0; i < sampleSizes.length; i++) {
- reservoirs.add(Maps.<Double, T>newTreeMap());
- }
- if (random == null) {
- if (seed == null) {
- this.random = new Random();
- } else {
- this.random = new Random(seed);
- }
- }
- }
-
- @Override
- public void process(Pair<Integer, Pair<T, N>> input,
- Emitter<Pair<Integer, Pair<Double, T>>> emitter) {
- int id = input.first();
- Pair<T, N> p = input.second();
- double weight = p.second().doubleValue();
- if (weight > 0.0) {
- double score = Math.log(random.nextDouble()) / weight;
- SortedMap<Double, T> reservoir = reservoirs.get(id);
- if (reservoir.size() < sampleSizes[id]) {
- reservoir.put(score, valueType.getDetachedValue(p.first()));
- } else if (score > reservoir.firstKey()) {
- reservoir.remove(reservoir.firstKey());
- reservoir.put(score, valueType.getDetachedValue(p.first()));
- }
- }
- }
-
- @Override
- public void cleanup(Emitter<Pair<Integer, Pair<Double, T>>> emitter) {
- for (int id = 0; id < reservoirs.size(); id++) {
- SortedMap<Double, T> reservoir = reservoirs.get(id);
- for (Map.Entry<Double, T> e : reservoir.entrySet()) {
- emitter.emit(Pair.of(id, Pair.of(e.getKey(), e.getValue())));
- }
- }
- }
- }
-
- static class WRSCombineFn<T> extends CombineFn<Integer, Pair<Double, T>> {
-
- private int[] sampleSizes;
- private PType<T> valueType;
- private List<SortedMap<Double, T>> reservoirs;
-
- public WRSCombineFn(int[] sampleSizes, PType<T> valueType) {
- this.sampleSizes = sampleSizes;
- this.valueType = valueType;
- }
-
- @Override
- public void initialize() {
- this.reservoirs = Lists.newArrayList();
- for (int i = 0; i < sampleSizes.length; i++) {
- reservoirs.add(Maps.<Double, T>newTreeMap());
- }
- this.valueType.initialize(getConfiguration());
- }
-
- @Override
- public void process(Pair<Integer, Iterable<Pair<Double, T>>> input,
- Emitter<Pair<Integer, Pair<Double, T>>> emitter) {
- SortedMap<Double, T> reservoir = reservoirs.get(input.first());
- for (Pair<Double, T> p : input.second()) {
- if (reservoir.size() < sampleSizes[input.first()]) {
- reservoir.put(p.first(), valueType.getDetachedValue(p.second()));
- } else if (p.first() > reservoir.firstKey()) {
- reservoir.remove(reservoir.firstKey());
- reservoir.put(p.first(), valueType.getDetachedValue(p.second()));
- }
- }
- }
-
- @Override
- public void cleanup(Emitter<Pair<Integer, Pair<Double, T>>> emitter) {
- for (int i = 0; i < reservoirs.size(); i++) {
- SortedMap<Double, T> reservoir = reservoirs.get(i);
- for (Map.Entry<Double, T> e : reservoir.entrySet()) {
- emitter.emit(Pair.of(i, Pair.of(e.getKey(), e.getValue())));
- }
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/lib/SecondarySort.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/lib/SecondarySort.java b/crunch/src/main/java/org/apache/crunch/lib/SecondarySort.java
deleted file mode 100644
index 54b4396..0000000
--- a/crunch/src/main/java/org/apache/crunch/lib/SecondarySort.java
+++ /dev/null
@@ -1,118 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib;
-
-import java.util.Collection;
-
-import org.apache.crunch.DoFn;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.GroupingOptions;
-import org.apache.crunch.MapFn;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PGroupedTable;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.lib.join.JoinUtils;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.hadoop.conf.Configuration;
-
-/**
- * Utilities for performing a secondary sort on a {@code PTable<K, Pair<V1, V2>>} collection.
- * <p>
- * Secondary sorts are usually performed during sessionization: given a collection
- * of events, we want to group them by a key (such as a user ID), then sort the grouped
- * records by an auxiliary key (such as a timestamp), and then perform some additional
- * processing on the sorted records.
- */
-public class SecondarySort {
-
- /**
- * Perform a secondary sort on the given {@code PTable} instance and then apply a
- * {@code DoFn} to the resulting sorted data to yield an output {@code PCollection<T>}.
- */
- public static <K, V1, V2, T> PCollection<T> sortAndApply(PTable<K, Pair<V1, V2>> input,
- DoFn<Pair<K, Iterable<Pair<V1, V2>>>, T> doFn, PType<T> ptype) {
- return prepare(input)
- .parallelDo("SecondarySort.apply", new SSWrapFn<K, V1, V2, T>(doFn), ptype);
- }
-
- /**
- * Perform a secondary sort on the given {@code PTable} instance and then apply a
- * {@code DoFn} to the resulting sorted data to yield an output {@code PTable<U, V>}.
- */
- public static <K, V1, V2, U, V> PTable<U, V> sortAndApply(PTable<K, Pair<V1, V2>> input,
- DoFn<Pair<K, Iterable<Pair<V1, V2>>>, Pair<U, V>> doFn, PTableType<U, V> ptype) {
- return prepare(input)
- .parallelDo("SecondarySort.apply", new SSWrapFn<K, V1, V2, Pair<U, V>>(doFn), ptype);
- }
-
- private static <K, V1, V2> PGroupedTable<Pair<K, V1>, Pair<V1, V2>> prepare(
- PTable<K, Pair<V1, V2>> input) {
- PTypeFamily ptf = input.getTypeFamily();
- PType<Pair<V1, V2>> valueType = input.getValueType();
- PTableType<Pair<K, V1>, Pair<V1, V2>> inter = ptf.tableOf(
- ptf.pairs(input.getKeyType(), valueType.getSubTypes().get(0)),
- valueType);
- PTableType<K, Collection<Pair<V1, V2>>> out = ptf.tableOf(input.getKeyType(),
- ptf.collections(input.getValueType()));
- return input.parallelDo("SecondarySort.format", new SSFormatFn<K, V1, V2>(), inter)
- .groupByKey(
- GroupingOptions.builder()
- .groupingComparatorClass(JoinUtils.getGroupingComparator(ptf))
- .partitionerClass(JoinUtils.getPartitionerClass(ptf))
- .build());
- }
-
- private static class SSFormatFn<K, V1, V2> extends MapFn<Pair<K, Pair<V1, V2>>, Pair<Pair<K, V1>, Pair<V1, V2>>> {
- @Override
- public Pair<Pair<K, V1>, Pair<V1, V2>> map(Pair<K, Pair<V1, V2>> input) {
- return Pair.of(Pair.of(input.first(), input.second().first()), input.second());
- }
- }
-
- private static class SSWrapFn<K, V1, V2, T> extends DoFn<Pair<Pair<K, V1>, Iterable<Pair<V1, V2>>>, T> {
- private final DoFn<Pair<K, Iterable<Pair<V1, V2>>>, T> intern;
-
- public SSWrapFn(DoFn<Pair<K, Iterable<Pair<V1, V2>>>, T> intern) {
- this.intern = intern;
- }
-
- @Override
- public void configure(Configuration conf) {
- intern.configure(conf);
- }
-
- @Override
- public void initialize() {
- intern.setContext(getContext());
- intern.initialize();
- }
-
- @Override
- public void process(Pair<Pair<K, V1>, Iterable<Pair<V1, V2>>> input, Emitter<T> emitter) {
- intern.process(Pair.of(input.first().first(), input.second()), emitter);
- }
-
- @Override
- public void cleanup(Emitter<T> emitter) {
- intern.cleanup(emitter);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/lib/Set.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/lib/Set.java b/crunch/src/main/java/org/apache/crunch/lib/Set.java
deleted file mode 100644
index 0ba879c..0000000
--- a/crunch/src/main/java/org/apache/crunch/lib/Set.java
+++ /dev/null
@@ -1,118 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib;
-
-import java.util.Collection;
-
-import org.apache.crunch.DoFn;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.Tuple3;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.PTypeFamily;
-
-/**
- * Utilities for performing set operations (difference, intersection, etc) on
- * {@code PCollection} instances.
- */
-public class Set {
-
- /**
- * Compute the set difference between two sets of elements.
- *
- * @return a collection containing elements that are in <code>coll1</code> but
- * not in <code>coll2</code>
- */
- public static <T> PCollection<T> difference(PCollection<T> coll1, PCollection<T> coll2) {
- return Cogroup.cogroup(toTable(coll1), toTable(coll2)).parallelDo(
- new DoFn<Pair<T, Pair<Collection<Boolean>, Collection<Boolean>>>, T>() {
- @Override
- public void process(Pair<T, Pair<Collection<Boolean>, Collection<Boolean>>> input, Emitter<T> emitter) {
- Pair<Collection<Boolean>, Collection<Boolean>> groups = input.second();
- if (!groups.first().isEmpty() && groups.second().isEmpty()) {
- emitter.emit(input.first());
- }
- }
- }, coll1.getPType());
- }
-
- /**
- * Compute the intersection of two sets of elements.
- *
- * @return a collection containing elements that are common to both sets
- * <code>coll1</code> and <code>coll2</code>
- */
- public static <T> PCollection<T> intersection(PCollection<T> coll1, PCollection<T> coll2) {
- return Cogroup.cogroup(toTable(coll1), toTable(coll2)).parallelDo(
- new DoFn<Pair<T, Pair<Collection<Boolean>, Collection<Boolean>>>, T>() {
- @Override
- public void process(Pair<T, Pair<Collection<Boolean>, Collection<Boolean>>> input, Emitter<T> emitter) {
- Pair<Collection<Boolean>, Collection<Boolean>> groups = input.second();
- if (!groups.first().isEmpty() && !groups.second().isEmpty()) {
- emitter.emit(input.first());
- }
- }
- }, coll1.getPType());
- }
-
- /**
- * Find the elements that are common to two sets, like the Unix
- * <code>comm</code> utility. This method returns a {@link PCollection} of
- * {@link Tuple3} objects, and the position in the tuple that an element
- * appears is determined by the collections that it is a member of, as
- * follows:
- * <ol>
- * <li>elements only in <code>coll1</code>,</li>
- * <li>elements only in <code>coll2</code>, or</li>
- * <li>elements in both collections</li>
- * </ol>
- * Tuples are otherwise filled with <code>null</code>.
- *
- * @return a collection of {@link Tuple3} objects
- */
- public static <T> PCollection<Tuple3<T, T, T>> comm(PCollection<T> coll1, PCollection<T> coll2) {
- PTypeFamily typeFamily = coll1.getTypeFamily();
- PType<T> type = coll1.getPType();
- return Cogroup.cogroup(toTable(coll1), toTable(coll2)).parallelDo(
- new DoFn<Pair<T, Pair<Collection<Boolean>, Collection<Boolean>>>, Tuple3<T, T, T>>() {
- @Override
- public void process(Pair<T, Pair<Collection<Boolean>, Collection<Boolean>>> input,
- Emitter<Tuple3<T, T, T>> emitter) {
- Pair<Collection<Boolean>, Collection<Boolean>> groups = input.second();
- boolean inFirst = !groups.first().isEmpty();
- boolean inSecond = !groups.second().isEmpty();
- T t = input.first();
- emitter.emit(Tuple3.of(inFirst && !inSecond ? t : null, !inFirst && inSecond ? t : null, inFirst
- && inSecond ? t : null));
- }
- }, typeFamily.triples(type, type, type));
- }
-
- private static <T> PTable<T, Boolean> toTable(PCollection<T> coll) {
- PTypeFamily typeFamily = coll.getTypeFamily();
- return coll.parallelDo(new DoFn<T, Pair<T, Boolean>>() {
- @Override
- public void process(T input, Emitter<Pair<T, Boolean>> emitter) {
- emitter.emit(Pair.of(input, Boolean.TRUE));
- }
- }, typeFamily.tableOf(coll.getPType(), typeFamily.booleans()));
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/lib/Sort.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/lib/Sort.java b/crunch/src/main/java/org/apache/crunch/lib/Sort.java
deleted file mode 100644
index 23bcaee..0000000
--- a/crunch/src/main/java/org/apache/crunch/lib/Sort.java
+++ /dev/null
@@ -1,294 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib;
-
-import static org.apache.crunch.lib.sort.Comparators.*;
-import static org.apache.crunch.lib.sort.SortFns.*;
-
-import org.apache.avro.Schema;
-import org.apache.crunch.DoFn;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.GroupingOptions;
-import org.apache.crunch.GroupingOptions.Builder;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.SourceTarget;
-import org.apache.crunch.Tuple;
-import org.apache.crunch.Tuple3;
-import org.apache.crunch.Tuple4;
-import org.apache.crunch.TupleN;
-import org.apache.crunch.lib.sort.TotalOrderPartitioner;
-import org.apache.crunch.materialize.MaterializableIterable;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.crunch.types.avro.AvroType;
-import org.apache.crunch.types.avro.AvroTypeFamily;
-import org.apache.crunch.types.writable.WritableTypeFamily;
-import org.apache.crunch.util.PartitionUtils;
-import org.apache.hadoop.conf.Configuration;
-
-/**
- * Utilities for sorting {@code PCollection} instances.
- */
-public class Sort {
-
- /**
- * For signaling the order in which a sort should be done.
- */
- public enum Order {
- ASCENDING,
- DESCENDING,
- IGNORE
- }
-
- /**
- * To sort by column 2 ascending then column 1 descending, you would use:
- * <code>
- * sortPairs(coll, by(2, ASCENDING), by(1, DESCENDING))
- * </code> Column numbering is 1-based.
- */
- public static class ColumnOrder {
- private int column;
- private Order order;
-
- public ColumnOrder(int column, Order order) {
- this.column = column;
- this.order = order;
- }
-
- public static ColumnOrder by(int column, Order order) {
- return new ColumnOrder(column, order);
- }
-
- public int column() {
- return column;
- }
-
- public Order order() {
- return order;
- }
-
- @Override
- public String toString() {
- return "ColumnOrder: column:" + column + ", Order: " + order;
- }
- }
-
- /**
- * Sorts the {@code PCollection} using the natural ordering of its elements in ascending order.
- *
- * @return a {@code PCollection} representing the sorted collection.
- */
- public static <T> PCollection<T> sort(PCollection<T> collection) {
- return sort(collection, Order.ASCENDING);
- }
-
- /**
- * Sorts the {@code PCollection} using the natural order of its elements with the given {@code Order}.
- *
- * @return a {@code PCollection} representing the sorted collection.
- */
- public static <T> PCollection<T> sort(PCollection<T> collection, Order order) {
- return sort(collection, -1, order);
- }
-
- /**
- * Sorts the {@code PCollection} using the natural ordering of its elements in
- * the order specified using the given number of reducers.
- *
- * @return a {@code PCollection} representing the sorted collection.
- */
- public static <T> PCollection<T> sort(PCollection<T> collection, int numReducers, Order order) {
- PTypeFamily tf = collection.getTypeFamily();
- PTableType<T, Void> type = tf.tableOf(collection.getPType(), tf.nulls());
- Configuration conf = collection.getPipeline().getConfiguration();
- PTable<T, Void> pt = collection.parallelDo("sort-pre", new DoFn<T, Pair<T, Void>>() {
- @Override
- public void process(T input, Emitter<Pair<T, Void>> emitter) {
- emitter.emit(Pair.of(input, (Void) null));
- }
- }, type);
- GroupingOptions options = buildGroupingOptions(pt, conf, numReducers, order);
- return pt.groupByKey(options).ungroup().keys();
- }
-
- /**
- * Sorts the {@code PTable} using the natural ordering of its keys in ascending order.
- *
- * @return a {@code PTable} representing the sorted table.
- */
- public static <K, V> PTable<K, V> sort(PTable<K, V> table) {
- return sort(table, Order.ASCENDING);
- }
-
- /**
- * Sorts the {@code PTable} using the natural ordering of its keys with the given {@code Order}.
- *
- * @return a {@code PTable} representing the sorted table.
- */
- public static <K, V> PTable<K, V> sort(PTable<K, V> table, Order key) {
- return sort(table, -1, key);
- }
-
- /**
- * Sorts the {@code PTable} using the natural ordering of its keys in the
- * order specified with a client-specified number of reducers.
- *
- * @return a {@code PTable} representing the sorted table.
- */
- public static <K, V> PTable<K, V> sort(PTable<K, V> table, int numReducers, Order key) {
- Configuration conf = table.getPipeline().getConfiguration();
- GroupingOptions options = buildGroupingOptions(table, conf, numReducers, key);
- return table.groupByKey(options).ungroup();
- }
-
-
- /**
- * Sorts the {@code PCollection} of {@code Pair}s using the specified column
- * ordering.
- *
- * @return a {@code PCollection} representing the sorted collection.
- */
- public static <U, V> PCollection<Pair<U, V>> sortPairs(PCollection<Pair<U, V>> collection,
- ColumnOrder... columnOrders) {
- return sortTuples(collection, columnOrders);
- }
-
- /**
- * Sorts the {@code PCollection} of {@code Tuple3}s using the specified column
- * ordering.
- *
- * @return a {@code PCollection} representing the sorted collection.
- */
- public static <V1, V2, V3> PCollection<Tuple3<V1, V2, V3>> sortTriples(PCollection<Tuple3<V1, V2, V3>> collection,
- ColumnOrder... columnOrders) {
- return sortTuples(collection, columnOrders);
- }
-
- /**
- * Sorts the {@code PCollection} of {@code Tuple4}s using the specified column
- * ordering.
- *
- * @return a {@code PCollection} representing the sorted collection.
- */
- public static <V1, V2, V3, V4> PCollection<Tuple4<V1, V2, V3, V4>> sortQuads(
- PCollection<Tuple4<V1, V2, V3, V4>> collection, ColumnOrder... columnOrders) {
- return sortTuples(collection, columnOrders);
- }
-
- /**
- * Sorts the {@code PCollection} of tuples using the specified column ordering.
- *
- * @return a {@code PCollection} representing the sorted collection.
- */
- public static <T extends Tuple> PCollection<T> sortTuples(PCollection<T> collection,
- ColumnOrder... columnOrders) {
- return sortTuples(collection, -1, columnOrders);
- }
-
- /**
- * Sorts the {@code PCollection} of tuples using the specified column
- * ordering and a client-specified number of reducers.
- *
- * @return a {@code PCollection} representing the sorted collection.
- */
- public static <T extends Tuple> PCollection<T> sortTuples(PCollection<T> collection, int numReducers,
- ColumnOrder... columnOrders) {
- PType<T> pType = collection.getPType();
- KeyExtraction<T> ke = new KeyExtraction<T>(pType, columnOrders);
- PTable<Object, T> pt = collection.by(ke.getByFn(), ke.getKeyType());
- Configuration conf = collection.getPipeline().getConfiguration();
- GroupingOptions options = buildGroupingOptions(pt, conf, numReducers, columnOrders);
- return pt.groupByKey(options).ungroup().values();
- }
-
- // TODO: move to type family?
- private static <K, V> GroupingOptions buildGroupingOptions(PTable<K, V> ptable, Configuration conf,
- int numReducers, Order order) {
- PType<K> ptype = ptable.getKeyType();
- PTypeFamily tf = ptable.getTypeFamily();
- Builder builder = GroupingOptions.builder();
- if (order == Order.DESCENDING) {
- if (tf == WritableTypeFamily.getInstance()) {
- builder.sortComparatorClass(ReverseWritableComparator.class);
- } else if (tf == AvroTypeFamily.getInstance()) {
- AvroType<K> avroType = (AvroType<K>) ptype;
- Schema schema = avroType.getSchema();
- builder.conf("crunch.schema", schema.toString());
- builder.sortComparatorClass(ReverseAvroComparator.class);
- } else {
- throw new RuntimeException("Unrecognized type family: " + tf);
- }
- } else if (tf == AvroTypeFamily.getInstance()) {
- builder.conf("crunch.schema", ((AvroType<K>) ptype).getSchema().toString());
- }
- configureReducers(builder, ptable, conf, numReducers);
- return builder.build();
- }
-
- private static <K, V> GroupingOptions buildGroupingOptions(PTable<K, V> ptable, Configuration conf,
- int numReducers, ColumnOrder[] columnOrders) {
- PTypeFamily tf = ptable.getTypeFamily();
- PType<K> keyType = ptable.getKeyType();
- Builder builder = GroupingOptions.builder();
- if (tf == WritableTypeFamily.getInstance()) {
- if (columnOrders.length == 1 && columnOrders[0].order == Order.DESCENDING) {
- builder.sortComparatorClass(ReverseWritableComparator.class);
- } else {
- TupleWritableComparator.configureOrdering(conf, columnOrders);
- builder.sortComparatorClass(TupleWritableComparator.class);
- }
- } else if (tf == AvroTypeFamily.getInstance()) {
- AvroType<K> avroType = (AvroType<K>) keyType;
- Schema schema = avroType.getSchema();
- builder.conf("crunch.schema", schema.toString());
- if (columnOrders.length == 1 && columnOrders[0].order == Order.DESCENDING) {
- builder.sortComparatorClass(ReverseAvroComparator.class);
- }
- } else {
- throw new RuntimeException("Unrecognized type family: " + tf);
- }
- configureReducers(builder, ptable, conf, numReducers);
- return builder.build();
- }
-
- private static <K, V> void configureReducers(GroupingOptions.Builder builder,
- PTable<K, V> ptable, Configuration conf, int numReducers) {
- if (numReducers <= 0) {
- numReducers = PartitionUtils.getRecommendedPartitions(ptable, conf);
- if (numReducers < 5) {
- // Not worth the overhead, force it to 1
- numReducers = 1;
- }
- }
- builder.numReducers(numReducers);
- if (numReducers > 1) {
- Iterable<K> iter = Sample.reservoirSample(ptable.keys(), numReducers - 1).materialize();
- MaterializableIterable<K> mi = (MaterializableIterable<K>) iter;
- if (mi.isSourceTarget()) {
- builder.sourceTarget((SourceTarget) mi.getSource());
- }
- builder.partitionerClass(TotalOrderPartitioner.class);
- builder.conf(TotalOrderPartitioner.PARTITIONER_PATH, mi.getPath().toString());
- //TODO: distcache handling
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/lib/join/FullOuterJoinFn.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/lib/join/FullOuterJoinFn.java b/crunch/src/main/java/org/apache/crunch/lib/join/FullOuterJoinFn.java
deleted file mode 100644
index c0ce727..0000000
--- a/crunch/src/main/java/org/apache/crunch/lib/join/FullOuterJoinFn.java
+++ /dev/null
@@ -1,102 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib.join;
-
-import java.util.List;
-
-import org.apache.crunch.Emitter;
-import org.apache.crunch.Pair;
-import org.apache.crunch.types.PType;
-
-import com.google.common.collect.Lists;
-
-/**
- * Used to perform the last step of an full outer join.
- *
- * @param <K> Type of the keys.
- * @param <U> Type of the first {@link org.apache.crunch.PTable}'s values
- * @param <V> Type of the second {@link org.apache.crunch.PTable}'s values
- */
-public class FullOuterJoinFn<K, U, V> extends JoinFn<K, U, V> {
-
- private transient int lastId;
- private transient K lastKey;
- private transient List<U> leftValues;
-
- public FullOuterJoinFn(PType<K> keyType, PType<U> leftValueType) {
- super(keyType, leftValueType);
- }
-
- /** {@inheritDoc} */
- @Override
- public void initialize() {
- super.initialize();
- lastId = 1;
- lastKey = null;
- this.leftValues = Lists.newArrayList();
- }
-
- /** {@inheritDoc} */
- @Override
- public void join(K key, int id, Iterable<Pair<U, V>> pairs, Emitter<Pair<K, Pair<U, V>>> emitter) {
- if (!key.equals(lastKey)) {
- // Make sure that left side gets emitted.
- if (0 == lastId) {
- for (U u : leftValues) {
- emitter.emit(Pair.of(lastKey, Pair.of(u, (V) null)));
- }
- }
- lastKey = keyType.getDetachedValue(key);
- leftValues.clear();
- }
- if (id == 0) {
- for (Pair<U, V> pair : pairs) {
- if (pair.first() != null)
- leftValues.add(leftValueType.getDetachedValue(pair.first()));
- }
- } else {
- for (Pair<U, V> pair : pairs) {
- // Make sure that right side gets emitted.
- if (leftValues.isEmpty()) {
- leftValues.add(null);
- }
- for (U u : leftValues) {
- emitter.emit(Pair.of(lastKey, Pair.of(u, pair.second())));
- }
- }
- }
-
- lastId = id;
- }
-
- /** {@inheritDoc} */
- @Override
- public void cleanup(Emitter<Pair<K, Pair<U, V>>> emitter) {
- if (0 == lastId) {
- for (U u : leftValues) {
- emitter.emit(Pair.of(lastKey, Pair.of(u, (V) null)));
- }
- }
- }
-
- /** {@inheritDoc} */
- @Override
- public String getJoinType() {
- return "fullOuterJoin";
- }
-}
[32/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/JobPrototype.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/JobPrototype.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/JobPrototype.java
new file mode 100644
index 0000000..f22b5a1
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/JobPrototype.java
@@ -0,0 +1,245 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.plan;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.Target;
+import org.apache.crunch.hadoop.mapreduce.lib.jobcontrol.CrunchControlledJob;
+import org.apache.crunch.impl.mr.collect.DoTableImpl;
+import org.apache.crunch.impl.mr.collect.PCollectionImpl;
+import org.apache.crunch.impl.mr.collect.PGroupedTableImpl;
+import org.apache.crunch.impl.mr.exec.CrunchJobHooks;
+import org.apache.crunch.impl.mr.run.CrunchCombiner;
+import org.apache.crunch.impl.mr.run.CrunchInputFormat;
+import org.apache.crunch.impl.mr.run.CrunchMapper;
+import org.apache.crunch.impl.mr.run.CrunchReducer;
+import org.apache.crunch.impl.mr.run.NodeContext;
+import org.apache.crunch.impl.mr.run.RTNode;
+import org.apache.crunch.util.DistCache;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.Job;
+
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+
+class JobPrototype {
+
+ public static JobPrototype createMapReduceJob(int jobID, PGroupedTableImpl<?, ?> group,
+ Set<NodePath> inputs, Path workingPath) {
+ return new JobPrototype(jobID, inputs, group, workingPath);
+ }
+
+ public static JobPrototype createMapOnlyJob(int jobID, HashMultimap<Target, NodePath> mapNodePaths, Path workingPath) {
+ return new JobPrototype(jobID, mapNodePaths, workingPath);
+ }
+
+ private final int jobID; // TODO: maybe stageID sounds better
+ private final Set<NodePath> mapNodePaths;
+ private final PGroupedTableImpl<?, ?> group;
+ private final Set<JobPrototype> dependencies = Sets.newHashSet();
+ private final Map<PCollectionImpl<?>, DoNode> nodes = Maps.newHashMap();
+ private final Path workingPath;
+
+ private HashMultimap<Target, NodePath> targetsToNodePaths;
+ private DoTableImpl<?, ?> combineFnTable;
+
+ private CrunchControlledJob job;
+
+ private JobPrototype(int jobID, Set<NodePath> inputs, PGroupedTableImpl<?, ?> group, Path workingPath) {
+ this.jobID = jobID;
+ this.mapNodePaths = ImmutableSet.copyOf(inputs);
+ this.group = group;
+ this.workingPath = workingPath;
+ this.targetsToNodePaths = null;
+ }
+
+ private JobPrototype(int jobID, HashMultimap<Target, NodePath> outputPaths, Path workingPath) {
+ this.jobID = jobID;
+ this.group = null;
+ this.mapNodePaths = null;
+ this.workingPath = workingPath;
+ this.targetsToNodePaths = outputPaths;
+ }
+
+ public int getJobID() {
+ return jobID;
+ }
+
+ public boolean isMapOnly() {
+ return this.group == null;
+ }
+
+ Set<NodePath> getMapNodePaths() {
+ return mapNodePaths;
+ }
+
+ PGroupedTableImpl<?, ?> getGroupingTable() {
+ return group;
+ }
+
+ HashMultimap<Target, NodePath> getTargetsToNodePaths() {
+ return targetsToNodePaths;
+ }
+
+ public void addReducePaths(HashMultimap<Target, NodePath> outputPaths) {
+ if (group == null) {
+ throw new IllegalStateException("Cannot add a reduce phase to a map-only job");
+ }
+ this.targetsToNodePaths = outputPaths;
+ }
+
+ public void addDependency(JobPrototype dependency) {
+ this.dependencies.add(dependency);
+ }
+
+ public CrunchControlledJob getCrunchJob(Class<?> jarClass, Configuration conf, Pipeline pipeline) throws IOException {
+ if (job == null) {
+ job = build(jarClass, conf, pipeline);
+ for (JobPrototype proto : dependencies) {
+ job.addDependingJob(proto.getCrunchJob(jarClass, conf, pipeline));
+ }
+ }
+ return job;
+ }
+
+ private CrunchControlledJob build(Class<?> jarClass, Configuration conf, Pipeline pipeline) throws IOException {
+ Job job = new Job(conf);
+ conf = job.getConfiguration();
+ conf.set(PlanningParameters.CRUNCH_WORKING_DIRECTORY, workingPath.toString());
+ job.setJarByClass(jarClass);
+
+ Set<DoNode> outputNodes = Sets.newHashSet();
+ Set<Target> targets = targetsToNodePaths.keySet();
+ Path outputPath = new Path(workingPath, "output");
+ MSCROutputHandler outputHandler = new MSCROutputHandler(job, outputPath, group == null);
+ for (Target target : targets) {
+ DoNode node = null;
+ for (NodePath nodePath : targetsToNodePaths.get(target)) {
+ if (node == null) {
+ PCollectionImpl<?> collect = nodePath.tail();
+ node = DoNode.createOutputNode(target.toString(), collect.getPType());
+ outputHandler.configureNode(node, target);
+ }
+ outputNodes.add(walkPath(nodePath.descendingIterator(), node));
+ }
+ }
+
+ job.setMapperClass(CrunchMapper.class);
+ List<DoNode> inputNodes;
+ DoNode reduceNode = null;
+ if (group != null) {
+ job.setReducerClass(CrunchReducer.class);
+ List<DoNode> reduceNodes = Lists.newArrayList(outputNodes);
+ serialize(reduceNodes, conf, workingPath, NodeContext.REDUCE);
+ reduceNode = reduceNodes.get(0);
+
+ if (combineFnTable != null) {
+ job.setCombinerClass(CrunchCombiner.class);
+ DoNode combinerInputNode = group.createDoNode();
+ DoNode combineNode = combineFnTable.createDoNode();
+ combineNode.addChild(group.getGroupingNode());
+ combinerInputNode.addChild(combineNode);
+ serialize(ImmutableList.of(combinerInputNode), conf, workingPath, NodeContext.COMBINE);
+ }
+
+ group.configureShuffle(job);
+
+ DoNode mapOutputNode = group.getGroupingNode();
+ Set<DoNode> mapNodes = Sets.newHashSet();
+ for (NodePath nodePath : mapNodePaths) {
+ // Advance these one step, since we've already configured
+ // the grouping node, and the PGroupedTableImpl is the tail
+ // of the NodePath.
+ Iterator<PCollectionImpl<?>> iter = nodePath.descendingIterator();
+ iter.next();
+ mapNodes.add(walkPath(iter, mapOutputNode));
+ }
+ inputNodes = Lists.newArrayList(mapNodes);
+ } else { // No grouping
+ job.setNumReduceTasks(0);
+ inputNodes = Lists.newArrayList(outputNodes);
+ }
+ serialize(inputNodes, conf, workingPath, NodeContext.MAP);
+
+ if (inputNodes.size() == 1) {
+ DoNode inputNode = inputNodes.get(0);
+ inputNode.getSource().configureSource(job, -1);
+ } else {
+ for (int i = 0; i < inputNodes.size(); i++) {
+ DoNode inputNode = inputNodes.get(i);
+ inputNode.getSource().configureSource(job, i);
+ }
+ job.setInputFormatClass(CrunchInputFormat.class);
+ }
+ job.setJobName(createJobName(pipeline.getName(), inputNodes, reduceNode));
+
+ return new CrunchControlledJob(
+ jobID,
+ job,
+ new CrunchJobHooks.PrepareHook(job),
+ new CrunchJobHooks.CompletionHook(job, outputPath, outputHandler.getMultiPaths(), group == null));
+ }
+
+ private void serialize(List<DoNode> nodes, Configuration conf, Path workingPath, NodeContext context)
+ throws IOException {
+ List<RTNode> rtNodes = Lists.newArrayList();
+ for (DoNode node : nodes) {
+ rtNodes.add(node.toRTNode(true, conf, context));
+ }
+ Path path = new Path(workingPath, context.toString());
+ DistCache.write(conf, path, rtNodes);
+ }
+
+ private String createJobName(String pipelineName, List<DoNode> mapNodes, DoNode reduceNode) {
+ JobNameBuilder builder = new JobNameBuilder(pipelineName);
+ builder.visit(mapNodes);
+ if (reduceNode != null) {
+ builder.visit(reduceNode);
+ }
+ return builder.build();
+ }
+
+ private DoNode walkPath(Iterator<PCollectionImpl<?>> iter, DoNode working) {
+ while (iter.hasNext()) {
+ PCollectionImpl<?> collect = iter.next();
+ if (combineFnTable != null && !(collect instanceof PGroupedTableImpl)) {
+ combineFnTable = null;
+ } else if (collect instanceof DoTableImpl && ((DoTableImpl<?, ?>) collect).hasCombineFn()) {
+ combineFnTable = (DoTableImpl<?, ?>) collect;
+ }
+ if (!nodes.containsKey(collect)) {
+ nodes.put(collect, collect.createDoNode());
+ }
+ DoNode parent = nodes.get(collect);
+ parent.addChild(working);
+ working = parent;
+ }
+ return working;
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/MSCROutputHandler.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/MSCROutputHandler.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/MSCROutputHandler.java
new file mode 100644
index 0000000..36c565e
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/MSCROutputHandler.java
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.plan;
+
+import java.util.Map;
+
+import org.apache.crunch.Target;
+import org.apache.crunch.io.MapReduceTarget;
+import org.apache.crunch.io.OutputHandler;
+import org.apache.crunch.io.PathTarget;
+import org.apache.crunch.types.PType;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.Job;
+
+import com.google.common.collect.Maps;
+
+public class MSCROutputHandler implements OutputHandler {
+
+ private final Job job;
+ private final Path path;
+ private final boolean mapOnlyJob;
+
+ private DoNode workingNode;
+ private Map<Integer, PathTarget> multiPaths;
+ private int jobCount;
+
+ public MSCROutputHandler(Job job, Path outputPath, boolean mapOnlyJob) {
+ this.job = job;
+ this.path = outputPath;
+ this.mapOnlyJob = mapOnlyJob;
+ this.multiPaths = Maps.newHashMap();
+ }
+
+ public void configureNode(DoNode node, Target target) {
+ workingNode = node;
+ target.accept(this, node.getPType());
+ }
+
+ public boolean configure(Target target, PType<?> ptype) {
+ if (target instanceof MapReduceTarget) {
+ if (target instanceof PathTarget) {
+ multiPaths.put(jobCount, (PathTarget) target);
+ }
+
+ String name = PlanningParameters.MULTI_OUTPUT_PREFIX + jobCount;
+ jobCount++;
+ workingNode.setOutputName(name);
+ ((MapReduceTarget) target).configureForMapReduce(job, ptype, path, name);
+ return true;
+ }
+
+ return false;
+ }
+
+ public boolean isMapOnlyJob() {
+ return mapOnlyJob;
+ }
+
+ public Map<Integer, PathTarget> getMultiPaths() {
+ return multiPaths;
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/MSCRPlanner.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/MSCRPlanner.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/MSCRPlanner.java
new file mode 100644
index 0000000..3e1de38
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/MSCRPlanner.java
@@ -0,0 +1,378 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.plan;
+
+import java.io.IOException;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+import java.util.TreeMap;
+
+import org.apache.crunch.SourceTarget;
+import org.apache.crunch.Target;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.impl.mr.collect.InputCollection;
+import org.apache.crunch.impl.mr.collect.PCollectionImpl;
+import org.apache.crunch.impl.mr.collect.PGroupedTableImpl;
+import org.apache.crunch.impl.mr.exec.MRExecutor;
+import org.apache.crunch.materialize.MaterializableIterable;
+import org.apache.hadoop.conf.Configuration;
+
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Multimap;
+import com.google.common.collect.Sets;
+
+public class MSCRPlanner {
+
+ private final MRPipeline pipeline;
+ private final Map<PCollectionImpl<?>, Set<Target>> outputs;
+ private final Map<PCollectionImpl<?>, MaterializableIterable> toMaterialize;
+ private int lastJobID = 0;
+
+ public MSCRPlanner(MRPipeline pipeline, Map<PCollectionImpl<?>, Set<Target>> outputs,
+ Map<PCollectionImpl<?>, MaterializableIterable> toMaterialize) {
+ this.pipeline = pipeline;
+ this.outputs = new TreeMap<PCollectionImpl<?>, Set<Target>>(DEPTH_COMPARATOR);
+ this.outputs.putAll(outputs);
+ this.toMaterialize = toMaterialize;
+ }
+
+ // Used to ensure that we always build pipelines starting from the deepest
+ // outputs, which helps ensure that we handle intermediate outputs correctly.
+ private static final Comparator<PCollectionImpl<?>> DEPTH_COMPARATOR = new Comparator<PCollectionImpl<?>>() {
+ @Override
+ public int compare(PCollectionImpl<?> left, PCollectionImpl<?> right) {
+ int cmp = right.getDepth() - left.getDepth();
+ if (cmp == 0) {
+ // Ensure we don't throw away two output collections at the same depth.
+ // Using the collection name would be nicer here, but names aren't
+ // necessarily unique.
+ cmp = new Integer(right.hashCode()).compareTo(left.hashCode());
+ }
+ return cmp;
+ }
+ };
+
+ public MRExecutor plan(Class<?> jarClass, Configuration conf) throws IOException {
+ Map<PCollectionImpl<?>, Set<SourceTarget<?>>> targetDeps = Maps.newTreeMap(DEPTH_COMPARATOR);
+ for (PCollectionImpl<?> pcollect : outputs.keySet()) {
+ targetDeps.put(pcollect, pcollect.getTargetDependencies());
+ }
+
+ Multimap<Vertex, JobPrototype> assignments = HashMultimap.create();
+ Multimap<PCollectionImpl<?>, Vertex> protoDependency = HashMultimap.create();
+ while (!targetDeps.isEmpty()) {
+ Set<Target> allTargets = Sets.newHashSet();
+ for (PCollectionImpl<?> pcollect : targetDeps.keySet()) {
+ allTargets.addAll(outputs.get(pcollect));
+ }
+ GraphBuilder graphBuilder = new GraphBuilder();
+
+ // Walk the current plan tree and build a graph in which the vertices are
+ // sources, targets, and GBK operations.
+ Set<PCollectionImpl<?>> currentStage = Sets.newHashSet();
+ Set<PCollectionImpl<?>> laterStage = Sets.newHashSet();
+ for (PCollectionImpl<?> output : targetDeps.keySet()) {
+ if (Sets.intersection(allTargets, targetDeps.get(output)).isEmpty()) {
+ graphBuilder.visitOutput(output);
+ currentStage.add(output);
+ } else {
+ laterStage.add(output);
+ }
+ }
+
+ Graph baseGraph = graphBuilder.getGraph();
+
+ // Create a new graph that splits up dependent GBK nodes.
+ Graph graph = prepareFinalGraph(baseGraph);
+
+ // Break the graph up into connected components.
+ List<List<Vertex>> components = graph.connectedComponents();
+
+ // For each component, we will create one or more job prototypes,
+ // depending on its profile.
+ // For dependency handling, we only need to care about which
+ // job prototype a particular GBK is assigned to.
+ for (List<Vertex> component : components) {
+ assignments.putAll(constructJobPrototypes(component));
+ }
+
+ // Add in the job dependency information here.
+ for (Map.Entry<Vertex, JobPrototype> e : assignments.entries()) {
+ JobPrototype current = e.getValue();
+ List<Vertex> parents = graph.getParents(e.getKey());
+ for (Vertex parent : parents) {
+ for (JobPrototype parentJobProto : assignments.get(parent)) {
+ current.addDependency(parentJobProto);
+ }
+ }
+ }
+
+ // Add cross-stage dependencies.
+ for (PCollectionImpl<?> output : currentStage) {
+ Set<Target> targets = outputs.get(output);
+ Vertex vertex = graph.getVertexAt(output);
+ for (PCollectionImpl<?> later : laterStage) {
+ if (!Sets.intersection(targets, targetDeps.get(later)).isEmpty()) {
+ protoDependency.put(later, vertex);
+ }
+ }
+ targetDeps.remove(output);
+ }
+ }
+
+ // Cross-job dependencies.
+ for (Entry<PCollectionImpl<?>, Vertex> pd : protoDependency.entries()) {
+ Vertex d = new Vertex(pd.getKey());
+ Vertex dj = pd.getValue();
+ for (JobPrototype parent : assignments.get(dj)) {
+ for (JobPrototype child : assignments.get(d)) {
+ child.addDependency(parent);
+ }
+ }
+ }
+
+ // Finally, construct the jobs from the prototypes and return.
+ DotfileWriter dotfileWriter = new DotfileWriter();
+ MRExecutor exec = new MRExecutor(jarClass, outputs, toMaterialize);
+ for (JobPrototype proto : Sets.newHashSet(assignments.values())) {
+ dotfileWriter.addJobPrototype(proto);
+ exec.addJob(proto.getCrunchJob(jarClass, conf, pipeline));
+ }
+
+ String planDotFile = dotfileWriter.buildDotfile();
+ exec.setPlanDotFile(planDotFile);
+ conf.set(PlanningParameters.PIPELINE_PLAN_DOTFILE, planDotFile);
+
+ return exec;
+ }
+
+ private Graph prepareFinalGraph(Graph baseGraph) {
+ Graph graph = new Graph();
+
+ for (Vertex baseVertex : baseGraph) {
+ // Add all of the vertices in the base graph, but no edges (yet).
+ graph.addVertex(baseVertex.getPCollection(), baseVertex.isOutput());
+ }
+
+ for (Edge e : baseGraph.getAllEdges()) {
+ // Add back all of the edges where neither vertex is a GBK and we do not
+ // have an output feeding into a GBK.
+ if (!(e.getHead().isGBK() && e.getTail().isGBK()) &&
+ !(e.getHead().isOutput() && e.getTail().isGBK())) {
+ Vertex head = graph.getVertexAt(e.getHead().getPCollection());
+ Vertex tail = graph.getVertexAt(e.getTail().getPCollection());
+ graph.getEdge(head, tail).addAllNodePaths(e.getNodePaths());
+ }
+ }
+
+ for (Vertex baseVertex : baseGraph) {
+ if (baseVertex.isGBK()) {
+ Vertex vertex = graph.getVertexAt(baseVertex.getPCollection());
+ for (Edge e : baseVertex.getIncomingEdges()) {
+ if (e.getHead().isOutput()) {
+ // Execute an edge split.
+ Vertex splitTail = e.getHead();
+ PCollectionImpl<?> split = splitTail.getPCollection();
+ InputCollection<?> inputNode = handleSplitTarget(split);
+ Vertex splitHead = graph.addVertex(inputNode, false);
+
+ // Divide up the node paths in the edge between the two GBK nodes so
+ // that each node is either owned by GBK1 -> newTail or newHead -> GBK2.
+ for (NodePath path : e.getNodePaths()) {
+ NodePath headPath = path.splitAt(split, splitHead.getPCollection());
+ graph.getEdge(vertex, splitTail).addNodePath(headPath);
+ graph.getEdge(splitHead, vertex).addNodePath(path);
+ }
+
+ // Note the dependency between the vertices in the graph.
+ graph.markDependency(splitHead, splitTail);
+ } else if (!e.getHead().isGBK()) {
+ Vertex newHead = graph.getVertexAt(e.getHead().getPCollection());
+ graph.getEdge(newHead, vertex).addAllNodePaths(e.getNodePaths());
+ }
+ }
+ for (Edge e : baseVertex.getOutgoingEdges()) {
+ if (!e.getTail().isGBK()) {
+ Vertex newTail = graph.getVertexAt(e.getTail().getPCollection());
+ graph.getEdge(vertex, newTail).addAllNodePaths(e.getNodePaths());
+ } else {
+ // Execute an Edge split
+ Vertex newGraphTail = graph.getVertexAt(e.getTail().getPCollection());
+ PCollectionImpl split = e.getSplit();
+ InputCollection<?> inputNode = handleSplitTarget(split);
+ Vertex splitTail = graph.addVertex(split, true);
+ Vertex splitHead = graph.addVertex(inputNode, false);
+
+ // Divide up the node paths in the edge between the two GBK nodes so
+ // that each node is either owned by GBK1 -> newTail or newHead -> GBK2.
+ for (NodePath path : e.getNodePaths()) {
+ NodePath headPath = path.splitAt(split, splitHead.getPCollection());
+ graph.getEdge(vertex, splitTail).addNodePath(headPath);
+ graph.getEdge(splitHead, newGraphTail).addNodePath(path);
+ }
+
+ // Note the dependency between the vertices in the graph.
+ graph.markDependency(splitHead, splitTail);
+ }
+ }
+ }
+ }
+
+ return graph;
+ }
+
+ private Multimap<Vertex, JobPrototype> constructJobPrototypes(List<Vertex> component) {
+ Multimap<Vertex, JobPrototype> assignment = HashMultimap.create();
+ List<Vertex> gbks = Lists.newArrayList();
+ for (Vertex v : component) {
+ if (v.isGBK()) {
+ gbks.add(v);
+ }
+ }
+
+ if (gbks.isEmpty()) {
+ HashMultimap<Target, NodePath> outputPaths = HashMultimap.create();
+ for (Vertex v : component) {
+ if (v.isInput()) {
+ for (Edge e : v.getOutgoingEdges()) {
+ for (NodePath nodePath : e.getNodePaths()) {
+ PCollectionImpl target = nodePath.tail();
+ for (Target t : outputs.get(target)) {
+ outputPaths.put(t, nodePath);
+ }
+ }
+ }
+ }
+ }
+ if (outputPaths.isEmpty()) {
+ throw new IllegalStateException("No outputs?");
+ }
+ JobPrototype prototype = JobPrototype.createMapOnlyJob(
+ ++lastJobID, outputPaths, pipeline.createTempPath());
+ for (Vertex v : component) {
+ assignment.put(v, prototype);
+ }
+ } else {
+ Set<Edge> usedEdges = Sets.newHashSet();
+ for (Vertex g : gbks) {
+ Set<NodePath> inputs = Sets.newHashSet();
+ for (Edge e : g.getIncomingEdges()) {
+ inputs.addAll(e.getNodePaths());
+ usedEdges.add(e);
+ }
+ JobPrototype prototype = JobPrototype.createMapReduceJob(
+ ++lastJobID, (PGroupedTableImpl) g.getPCollection(), inputs, pipeline.createTempPath());
+ assignment.put(g, prototype);
+ for (Edge e : g.getIncomingEdges()) {
+ assignment.put(e.getHead(), prototype);
+ usedEdges.add(e);
+ }
+ HashMultimap<Target, NodePath> outputPaths = HashMultimap.create();
+ for (Edge e : g.getOutgoingEdges()) {
+ Vertex output = e.getTail();
+ for (Target t : outputs.get(output.getPCollection())) {
+ outputPaths.putAll(t, e.getNodePaths());
+ }
+ assignment.put(output, prototype);
+ usedEdges.add(e);
+ }
+ prototype.addReducePaths(outputPaths);
+ }
+
+ // Check for any un-assigned vertices, which should be map-side outputs
+ // that we will need to run in a map-only job.
+ HashMultimap<Target, NodePath> outputPaths = HashMultimap.create();
+ Set<Vertex> orphans = Sets.newHashSet();
+ for (Vertex v : component) {
+
+ // Check if this vertex has multiple inputs but only a subset of
+ // them have already been assigned
+ boolean vertexHasUnassignedIncomingEdges = false;
+ if (v.isOutput()) {
+ for (Edge e : v.getIncomingEdges()) {
+ if (!usedEdges.contains(e)) {
+ vertexHasUnassignedIncomingEdges = true;
+ }
+ }
+ }
+
+ if (v.isOutput() && (vertexHasUnassignedIncomingEdges || !assignment.containsKey(v))) {
+ orphans.add(v);
+ for (Edge e : v.getIncomingEdges()) {
+ if (vertexHasUnassignedIncomingEdges && usedEdges.contains(e)) {
+ // We've already dealt with this incoming edge
+ continue;
+ }
+ orphans.add(e.getHead());
+ for (NodePath nodePath : e.getNodePaths()) {
+ PCollectionImpl target = nodePath.tail();
+ for (Target t : outputs.get(target)) {
+ outputPaths.put(t, nodePath);
+ }
+ }
+ }
+ }
+
+ }
+ if (!outputPaths.isEmpty()) {
+ JobPrototype prototype = JobPrototype.createMapOnlyJob(
+ ++lastJobID, outputPaths, pipeline.createTempPath());
+ for (Vertex orphan : orphans) {
+ assignment.put(orphan, prototype);
+ }
+ }
+ }
+
+ return assignment;
+ }
+
+ private InputCollection<?> handleSplitTarget(PCollectionImpl<?> splitTarget) {
+ if (!outputs.containsKey(splitTarget)) {
+ outputs.put(splitTarget, Sets.<Target> newHashSet());
+ }
+
+ SourceTarget srcTarget = null;
+ Target targetToReplace = null;
+ for (Target t : outputs.get(splitTarget)) {
+ if (t instanceof SourceTarget) {
+ srcTarget = (SourceTarget<?>) t;
+ break;
+ } else {
+ srcTarget = t.asSourceTarget(splitTarget.getPType());
+ if (srcTarget != null) {
+ targetToReplace = t;
+ break;
+ }
+ }
+ }
+ if (targetToReplace != null) {
+ outputs.get(splitTarget).remove(targetToReplace);
+ } else if (srcTarget == null) {
+ srcTarget = pipeline.createIntermediateOutput(splitTarget.getPType());
+ }
+ outputs.get(splitTarget).add(srcTarget);
+ splitTarget.materializeAt(srcTarget);
+
+ return (InputCollection<?>) pipeline.read(srcTarget);
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/NodePath.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/NodePath.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/NodePath.java
new file mode 100644
index 0000000..a090d93
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/NodePath.java
@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.plan;
+
+import java.util.Iterator;
+import java.util.LinkedList;
+
+import org.apache.crunch.impl.mr.collect.PCollectionImpl;
+
+import com.google.common.collect.Lists;
+
+class NodePath implements Iterable<PCollectionImpl<?>> {
+ private LinkedList<PCollectionImpl<?>> path;
+
+ public NodePath() {
+ this.path = Lists.newLinkedList();
+ }
+
+ public NodePath(PCollectionImpl<?> tail) {
+ this.path = Lists.newLinkedList();
+ this.path.add(tail);
+ }
+
+ public NodePath(NodePath other) {
+ this.path = Lists.newLinkedList(other.path);
+ }
+
+ public void push(PCollectionImpl<?> stage) {
+ this.path.push((PCollectionImpl<?>) stage);
+ }
+
+ public NodePath close(PCollectionImpl<?> head) {
+ this.path.push(head);
+ return this;
+ }
+
+ public Iterator<PCollectionImpl<?>> iterator() {
+ return path.iterator();
+ }
+
+ public Iterator<PCollectionImpl<?>> descendingIterator() {
+ return path.descendingIterator();
+ }
+
+ public PCollectionImpl<?> get(int index) {
+ return path.get(index);
+ }
+
+ public PCollectionImpl<?> head() {
+ return path.peekFirst();
+ }
+
+ public PCollectionImpl<?> tail() {
+ return path.peekLast();
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (other == null || !(other instanceof NodePath)) {
+ return false;
+ }
+ NodePath nodePath = (NodePath) other;
+ return path.equals(nodePath.path);
+ }
+
+ @Override
+ public int hashCode() {
+ return 17 + 37 * path.hashCode();
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ for (PCollectionImpl<?> collect : path) {
+ sb.append(collect.getName() + "|");
+ }
+ sb.deleteCharAt(sb.length() - 1);
+ return sb.toString();
+ }
+
+ public NodePath splitAt(int splitIndex, PCollectionImpl<?> newHead) {
+ NodePath top = new NodePath();
+ for (int i = 0; i <= splitIndex; i++) {
+ top.path.add(path.get(i));
+ }
+ LinkedList<PCollectionImpl<?>> nextPath = Lists.newLinkedList();
+ nextPath.add(newHead);
+ nextPath.addAll(path.subList(splitIndex + 1, path.size()));
+ path = nextPath;
+ return top;
+ }
+
+ public NodePath splitAt(PCollectionImpl split, PCollectionImpl<?> newHead) {
+ NodePath top = new NodePath();
+ int splitIndex = 0;
+ for (PCollectionImpl p : path) {
+ top.path.add(p);
+ if (p == split) {
+ break;
+ }
+ splitIndex++;
+ }
+ LinkedList<PCollectionImpl<?>> nextPath = Lists.newLinkedList();
+ nextPath.add(newHead);
+ nextPath.addAll(path.subList(splitIndex + 1, path.size()));
+ path = nextPath;
+ return top;
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/PlanningParameters.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/PlanningParameters.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/PlanningParameters.java
new file mode 100644
index 0000000..b90a911
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/PlanningParameters.java
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.plan;
+
+/**
+ * Collection of Configuration keys and various constants used when planning MapReduce jobs for a
+ * pipeline. The class only holds constants; its constructor is private.
+ */
+public class PlanningParameters {
+
+ // NOTE(review): presumably the name prefix used for multiple outputs - confirm against callers.
+ public static final String MULTI_OUTPUT_PREFIX = "out";
+
+ /** Configuration key whose value is the path of the Crunch working directory. */
+ public static final String CRUNCH_WORKING_DIRECTORY = "crunch.work.dir";
+
+ /**
+ * Configuration key under which a <a href="http://www.graphviz.org">DOT</a> file containing the
+ * pipeline job graph is stored by the planner.
+ */
+ public static final String PIPELINE_PLAN_DOTFILE = "crunch.planner.dotfile";
+
+ // Not instantiated: constants holder only.
+ private PlanningParameters() {
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/Vertex.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/Vertex.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/Vertex.java
new file mode 100644
index 0000000..f4aa668
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/plan/Vertex.java
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.plan;
+
+import java.util.List;
+import java.util.Set;
+
+import org.apache.commons.lang.builder.ReflectionToStringBuilder;
+import org.apache.commons.lang.builder.ToStringStyle;
+import org.apache.crunch.Source;
+import org.apache.crunch.impl.mr.collect.InputCollection;
+import org.apache.crunch.impl.mr.collect.PCollectionImpl;
+import org.apache.crunch.impl.mr.collect.PGroupedTableImpl;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+
+/**
+ * A graph node that wraps a single {@link PCollectionImpl} and records the {@link Edge}s that
+ * enter and leave it. Equality and hashing are delegated to the wrapped collection.
+ */
+class Vertex {
+  private final PCollectionImpl impl;
+
+  private boolean output;
+  private final Set<Edge> incoming = Sets.newHashSet();
+  private final Set<Edge> outgoing = Sets.newHashSet();
+
+  public Vertex(PCollectionImpl impl) {
+    this.impl = impl;
+  }
+
+  /** Returns the collection wrapped by this vertex. */
+  public PCollectionImpl getPCollection() {
+    return impl;
+  }
+
+  /** Returns true when the wrapped collection is an {@link InputCollection}. */
+  public boolean isInput() {
+    return impl instanceof InputCollection;
+  }
+
+  /** Returns true when the wrapped collection is a {@link PGroupedTableImpl}. */
+  public boolean isGBK() {
+    return impl instanceof PGroupedTableImpl;
+  }
+
+  /** Marks this vertex as an output; the flag cannot be cleared afterwards. */
+  public void setOutput() {
+    output = true;
+  }
+
+  /** Returns true once {@link #setOutput()} has been called. */
+  public boolean isOutput() {
+    return output;
+  }
+
+  /**
+   * Returns the source of the wrapped {@link InputCollection}, or {@code null} when this vertex
+   * is not an input.
+   */
+  public Source getSource() {
+    return isInput() ? ((InputCollection) impl).getSource() : null;
+  }
+
+  public void addIncoming(Edge edge) {
+    incoming.add(edge);
+  }
+
+  public void addOutgoing(Edge edge) {
+    outgoing.add(edge);
+  }
+
+  /**
+   * Returns the heads of all incoming edges followed by the tails of all outgoing edges, in a
+   * freshly allocated list.
+   */
+  public List<Vertex> getAllNeighbors() {
+    List<Vertex> neighbors = Lists.newArrayList();
+    for (Edge in : incoming) {
+      neighbors.add(in.getHead());
+    }
+    for (Edge out : outgoing) {
+      neighbors.add(out.getTail());
+    }
+    return neighbors;
+  }
+
+  /** Returns the union of the incoming and outgoing edge sets. */
+  public Set<Edge> getAllEdges() {
+    return Sets.union(incoming, outgoing);
+  }
+
+  /** Returns the internal set of incoming edges (not a copy). */
+  public Set<Edge> getIncomingEdges() {
+    return incoming;
+  }
+
+  /** Returns the internal set of outgoing edges (not a copy). */
+  public Set<Edge> getOutgoingEdges() {
+    return outgoing;
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    // instanceof already rejects null.
+    if (obj instanceof Vertex) {
+      return impl.equals(((Vertex) obj).impl);
+    }
+    return false;
+  }
+
+  @Override
+  public int hashCode() {
+    final int implHash = impl.hashCode();
+    return 37 * implHash + 17;
+  }
+
+  /** Reflective string form that leaves out the two edge sets. */
+  @Override
+  public String toString() {
+    ReflectionToStringBuilder builder = new ReflectionToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE);
+    builder.setExcludeFieldNames(new String[] { "outgoing", "incoming" });
+    return builder.toString();
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchCombiner.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchCombiner.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchCombiner.java
new file mode 100644
index 0000000..47a3ded
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchCombiner.java
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.run;
+
+/**
+ * A {@link CrunchReducer} that differs from its parent only in reporting
+ * {@link NodeContext#COMBINE} as its node context.
+ */
+public class CrunchCombiner extends CrunchReducer {
+
+ // Replaces the NodeContext.REDUCE value returned by CrunchReducer.
+ @Override
+ protected NodeContext getNodeContext() {
+ return NodeContext.COMBINE;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchInputFormat.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchInputFormat.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchInputFormat.java
new file mode 100644
index 0000000..eb5dd8a
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchInputFormat.java
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.run;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.crunch.io.CrunchInputs;
+import org.apache.crunch.io.FormatBundle;
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.util.ReflectionUtils;
+
+import com.google.common.collect.Lists;
+
+public class CrunchInputFormat<K, V> extends InputFormat<K, V> {
+
+ @Override
+ public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
+ List<InputSplit> splits = Lists.newArrayList();
+ Configuration base = job.getConfiguration();
+ Map<FormatBundle, Map<Integer, List<Path>>> formatNodeMap = CrunchInputs.getFormatNodeMap(job);
+
+ // First, build a map of InputFormats to Paths
+ for (Map.Entry<FormatBundle, Map<Integer, List<Path>>> entry : formatNodeMap.entrySet()) {
+ FormatBundle inputBundle = entry.getKey();
+ Configuration conf = new Configuration(base);
+ inputBundle.configure(conf);
+ Job jobCopy = new Job(conf);
+ InputFormat<?, ?> format = (InputFormat<?, ?>) ReflectionUtils.newInstance(inputBundle.getFormatClass(),
+ jobCopy.getConfiguration());
+ for (Map.Entry<Integer, List<Path>> nodeEntry : entry.getValue().entrySet()) {
+ Integer nodeIndex = nodeEntry.getKey();
+ List<Path> paths = nodeEntry.getValue();
+ FileInputFormat.setInputPaths(jobCopy, paths.toArray(new Path[paths.size()]));
+
+ // Get splits for each input path and tag with InputFormat
+ // and Mapper types by wrapping in a TaggedInputSplit.
+ List<InputSplit> pathSplits = format.getSplits(jobCopy);
+ for (InputSplit pathSplit : pathSplits) {
+ splits.add(new CrunchInputSplit(pathSplit, inputBundle.getFormatClass(),
+ nodeIndex, jobCopy.getConfiguration()));
+ }
+ }
+ }
+ return splits;
+ }
+
+ @Override
+ public RecordReader<K, V> createRecordReader(InputSplit inputSplit, TaskAttemptContext context) throws IOException,
+ InterruptedException {
+ return new CrunchRecordReader<K, V>(inputSplit, context);
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchInputSplit.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchInputSplit.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchInputSplit.java
new file mode 100644
index 0000000..b41062b
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchInputSplit.java
@@ -0,0 +1,116 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.run;
+
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.serializer.Deserializer;
+import org.apache.hadoop.io.serializer.SerializationFactory;
+import org.apache.hadoop.io.serializer.Serializer;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.util.ReflectionUtils;
+
+/**
+ * An {@link InputSplit} that wraps another split together with the input format class, node
+ * index and configuration associated with it. Instances are {@link Writable}: the wrapped
+ * split is written and read through Hadoop's {@link SerializationFactory}.
+ */
+class CrunchInputSplit extends InputSplit implements Writable {
+
+  private InputSplit inputSplit;
+  private Class<? extends InputFormat<?, ?>> inputFormatClass;
+  private int nodeIndex;
+  private Configuration conf;
+
+  /** No-arg constructor; fields are expected to be filled in by {@link #readFields(DataInput)}. */
+  public CrunchInputSplit() {
+    // default constructor
+  }
+
+  public CrunchInputSplit(
+      InputSplit inputSplit,
+      Class<? extends InputFormat<?, ?>> inputFormatClass,
+      int nodeIndex,
+      Configuration conf) {
+    this.inputSplit = inputSplit;
+    this.inputFormatClass = inputFormatClass;
+    this.nodeIndex = nodeIndex;
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  public int getNodeIndex() {
+    return nodeIndex;
+  }
+
+  public InputSplit getInputSplit() {
+    return inputSplit;
+  }
+
+  public Class<? extends InputFormat<?, ?>> getInputFormatClass() {
+    return inputFormatClass;
+  }
+
+  /** Delegates to the wrapped split. */
+  @Override
+  public long getLength() throws IOException, InterruptedException {
+    return inputSplit.getLength();
+  }
+
+  /** Delegates to the wrapped split. */
+  @Override
+  public String[] getLocations() throws IOException, InterruptedException {
+    return inputSplit.getLocations();
+  }
+
+  /**
+   * Reads the fields in the order produced by {@link #write(DataOutput)}: node index,
+   * configuration, input format class name, split class name, then the serialized split.
+   * The stream is cast to {@link DataInputStream} before being handed to the deserializer.
+   */
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    nodeIndex = in.readInt();
+    conf = new Configuration();
+    conf.readFields(in);
+    inputFormatClass = (Class<? extends InputFormat<?, ?>>) readClass(in);
+    Class<? extends InputSplit> splitClass = (Class<? extends InputSplit>) readClass(in);
+    inputSplit = (InputSplit) ReflectionUtils.newInstance(splitClass, conf);
+    SerializationFactory serializations = new SerializationFactory(conf);
+    Deserializer splitReader = serializations.getDeserializer(splitClass);
+    splitReader.open((DataInputStream) in);
+    inputSplit = (InputSplit) splitReader.deserialize(inputSplit);
+  }
+
+  /** Reads a class name from the stream and resolves it through the current configuration. */
+  private Class<?> readClass(DataInput in) throws IOException {
+    final String name = Text.readString(in);
+    try {
+      return conf.getClassByName(name);
+    } catch (ClassNotFoundException notFound) {
+      throw new RuntimeException("readObject can't find class", notFound);
+    }
+  }
+
+  /**
+   * Writes node index, configuration, input format class name, split class name and finally
+   * the wrapped split. The stream is cast to {@link DataOutputStream} before being handed to
+   * the serializer.
+   */
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeInt(nodeIndex);
+    conf.write(out);
+    Text.writeString(out, inputFormatClass.getName());
+    Text.writeString(out, inputSplit.getClass().getName());
+    SerializationFactory serializations = new SerializationFactory(conf);
+    Serializer splitWriter = serializations.getSerializer(inputSplit.getClass());
+    splitWriter.open((DataOutputStream) out);
+    splitWriter.serialize(inputSplit);
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchMapper.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchMapper.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchMapper.java
new file mode 100644
index 0000000..70f0b01
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchMapper.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.run;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.crunch.CrunchRuntimeException;
+import org.apache.hadoop.mapreduce.Mapper;
+
+/**
+ * Hadoop {@link Mapper} that feeds every input record to a single {@link RTNode} loaded
+ * through a {@link CrunchTaskContext} created with {@link NodeContext#MAP}.
+ */
+public class CrunchMapper extends Mapper<Object, Object, Object, Object> {
+
+  private static final Log LOG = LogFactory.getLog(CrunchMapper.class);
+
+  private RTNode node;
+  private CrunchTaskContext ctxt;
+  private boolean debug;
+
+  /**
+   * Loads the nodes for this task and selects the one to run: the only node when there is
+   * exactly one, otherwise the node at the index carried by the task's
+   * {@link CrunchInputSplit}.
+   *
+   * @throws CrunchRuntimeException if the nodes cannot be read
+   */
+  @Override
+  protected void setup(Mapper<Object, Object, Object, Object>.Context context) {
+    List<RTNode> nodes;
+    this.ctxt = new CrunchTaskContext(context, NodeContext.MAP);
+    try {
+      nodes = ctxt.getNodes();
+    } catch (IOException e) {
+      LOG.info("Crunch deserialization error", e);
+      throw new CrunchRuntimeException(e);
+    }
+    if (nodes.size() == 1) {
+      this.node = nodes.get(0);
+    } else {
+      CrunchInputSplit split = (CrunchInputSplit) context.getInputSplit();
+      this.node = nodes.get(split.getNodeIndex());
+    }
+    this.debug = ctxt.isDebugRun();
+  }
+
+  /**
+   * Passes the record to the selected node. In a debug run any exception is logged and
+   * swallowed so that processing continues; otherwise exceptions propagate.
+   */
+  @Override
+  protected void map(Object k, Object v, Mapper<Object, Object, Object, Object>.Context context) {
+    if (debug) {
+      try {
+        node.process(k, v);
+      } catch (Exception e) {
+        LOG.error("Mapper exception", e);
+      }
+    } else {
+      node.process(k, v);
+    }
+  }
+
+  /**
+   * Cleans up the node and then the task context. The task context is cleaned up even when
+   * the node's cleanup throws, so the outputs it holds are always closed.
+   */
+  @Override
+  protected void cleanup(Mapper<Object, Object, Object, Object>.Context context) {
+    try {
+      node.cleanup();
+    } finally {
+      ctxt.cleanup();
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchRecordReader.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchRecordReader.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchRecordReader.java
new file mode 100644
index 0000000..fc8fb32
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchRecordReader.java
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.run;
+
+import java.io.IOException;
+
+import org.apache.crunch.hadoop.mapreduce.TaskAttemptContextFactory;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.util.ReflectionUtils;
+
+/**
+ * A {@link RecordReader} that forwards every call to a reader created by the input format
+ * named in a {@link CrunchInputSplit}, using the configuration stored in that split.
+ */
+class CrunchRecordReader<K, V> extends RecordReader<K, V> {
+
+  private final RecordReader<K, V> delegate;
+
+  /**
+   * Instantiates the split's input format and asks it for the delegate reader.
+   *
+   * @param inputSplit must be a {@link CrunchInputSplit}
+   * @param context supplies the task attempt id for the delegate's context
+   */
+  public CrunchRecordReader(InputSplit inputSplit, final TaskAttemptContext context) throws IOException,
+      InterruptedException {
+    CrunchInputSplit wrapped = (CrunchInputSplit) inputSplit;
+    InputFormat<K, V> format = (InputFormat<K, V>) ReflectionUtils.newInstance(wrapped.getInputFormatClass(),
+        wrapped.getConf());
+    InputSplit innerSplit = wrapped.getInputSplit();
+    this.delegate = format.createRecordReader(innerSplit, delegateContext(wrapped, context));
+  }
+
+  /** Builds a task attempt context from the split's configuration and the caller's attempt id. */
+  private static TaskAttemptContext delegateContext(CrunchInputSplit wrapped, TaskAttemptContext context) {
+    return TaskAttemptContextFactory.create(wrapped.getConf(), context.getTaskAttemptID());
+  }
+
+  @Override
+  public void close() throws IOException {
+    delegate.close();
+  }
+
+  @Override
+  public K getCurrentKey() throws IOException, InterruptedException {
+    return delegate.getCurrentKey();
+  }
+
+  @Override
+  public V getCurrentValue() throws IOException, InterruptedException {
+    return delegate.getCurrentValue();
+  }
+
+  @Override
+  public float getProgress() throws IOException, InterruptedException {
+    return delegate.getProgress();
+  }
+
+  /** Initializes the delegate with the wrapped split and a context built from the split's configuration. */
+  @Override
+  public void initialize(InputSplit inputSplit, TaskAttemptContext context) throws IOException, InterruptedException {
+    CrunchInputSplit wrapped = (CrunchInputSplit) inputSplit;
+    InputSplit innerSplit = wrapped.getInputSplit();
+    delegate.initialize(innerSplit, delegateContext(wrapped, context));
+  }
+
+  @Override
+  public boolean nextKeyValue() throws IOException, InterruptedException {
+    return delegate.nextKeyValue();
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchReducer.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchReducer.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchReducer.java
new file mode 100644
index 0000000..e5ddbd2
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchReducer.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.run;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.crunch.CrunchRuntimeException;
+import org.apache.crunch.impl.SingleUseIterable;
+import org.apache.hadoop.mapreduce.Reducer;
+
+/**
+ * Hadoop {@link Reducer} that feeds every key and its values to the first {@link RTNode}
+ * loaded through a {@link CrunchTaskContext}. Subclasses change the node context by
+ * overriding {@link #getNodeContext()}.
+ */
+public class CrunchReducer extends Reducer<Object, Object, Object, Object> {
+
+  private static final Log LOG = LogFactory.getLog(CrunchReducer.class);
+
+  private RTNode node;
+  private CrunchTaskContext ctxt;
+  private boolean debug;
+
+  /** Returns the node context used to load this task's nodes; {@code REDUCE} here. */
+  protected NodeContext getNodeContext() {
+    return NodeContext.REDUCE;
+  }
+
+  /**
+   * Loads the nodes for this task and keeps the first one.
+   *
+   * @throws CrunchRuntimeException if the nodes cannot be read
+   */
+  @Override
+  protected void setup(Reducer<Object, Object, Object, Object>.Context context) {
+    this.ctxt = new CrunchTaskContext(context, getNodeContext());
+    try {
+      List<RTNode> nodes = ctxt.getNodes();
+      this.node = nodes.get(0);
+    } catch (IOException e) {
+      LOG.info("Crunch deserialization error", e);
+      throw new CrunchRuntimeException(e);
+    }
+    this.debug = ctxt.isDebugRun();
+  }
+
+  /**
+   * Wraps the values in a {@link SingleUseIterable} and passes them to the node. In a debug
+   * run any exception is logged and swallowed; otherwise exceptions propagate.
+   */
+  @Override
+  protected void reduce(Object key, Iterable<Object> values, Reducer<Object, Object, Object, Object>.Context context) {
+    values = new SingleUseIterable<Object>(values);
+    if (debug) {
+      try {
+        node.processIterable(key, values);
+      } catch (Exception e) {
+        LOG.error("Reducer exception", e);
+      }
+    } else {
+      node.processIterable(key, values);
+    }
+  }
+
+  /**
+   * Cleans up the node and then the task context. The task context is cleaned up even when
+   * the node's cleanup throws, so the outputs it holds are always closed.
+   */
+  @Override
+  protected void cleanup(Reducer<Object, Object, Object, Object>.Context context) {
+    try {
+      node.cleanup();
+    } finally {
+      ctxt.cleanup();
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchTaskContext.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchTaskContext.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchTaskContext.java
new file mode 100644
index 0000000..c4f2873
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/CrunchTaskContext.java
@@ -0,0 +1,86 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.run;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.crunch.CrunchRuntimeException;
+import org.apache.crunch.impl.mr.plan.PlanningParameters;
+import org.apache.crunch.io.CrunchOutputs;
+import org.apache.crunch.util.DistCache;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.TaskInputOutputContext;
+
+/**
+ * Per-task helper that pairs the Hadoop task context with a {@link NodeContext}, loads the
+ * {@link RTNode}s for the task and lazily creates the {@link CrunchOutputs} they share.
+ */
+class CrunchTaskContext {
+
+  private final TaskInputOutputContext<Object, Object, Object, Object> taskContext;
+  private final NodeContext nodeContext;
+  private CrunchOutputs<Object, Object> multipleOutputs;
+
+  public CrunchTaskContext(TaskInputOutputContext<Object, Object, Object, Object> taskContext, NodeContext nodeContext) {
+    this.taskContext = taskContext;
+    this.nodeContext = nodeContext;
+  }
+
+  public TaskInputOutputContext<Object, Object, Object, Object> getContext() {
+    return taskContext;
+  }
+
+  public NodeContext getNodeContext() {
+    return nodeContext;
+  }
+
+  /**
+   * Reads the node list stored under {@code <crunch.work.dir>/<node context name>} and
+   * initializes each node with this context.
+   *
+   * @return whatever {@link DistCache#read} returned, which this method allows to be
+   *         {@code null}
+   * @throws IOException if reading fails
+   */
+  public List<RTNode> getNodes() throws IOException {
+    Configuration conf = taskContext.getConfiguration();
+    Path path = new Path(new Path(conf.get(PlanningParameters.CRUNCH_WORKING_DIRECTORY)), nodeContext.toString());
+    @SuppressWarnings("unchecked")
+    List<RTNode> nodes = (List<RTNode>) DistCache.read(conf, path);
+    if (nodes != null) {
+      for (RTNode node : nodes) {
+        node.initialize(this);
+      }
+    }
+    return nodes;
+  }
+
+  /** Returns the boolean value of {@link RuntimeParameters#DEBUG}, defaulting to false. */
+  public boolean isDebugRun() {
+    Configuration conf = taskContext.getConfiguration();
+    return conf.getBoolean(RuntimeParameters.DEBUG, false);
+  }
+
+  /**
+   * Closes the multiple outputs if they were ever created.
+   *
+   * @throws CrunchRuntimeException wrapping any {@link IOException} or
+   *         {@link InterruptedException} raised while closing
+   */
+  public void cleanup() {
+    if (multipleOutputs != null) {
+      try {
+        multipleOutputs.close();
+      } catch (IOException e) {
+        throw new CrunchRuntimeException(e);
+      } catch (InterruptedException e) {
+        // Restore the interrupt flag so code further up the stack can still observe it.
+        Thread.currentThread().interrupt();
+        throw new CrunchRuntimeException(e);
+      }
+    }
+  }
+
+  /** Returns the shared {@link CrunchOutputs}, creating it on first use. */
+  public CrunchOutputs<Object, Object> getMultipleOutputs() {
+    if (multipleOutputs == null) {
+      multipleOutputs = new CrunchOutputs<Object, Object>(taskContext);
+    }
+    return multipleOutputs;
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/NodeContext.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/NodeContext.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/NodeContext.java
new file mode 100644
index 0000000..ffc9e7c
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/NodeContext.java
@@ -0,0 +1,35 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.run;
+
+import org.apache.crunch.impl.mr.plan.DoNode;
+
+/**
+ * Enum that is associated with a serialized {@link DoNode} instance, so we know
+ * how to use it within the context of a particular MR job.
+ */
+public enum NodeContext {
+  MAP,
+  REDUCE,
+  COMBINE;
+
+  /**
+   * Returns {@code "crunch.donode."} followed by the lower-cased constant name, for example
+   * {@code "crunch.donode.combine"}.
+   */
+  public String getConfigurationKey() {
+    // Lower-case with a fixed locale: under the default locale the result could differ per
+    // machine (e.g. a Turkish locale maps 'I' to a dotless 'i', giving "comb\u0131ne").
+    return "crunch.donode." + toString().toLowerCase(java.util.Locale.ENGLISH);
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/RTNode.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/RTNode.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/RTNode.java
new file mode 100644
index 0000000..ce7b795
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/RTNode.java
@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.run;
+
+import java.io.Serializable;
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.crunch.CrunchRuntimeException;
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.impl.mr.emit.IntermediateEmitter;
+import org.apache.crunch.impl.mr.emit.MultipleOutputEmitter;
+import org.apache.crunch.impl.mr.emit.OutputEmitter;
+import org.apache.crunch.types.Converter;
+import org.apache.crunch.types.PType;
+
+/**
+ * Serializable runtime node that applies a {@link DoFn} to its inputs and forwards the
+ * results through an {@link Emitter} chosen in {@link #initialize(CrunchTaskContext)}: to a
+ * named output, to the task's output, or to child nodes.
+ */
+public class RTNode implements Serializable {
+
+  private static final Log LOG = LogFactory.getLog(RTNode.class);
+
+  private final String nodeName;
+  private DoFn<Object, Object> fn;
+  private PType<Object> outputPType;
+  private final List<RTNode> children;
+  private final Converter inputConverter;
+  private final Converter outputConverter;
+  private final String outputName;
+
+  // Not serialized; created by initialize(). A non-null value marks the node as initialized.
+  private transient Emitter<Object> emitter;
+
+  public RTNode(DoFn<Object, Object> fn, PType<Object> outputPType, String name, List<RTNode> children,
+      Converter inputConverter,
+      Converter outputConverter, String outputName) {
+    this.fn = fn;
+    this.outputPType = outputPType;
+    this.nodeName = name;
+    this.children = children;
+    this.inputConverter = inputConverter;
+    this.outputConverter = outputConverter;
+    this.outputName = outputName;
+  }
+
+  /**
+   * Initializes the function, all children and the emitter. Calling it again after the
+   * emitter has been created is a no-op.
+   *
+   * @throws CrunchRuntimeException if the node has neither an output converter nor children
+   */
+  public void initialize(CrunchTaskContext ctxt) {
+    if (emitter != null) {
+      // Already initialized
+      return;
+    }
+
+    fn.setContext(ctxt.getContext());
+    fn.initialize();
+    for (RTNode child : children) {
+      child.initialize(ctxt);
+    }
+
+    if (outputConverter != null) {
+      if (outputName != null) {
+        this.emitter = new MultipleOutputEmitter(outputConverter, ctxt.getMultipleOutputs(),
+            outputName);
+      } else {
+        this.emitter = new OutputEmitter(outputConverter, ctxt.getContext());
+      }
+    } else if (!children.isEmpty()) {
+      this.emitter = new IntermediateEmitter(outputPType, children,
+          ctxt.getContext().getConfiguration());
+    } else {
+      throw new CrunchRuntimeException("Invalid RTNode config: no emitter for: " + nodeName);
+    }
+  }
+
+  /** Returns true when the node has an output converter and no children. */
+  public boolean isLeafNode() {
+    return outputConverter != null && children.isEmpty();
+  }
+
+  /**
+   * Runs the function on one input. A {@link CrunchRuntimeException} is logged once (guarded
+   * by its logged flag) and then rethrown.
+   */
+  public void process(Object input) {
+    try {
+      fn.process(input, emitter);
+    } catch (CrunchRuntimeException e) {
+      if (!e.wasLogged()) {
+        // Pass input itself rather than input.toString(): %s renders null as "null", whereas
+        // calling toString() on a null input would throw here and hide the original exception.
+        LOG.info(String.format("Crunch exception in '%s' for input: %s", nodeName, input), e);
+        e.markLogged();
+      }
+      throw e;
+    }
+  }
+
+  /** Converts a key/value pair with the input converter and processes the result. */
+  public void process(Object key, Object value) {
+    process(inputConverter.convertInput(key, value));
+  }
+
+  /** Converts a key and its values with the input converter and processes the result. */
+  public void processIterable(Object key, Iterable values) {
+    process(inputConverter.convertIterableInput(key, values));
+  }
+
+  /** Cleans up the function, flushes the emitter, then cleans up every child in order. */
+  public void cleanup() {
+    fn.cleanup(emitter);
+    emitter.flush();
+    for (RTNode child : children) {
+      child.cleanup();
+    }
+  }
+
+  @Override
+  public String toString() {
+    return "RTNode [nodeName=" + nodeName + ", fn=" + fn + ", children=" + children + ", inputConverter="
+        + inputConverter + ", outputConverter=" + outputConverter + ", outputName=" + outputName + "]";
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/RuntimeParameters.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/RuntimeParameters.java b/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/RuntimeParameters.java
new file mode 100644
index 0000000..604c49c
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/impl/mr/run/RuntimeParameters.java
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.impl.mr.run;
+
+/**
+ * Names of the configuration parameters used during runtime execution.
+ *
+ * <p>This class only holds constants and cannot be instantiated.</p>
+ */
+public class RuntimeParameters {
+
+  // Constant holder: not instantiated.
+  private RuntimeParameters() {
+  }
+
+  /** Key {@code crunch.aggregator.buckets}; presumably sizes aggregator buckets -- confirm at usages. */
+  public static final String AGGREGATOR_BUCKETS = "crunch.aggregator.buckets";
+
+  /** Key {@code crunch.debug}; presumably toggles debug behaviour -- confirm at usages. */
+  public static final String DEBUG = "crunch.debug";
+
+  /** Key {@code crunch.tmp.dir}; presumably the temporary directory location -- confirm at usages. */
+  public static final String TMP_DIR = "crunch.tmp.dir";
+
+  /** Key {@code crunch.log.job.progress}; presumably enables job progress logging -- confirm at usages. */
+  public static final String LOG_JOB_PROGRESS = "crunch.log.job.progress";
+
+  /** Key {@code mapreduce.jobcontrol.createdir.ifnotexist}; note it is not in the {@code crunch.} namespace. */
+  public static final String CREATE_DIR = "mapreduce.jobcontrol.createdir.ifnotexist";
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/At.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/At.java b/crunch-core/src/main/java/org/apache/crunch/io/At.java
new file mode 100644
index 0000000..a6f0782
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/At.java
@@ -0,0 +1,257 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io;
+
+import org.apache.avro.specific.SpecificRecord;
+import org.apache.crunch.SourceTarget;
+import org.apache.crunch.TableSourceTarget;
+import org.apache.crunch.io.avro.AvroFileSourceTarget;
+import org.apache.crunch.io.seq.SeqFileSourceTarget;
+import org.apache.crunch.io.seq.SeqFileTableSourceTarget;
+import org.apache.crunch.io.text.TextFileSourceTarget;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.crunch.types.avro.AvroType;
+import org.apache.crunch.types.avro.Avros;
+import org.apache.crunch.types.writable.Writables;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Writable;
+
+/**
+ * <p>Static factory methods for creating common {@link SourceTarget} types, which may be
+ * treated as both a {@code Source} and a {@code Target}.</p>
+ *
+ * <p>The {@code At} methods are analogous to the {@link From} and {@link To} factory methods,
+ * but are used for storing intermediate outputs that need to be passed from one run of a
+ * MapReduce pipeline to another run. The {@code SourceTarget} object acts as both a
+ * {@code Source} and a {@code Target}, which enables it to provide this functionality.</p>
+ *
+ * <pre>{@code
+ *   Pipeline pipeline = new MRPipeline(this.getClass());
+ *   // Create our intermediate storage location
+ *   SourceTarget<String> intermediate = At.textFile("/temptext");
+ *   ...
+ *   // Write out the output of the first phase of a pipeline.
+ *   pipeline.write(phase1, intermediate);
+ *
+ *   // Explicitly call run to kick off the pipeline.
+ *   pipeline.run();
+ *
+ *   // And then kick off a second phase by consuming the output
+ *   // from the first phase.
+ *   PCollection<String> phase2Input = pipeline.read(intermediate);
+ *   ...
+ * }</pre>
+ *
+ * <p>The {@code SourceTarget} abstraction is useful when we care about reading the
+ * intermediate outputs of a pipeline as well as the final results.</p>
+ */
+public class At {
+
+  /**
+   * Builds a {@code SourceTarget<T>} for the Avro file(s) found under a path name.
+   *
+   * @param pathName The name of the path to the data on the filesystem
+   * @param avroClass The subclass of {@code SpecificRecord} to use for the Avro file
+   * @return A new {@code SourceTarget<T>} instance
+   */
+  public static <T extends SpecificRecord> SourceTarget<T> avroFile(String pathName, Class<T> avroClass) {
+    Path path = new Path(pathName);
+    return avroFile(path, avroClass);
+  }
+
+  /**
+   * Builds a {@code SourceTarget<T>} for the Avro file(s) found under a {@code Path}.
+   *
+   * @param path The {@code Path} to the data
+   * @param avroClass The subclass of {@code SpecificRecord} to use for the Avro file
+   * @return A new {@code SourceTarget<T>} instance
+   */
+  public static <T extends SpecificRecord> SourceTarget<T> avroFile(Path path, Class<T> avroClass) {
+    AvroType<T> avroType = Avros.specifics(avroClass);
+    return avroFile(path, avroType);
+  }
+
+  /**
+   * Builds a {@code SourceTarget<T>} for the Avro file(s) found under a path name.
+   *
+   * @param pathName The name of the path to the data on the filesystem
+   * @param avroType The {@code AvroType} for the Avro records
+   * @return A new {@code SourceTarget<T>} instance
+   */
+  public static <T> SourceTarget<T> avroFile(String pathName, AvroType<T> avroType) {
+    Path path = new Path(pathName);
+    return avroFile(path, avroType);
+  }
+
+  /**
+   * Builds a {@code SourceTarget<T>} for the Avro file(s) found under a {@code Path}.
+   *
+   * @param path The {@code Path} to the data
+   * @param avroType The {@code AvroType} for the Avro records
+   * @return A new {@code SourceTarget<T>} instance
+   */
+  public static <T> SourceTarget<T> avroFile(Path path, AvroType<T> avroType) {
+    return new AvroFileSourceTarget<T>(path, avroType);
+  }
+
+  /**
+   * Builds a {@code SourceTarget<T>} over the value field of each key-value pair in the
+   * SequenceFile(s) found under a path name.
+   *
+   * @param pathName The name of the path to the data on the filesystem
+   * @param valueClass The {@code Writable} type for the value of the SequenceFile entry
+   * @return A new {@code SourceTarget<T>} instance
+   */
+  public static <T extends Writable> SourceTarget<T> sequenceFile(String pathName, Class<T> valueClass) {
+    Path path = new Path(pathName);
+    return sequenceFile(path, valueClass);
+  }
+
+  /**
+   * Builds a {@code SourceTarget<T>} over the value field of each key-value pair in the
+   * SequenceFile(s) found under a {@code Path}.
+   *
+   * @param path The {@code Path} to the data
+   * @param valueClass The {@code Writable} type for the value of the SequenceFile entry
+   * @return A new {@code SourceTarget<T>} instance
+   */
+  public static <T extends Writable> SourceTarget<T> sequenceFile(Path path, Class<T> valueClass) {
+    PType<T> valueType = Writables.writables(valueClass);
+    return sequenceFile(path, valueType);
+  }
+
+  /**
+   * Builds a {@code SourceTarget<T>} over the value field of each key-value pair in the
+   * SequenceFile(s) found under a path name.
+   *
+   * @param pathName The name of the path to the data on the filesystem
+   * @param ptype The {@code PType} for the value of the SequenceFile entry
+   * @return A new {@code SourceTarget<T>} instance
+   */
+  public static <T> SourceTarget<T> sequenceFile(String pathName, PType<T> ptype) {
+    Path path = new Path(pathName);
+    return sequenceFile(path, ptype);
+  }
+
+  /**
+   * Builds a {@code SourceTarget<T>} over the value field of each key-value pair in the
+   * SequenceFile(s) found under a {@code Path}.
+   *
+   * @param path The {@code Path} to the data
+   * @param ptype The {@code PType} for the value of the SequenceFile entry
+   * @return A new {@code SourceTarget<T>} instance
+   */
+  public static <T> SourceTarget<T> sequenceFile(Path path, PType<T> ptype) {
+    return new SeqFileSourceTarget<T>(path, ptype);
+  }
+
+  /**
+   * Builds a {@code TableSourceTarget<K, V>} over the key-value pairs in the SequenceFile(s)
+   * found under a path name.
+   *
+   * @param pathName The name of the path to the data on the filesystem
+   * @param keyClass The {@code Writable} type for the key of the SequenceFile entry
+   * @param valueClass The {@code Writable} type for the value of the SequenceFile entry
+   * @return A new {@code TableSourceTarget<K, V>} instance
+   */
+  public static <K extends Writable, V extends Writable> TableSourceTarget<K, V> sequenceFile(
+      String pathName, Class<K> keyClass, Class<V> valueClass) {
+    Path path = new Path(pathName);
+    return sequenceFile(path, keyClass, valueClass);
+  }
+
+  /**
+   * Builds a {@code TableSourceTarget<K, V>} over the key-value pairs in the SequenceFile(s)
+   * found under a {@code Path}.
+   *
+   * @param path The {@code Path} to the data
+   * @param keyClass The {@code Writable} type for the key of the SequenceFile entry
+   * @param valueClass The {@code Writable} type for the value of the SequenceFile entry
+   * @return A new {@code TableSourceTarget<K, V>} instance
+   */
+  public static <K extends Writable, V extends Writable> TableSourceTarget<K, V> sequenceFile(
+      Path path, Class<K> keyClass, Class<V> valueClass) {
+    PType<K> keyType = Writables.writables(keyClass);
+    PType<V> valueType = Writables.writables(valueClass);
+    return sequenceFile(path, keyType, valueType);
+  }
+
+  /**
+   * Builds a {@code TableSourceTarget<K, V>} over the key-value pairs in the SequenceFile(s)
+   * found under a path name.
+   *
+   * @param pathName The name of the path to the data on the filesystem
+   * @param keyType The {@code PType} for the key of the SequenceFile entry
+   * @param valueType The {@code PType} for the value of the SequenceFile entry
+   * @return A new {@code TableSourceTarget<K, V>} instance
+   */
+  public static <K, V> TableSourceTarget<K, V> sequenceFile(String pathName, PType<K> keyType, PType<V> valueType) {
+    Path path = new Path(pathName);
+    return sequenceFile(path, keyType, valueType);
+  }
+
+  /**
+   * Builds a {@code TableSourceTarget<K, V>} over the key-value pairs in the SequenceFile(s)
+   * found under a {@code Path}. The table type is obtained from the key type's family.
+   *
+   * @param path The {@code Path} to the data
+   * @param keyType The {@code PType} for the key of the SequenceFile entry
+   * @param valueType The {@code PType} for the value of the SequenceFile entry
+   * @return A new {@code TableSourceTarget<K, V>} instance
+   */
+  public static <K, V> TableSourceTarget<K, V> sequenceFile(Path path, PType<K> keyType, PType<V> valueType) {
+    return new SeqFileTableSourceTarget<K, V>(path, keyType.getFamily().tableOf(keyType, valueType));
+  }
+
+  /**
+   * Builds a {@code SourceTarget<String>} for the text file(s) found under a path name.
+   *
+   * @param pathName The name of the path to the data on the filesystem
+   * @return A new {@code SourceTarget<String>} instance
+   */
+  public static SourceTarget<String> textFile(String pathName) {
+    Path path = new Path(pathName);
+    return textFile(path);
+  }
+
+  /**
+   * Builds a {@code SourceTarget<String>} for the text file(s) found under a {@code Path}.
+   *
+   * @param path The {@code Path} to the data
+   * @return A new {@code SourceTarget<String>} instance
+   */
+  public static SourceTarget<String> textFile(Path path) {
+    PType<String> strings = Writables.strings();
+    return textFile(path, strings);
+  }
+
+  /**
+   * Builds a {@code SourceTarget<T>} for the text file(s) found under a path name, using the
+   * provided {@code PType<T>} to convert the input text.
+   *
+   * @param pathName The name of the path to the data on the filesystem
+   * @param ptype The {@code PType<T>} to use to process the input text
+   * @return A new {@code SourceTarget<T>} instance
+   */
+  public static <T> SourceTarget<T> textFile(String pathName, PType<T> ptype) {
+    Path path = new Path(pathName);
+    return textFile(path, ptype);
+  }
+
+  /**
+   * Builds a {@code SourceTarget<T>} for the text file(s) found under a {@code Path}, using the
+   * provided {@code PType<T>} to convert the input text.
+   *
+   * @param path The {@code Path} to the data
+   * @param ptype The {@code PType<T>} to use to process the input text
+   * @return A new {@code SourceTarget<T>} instance
+   */
+  public static <T> SourceTarget<T> textFile(Path path, PType<T> ptype) {
+    return new TextFileSourceTarget<T>(path, ptype);
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/CompositePathIterable.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/CompositePathIterable.java b/crunch-core/src/main/java/org/apache/crunch/io/CompositePathIterable.java
new file mode 100644
index 0000000..a4723e9
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/CompositePathIterable.java
@@ -0,0 +1,102 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Iterator;
+
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+
+import com.google.common.collect.UnmodifiableIterator;
+
+/**
+ * An {@code Iterable} that concatenates the records read from every file listed under a path,
+ * skipping files whose names start with an underscore.
+ *
+ * @param <T> the type of record produced by the reader factory
+ */
+public class CompositePathIterable<T> implements Iterable<T> {
+
+  private final FileStatus[] stati;
+  private final FileSystem fs;
+  private final FileReaderFactory<T> readerFactory;
+
+  // Rejects any path whose final name component starts with "_".
+  private static final PathFilter FILTER = new PathFilter() {
+    @Override
+    public boolean accept(Path path) {
+      return !path.getName().startsWith("_");
+    }
+  };
+
+  /**
+   * Creates an iterable over the records of all accepted files under {@code path}.
+   *
+   * @param fs the filesystem holding the data
+   * @param path the location to list
+   * @param readerFactory turns each listed file into an iterator of records
+   * @return an empty list when no file passes the filter, otherwise a composite iterable
+   * @throws IOException if the path does not exist or cannot be listed
+   */
+  public static <S> Iterable<S> create(FileSystem fs, Path path, FileReaderFactory<S> readerFactory) throws IOException {
+
+    if (!fs.exists(path)) {
+      throw new IOException("No files found to materialize at: " + path);
+    }
+
+    FileStatus[] stati = null;
+    try {
+      stati = fs.listStatus(path, FILTER);
+    } catch (FileNotFoundException e) {
+      // Treated the same as a null listing: reported below as a missing path.
+      stati = null;
+    }
+    if (stati == null) {
+      throw new IOException("No files found to materialize at: " + path);
+    }
+
+    if (stati.length == 0) {
+      return Collections.emptyList();
+    } else {
+      return new CompositePathIterable<S>(stati, fs, readerFactory);
+    }
+
+  }
+
+  // Callers go through create(), which guarantees a non-empty stati array.
+  private CompositePathIterable(FileStatus[] stati, FileSystem fs, FileReaderFactory<T> readerFactory) {
+    this.stati = stati;
+    this.fs = fs;
+    this.readerFactory = readerFactory;
+  }
+
+  /**
+   * Returns an iterator that reads the listed files one after another, opening each file's
+   * reader only when the previous one is exhausted.
+   */
+  @Override
+  public Iterator<T> iterator() {
+
+    return new UnmodifiableIterator<T>() {
+      private int index = 0;
+      private Iterator<T> iter = readerFactory.read(fs, stati[index++].getPath());
+
+      @Override
+      public boolean hasNext() {
+        if (!iter.hasNext()) {
+          // Current file is exhausted: move on until a file with data is found.
+          while (index < stati.length) {
+            iter = readerFactory.read(fs, stati[index++].getPath());
+            if (iter.hasNext()) {
+              return true;
+            }
+          }
+          return false;
+        }
+        return true;
+      }
+
+      @Override
+      public T next() {
+        // Advance across file boundaries here as well, so that next() works without a
+        // preceding hasNext() call, as the Iterator contract requires.
+        if (!hasNext()) {
+          throw new java.util.NoSuchElementException();
+        }
+        return iter.next();
+      }
+    };
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/io/CrunchInputs.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/io/CrunchInputs.java b/crunch-core/src/main/java/org/apache/crunch/io/CrunchInputs.java
new file mode 100644
index 0000000..d154db2
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/io/CrunchInputs.java
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.io;
+
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.JobContext;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Splitter;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
+/**
+ * Helper functions for configuring multiple {@code InputFormat} instances within a single
+ * Crunch MapReduce job.
+ *
+ * <p>Inputs are stored in the job configuration under {@link #CRUNCH_INPUTS} as records
+ * separated by {@code ','}; each record holds the serialized format bundle, the node index and
+ * the path, separated by {@code ';'}.</p>
+ */
+public class CrunchInputs {
+  public static final String CRUNCH_INPUTS = "crunch.inputs.dir";
+
+  private static final char RECORD_SEP = ',';
+  private static final char FIELD_SEP = ';';
+  private static final Joiner JOINER = Joiner.on(FIELD_SEP);
+  private static final Splitter SPLITTER = Splitter.on(FIELD_SEP);
+  // Built once instead of on every getFormatNodeMap call.
+  private static final Splitter RECORD_SPLITTER = Splitter.on(RECORD_SEP);
+
+  /**
+   * Appends one input record to the job's configuration.
+   *
+   * @param job the job whose configuration is updated
+   * @param path the input path
+   * @param inputBundle the format bundle used to read {@code path}
+   * @param nodeIndex the index of the node that consumes this input
+   */
+  public static void addInputPath(Job job, Path path, FormatBundle inputBundle, int nodeIndex) {
+    Configuration conf = job.getConfiguration();
+    String inputs = JOINER.join(inputBundle.serialize(), String.valueOf(nodeIndex), path.toString());
+    String existing = conf.get(CRUNCH_INPUTS);
+    conf.set(CRUNCH_INPUTS, existing == null ? inputs : existing + RECORD_SEP + inputs);
+  }
+
+  /**
+   * Parses the records stored under {@link #CRUNCH_INPUTS} back into a map from format bundle
+   * to node index to the list of paths registered for that pair.
+   *
+   * @param job the job context whose configuration is read
+   * @return the parsed map; empty when no inputs have been registered
+   */
+  public static Map<FormatBundle, Map<Integer, List<Path>>> getFormatNodeMap(JobContext job) {
+    Map<FormatBundle, Map<Integer, List<Path>>> formatNodeMap = Maps.newHashMap();
+    Configuration conf = job.getConfiguration();
+    String serialized = conf.get(CRUNCH_INPUTS);
+    if (serialized == null || serialized.isEmpty()) {
+      // Nothing registered: splitting null would throw, and an empty string would yield a
+      // single bogus record with missing fields.
+      return formatNodeMap;
+    }
+    for (String input : RECORD_SPLITTER.split(serialized)) {
+      List<String> fields = Lists.newArrayList(SPLITTER.split(input));
+      FormatBundle<InputFormat> inputBundle = FormatBundle.fromSerialized(fields.get(0), InputFormat.class);
+      if (!formatNodeMap.containsKey(inputBundle)) {
+        formatNodeMap.put(inputBundle, Maps.<Integer, List<Path>> newHashMap());
+      }
+      Map<Integer, List<Path>> nodeMap = formatNodeMap.get(inputBundle);
+      Integer nodeIndex = Integer.valueOf(fields.get(1));
+      if (!nodeMap.containsKey(nodeIndex)) {
+        nodeMap.put(nodeIndex, Lists.<Path> newLinkedList());
+      }
+      nodeMap.get(nodeIndex).add(new Path(fields.get(2)));
+    }
+    return formatNodeMap;
+  }
+
+}
[27/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/PTypeUtils.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/PTypeUtils.java b/crunch-core/src/main/java/org/apache/crunch/types/PTypeUtils.java
new file mode 100644
index 0000000..e61b98b
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/PTypeUtils.java
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types;
+
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.crunch.Pair;
+import org.apache.crunch.Tuple;
+import org.apache.crunch.Tuple3;
+import org.apache.crunch.Tuple4;
+import org.apache.crunch.TupleN;
+
+/**
+ * Utilities for converting between {@code PType}s from different
+ * {@code PTypeFamily} implementations.
+ */
+public class PTypeUtils {
+
+  // Static helpers only.
+  private PTypeUtils() {
+  }
+
+  /**
+   * Re-creates {@code ptype} inside the type family {@code tf}.
+   *
+   * <p>Table types, the fixed tuple classes ({@code Pair}, {@code Tuple3}, {@code Tuple4},
+   * {@code TupleN}) and collections are rebuilt from their converted sub-types; anything else
+   * is mapped with {@code tf.records}.</p>
+   *
+   * @param ptype the type to convert
+   * @param tf the target type family
+   * @return the equivalent type in {@code tf}
+   */
+  public static <T> PType<T> convert(PType<T> ptype, PTypeFamily tf) {
+    if (ptype instanceof PTableType) {
+      PTableType tableType = (PTableType) ptype;
+      return tf.tableOf(tf.as(tableType.getKeyType()), tf.as(tableType.getValueType()));
+    }
+
+    Class<T> typeClass = ptype.getTypeClass();
+    if (Tuple.class.isAssignableFrom(typeClass)) {
+      List<PType> parts = ptype.getSubTypes();
+      if (Pair.class.equals(typeClass)) {
+        return tf.pairs(tf.as(parts.get(0)), tf.as(parts.get(1)));
+      }
+      if (Tuple3.class.equals(typeClass)) {
+        return tf.triples(tf.as(parts.get(0)), tf.as(parts.get(1)), tf.as(parts.get(2)));
+      }
+      if (Tuple4.class.equals(typeClass)) {
+        return tf.quads(tf.as(parts.get(0)), tf.as(parts.get(1)), tf.as(parts.get(2)), tf.as(parts.get(3)));
+      }
+      if (TupleN.class.equals(typeClass)) {
+        PType[] converted = new PType[parts.size()];
+        for (int pos = 0; pos < converted.length; pos++) {
+          converted[pos] = tf.as(parts.get(pos));
+        }
+        return (PType<T>) tf.tuples(converted);
+      }
+      // Any other Tuple subclass falls through to the checks below.
+    }
+
+    if (Collection.class.isAssignableFrom(typeClass)) {
+      return tf.collections(tf.as(ptype.getSubTypes().get(0)));
+    }
+    return tf.records(typeClass);
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/PTypes.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/PTypes.java b/crunch-core/src/main/java/org/apache/crunch/types/PTypes.java
new file mode 100644
index 0000000..546719c
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/PTypes.java
@@ -0,0 +1,252 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types;
+
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.util.UUID;
+
+import org.apache.crunch.CrunchRuntimeException;
+import org.apache.crunch.MapFn;
+import org.apache.hadoop.util.ReflectionUtils;
+import org.apache.thrift.TBase;
+import org.apache.thrift.TDeserializer;
+import org.apache.thrift.TException;
+import org.apache.thrift.TSerializer;
+import org.apache.thrift.protocol.TBinaryProtocol;
+import org.codehaus.jackson.map.ObjectMapper;
+
+import com.google.protobuf.InvalidProtocolBufferException;
+import com.google.protobuf.Message;
+
+/**
+ * Utility functions for creating common types of derived PTypes, e.g., for JSON
+ * data, protocol buffers, and Thrift records.
+ *
+ */
+public class PTypes {
+
+ public static PType<BigInteger> bigInt(PTypeFamily typeFamily) {
+ return typeFamily.derived(BigInteger.class, BYTE_TO_BIGINT, BIGINT_TO_BYTE, typeFamily.bytes());
+ }
+
+ public static PType<UUID> uuid(PTypeFamily ptf) {
+ return ptf.derived(UUID.class, BYTE_TO_UUID, UUID_TO_BYTE, ptf.bytes());
+ }
+
+ public static <T> PType<T> jsonString(Class<T> clazz, PTypeFamily typeFamily) {
+ return typeFamily
+ .derived(clazz, new JacksonInputMapFn<T>(clazz), new JacksonOutputMapFn<T>(), typeFamily.strings());
+ }
+
+ public static <T extends Message> PType<T> protos(Class<T> clazz, PTypeFamily typeFamily) {
+ return typeFamily.derived(clazz, new ProtoInputMapFn<T>(clazz), new ProtoOutputMapFn<T>(), typeFamily.bytes());
+ }
+
+ public static <T extends TBase> PType<T> thrifts(Class<T> clazz, PTypeFamily typeFamily) {
+ return typeFamily.derived(clazz, new ThriftInputMapFn<T>(clazz), new ThriftOutputMapFn<T>(), typeFamily.bytes());
+ }
+
+ public static final <T extends Enum> PType<T> enums(final Class<T> type, PTypeFamily typeFamily) {
+ return typeFamily.derived(type, new EnumInputMapper<T>(type), new EnumOutputMapper<T>(), typeFamily.strings());
+ }
+
+ public static MapFn<ByteBuffer, BigInteger> BYTE_TO_BIGINT = new MapFn<ByteBuffer, BigInteger>() {
+ public BigInteger map(ByteBuffer input) {
+ return input == null ? null : new BigInteger(input.array());
+ }
+ };
+
+ public static MapFn<BigInteger, ByteBuffer> BIGINT_TO_BYTE = new MapFn<BigInteger, ByteBuffer>() {
+ public ByteBuffer map(BigInteger input) {
+ return input == null ? null : ByteBuffer.wrap(input.toByteArray());
+ }
+ };
+
+ private static class JacksonInputMapFn<T> extends MapFn<String, T> {
+
+ private final Class<T> clazz;
+ private transient ObjectMapper mapper;
+
+ public JacksonInputMapFn(Class<T> clazz) {
+ this.clazz = clazz;
+ }
+
+ @Override
+ public void initialize() {
+ this.mapper = new ObjectMapper();
+ }
+
+ @Override
+ public T map(String input) {
+ try {
+ return mapper.readValue(input, clazz);
+ } catch (Exception e) {
+ throw new CrunchRuntimeException(e);
+ }
+ }
+ }
+
+ private static class JacksonOutputMapFn<T> extends MapFn<T, String> {
+
+ private transient ObjectMapper mapper;
+
+ @Override
+ public void initialize() {
+ this.mapper = new ObjectMapper();
+ }
+
+ @Override
+ public String map(T input) {
+ try {
+ return mapper.writeValueAsString(input);
+ } catch (Exception e) {
+ throw new CrunchRuntimeException(e);
+ }
+ }
+ }
+
+ private static class ProtoInputMapFn<T extends Message> extends MapFn<ByteBuffer, T> {
+
+ private final Class<T> clazz;
+ private transient T instance;
+
+ public ProtoInputMapFn(Class<T> clazz) {
+ this.clazz = clazz;
+ }
+
+ @Override
+ public void initialize() {
+ this.instance = Protos.getDefaultInstance(clazz);
+ }
+
+ @Override
+ public T map(ByteBuffer bb) {
+ try {
+ return (T) instance.newBuilderForType().mergeFrom(bb.array(), bb.position(), bb.limit()).build();
+ } catch (InvalidProtocolBufferException e) {
+ throw new CrunchRuntimeException(e);
+ }
+ }
+ }
+
+ private static class ProtoOutputMapFn<T extends Message> extends MapFn<T, ByteBuffer> {
+
+ public ProtoOutputMapFn() {
+ }
+
+ @Override
+ public ByteBuffer map(T proto) {
+ return ByteBuffer.wrap(proto.toByteArray());
+ }
+ }
+
+ private static class ThriftInputMapFn<T extends TBase> extends MapFn<ByteBuffer, T> {
+
+ private final Class<T> clazz;
+ private transient T instance;
+ private transient TDeserializer deserializer;
+ private transient byte[] bytes;
+
+ public ThriftInputMapFn(Class<T> clazz) {
+ this.clazz = clazz;
+ }
+
+ @Override
+ public void initialize() {
+ this.instance = ReflectionUtils.newInstance(clazz, null);
+ this.deserializer = new TDeserializer(new TBinaryProtocol.Factory());
+ this.bytes = new byte[0];
+ }
+
+ @Override
+ public T map(ByteBuffer bb) {
+ T next = (T) instance.deepCopy();
+ int len = bb.limit() - bb.position();
+ if (len != bytes.length) {
+ bytes = new byte[len];
+ }
+ System.arraycopy(bb.array(), bb.position(), bytes, 0, len);
+ try {
+ deserializer.deserialize(next, bytes);
+ } catch (TException e) {
+ throw new CrunchRuntimeException(e);
+ }
+ return next;
+ }
+ }
+
+ private static class ThriftOutputMapFn<T extends TBase> extends MapFn<T, ByteBuffer> {
+
+ private transient TSerializer serializer;
+
+ public ThriftOutputMapFn() {
+ }
+
+ @Override
+ public void initialize() {
+ this.serializer = new TSerializer(new TBinaryProtocol.Factory());
+ }
+
+ @Override
+ public ByteBuffer map(T t) {
+ try {
+ return ByteBuffer.wrap(serializer.serialize(t));
+ } catch (TException e) {
+ throw new CrunchRuntimeException(e);
+ }
+ }
+ }
+
+ private static class EnumInputMapper<T extends Enum> extends MapFn<String, T> {
+ private final Class<T> type;
+
+ public EnumInputMapper(Class<T> type) {
+ this.type = type;
+ }
+
+ @Override
+ public T map(String input) {
+ return (T) Enum.valueOf(type, input);
+ }
+ };
+
+ private static class EnumOutputMapper<T extends Enum> extends MapFn<T, String> {
+
+ @Override
+ public String map(T input) {
+ return input.name();
+ }
+ };
+
+ private static MapFn<ByteBuffer, UUID> BYTE_TO_UUID = new MapFn<ByteBuffer, UUID>() {
+ @Override
+ public UUID map(ByteBuffer input) {
+ return new UUID(input.getLong(), input.getLong());
+ }
+ };
+
+ private static MapFn<UUID, ByteBuffer> UUID_TO_BYTE = new MapFn<UUID, ByteBuffer>() {
+ @Override
+ public ByteBuffer map(UUID input) {
+ ByteBuffer bb = ByteBuffer.wrap(new byte[16]);
+ bb.asLongBuffer().put(input.getMostSignificantBits()).put(input.getLeastSignificantBits());
+ return bb;
+ }
+ };
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/Protos.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/Protos.java b/crunch-core/src/main/java/org/apache/crunch/types/Protos.java
new file mode 100644
index 0000000..4cd5068
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/Protos.java
@@ -0,0 +1,173 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types;
+
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.crunch.CrunchRuntimeException;
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.MapFn;
+import org.apache.hadoop.util.ReflectionUtils;
+
+import com.google.common.base.Splitter;
+import com.google.protobuf.Descriptors.FieldDescriptor;
+import com.google.protobuf.Message;
+import com.google.protobuf.Message.Builder;
+
+/**
+ * Utility functions for working with protocol buffers in Crunch.
+ */
+public class Protos {
+
+  /**
+   * Utility function for creating a default PB Message from a Class object that
+   * works with both protoc 2.3.0 and 2.4.x.
+   * @param clazz The class of the protocol buffer to create
+   * @return An instance of a protocol buffer
+   */
+  public static <M extends Message> M getDefaultInstance(Class<M> clazz) {
+    if (clazz.getConstructors().length > 0) {
+      // Protobuf 2.3.0
+      return ReflectionUtils.newInstance(clazz, null);
+    } else {
+      // Protobuf 2.4.x
+      try {
+        Message.Builder mb = (Message.Builder) clazz.getDeclaredMethod("newBuilder").invoke(null);
+        return (M) mb.getDefaultInstanceForType();
+      } catch (Exception e) {
+        throw new CrunchRuntimeException(e);
+      }
+    }
+  }
+
+  /**
+   * Creates a function that returns the value of the named field of each
+   * input message.
+   *
+   * @param fieldName The name of the message field to extract
+   * @return A function mapping a message to the value of that field
+   */
+  public static <M extends Message, K> MapFn<M, K> extractKey(String fieldName) {
+    return new ExtractKeyFn<M, K>(fieldName);
+  }
+
+  /**
+   * Creates a function that splits each input line on {@code sep} and assigns
+   * the pieces, in order, to the fields of a new message of type
+   * {@code msgClass}.
+   *
+   * @param sep The separator between field values in a line
+   * @param msgClass The class of the protocol buffer to build
+   * @return A function that emits one message per successfully parsed line
+   */
+  public static <M extends Message> DoFn<String, M> lineParser(String sep, Class<M> msgClass) {
+    return new TextToProtoFn<M>(sep, msgClass);
+  }
+
+  private static class ExtractKeyFn<M extends Message, K> extends MapFn<M, K> {
+
+    private final String fieldName;
+
+    // Looked up from the first message seen and reused afterwards.
+    private transient FieldDescriptor fd;
+
+    public ExtractKeyFn(String fieldName) {
+      this.fieldName = fieldName;
+    }
+
+    @Override
+    public K map(M input) {
+      if (input == null) {
+        throw new IllegalArgumentException("Null inputs not supported by Protos.ExtractKeyFn");
+      } else if (fd == null) {
+        fd = input.getDescriptorForType().findFieldByName(fieldName);
+        if (fd == null) {
+          throw new IllegalStateException("Could not find field: " + fieldName + " in message: " + input);
+        }
+      }
+      return (K) input.getField(fd);
+    }
+
+  }
+
+  private static class TextToProtoFn<M extends Message> extends DoFn<String, M> {
+
+    private final String sep;
+    private final Class<M> msgClass;
+
+    private transient M msgInstance;
+    private transient List<FieldDescriptor> fields;
+    private transient Splitter splitter;
+
+    enum ParseErrors {
+      TOTAL,
+      NUMBER_FORMAT,
+      UNKNOWN_ENUM_VALUE
+    };
+
+    public TextToProtoFn(String sep, Class<M> msgClass) {
+      this.sep = sep;
+      this.msgClass = msgClass;
+    }
+
+    @Override
+    public void initialize() {
+      this.msgInstance = getDefaultInstance(msgClass);
+      this.fields = msgInstance.getDescriptorForType().getFields();
+      this.splitter = Splitter.on(sep);
+    }
+
+    /**
+     * Parses one line into a message and emits it. Null or empty lines are
+     * ignored. Empty pieces leave the corresponding field unset, and fields
+     * with no corresponding piece are left unset as well. A line containing a
+     * malformed number or an unknown enum name is dropped and counted under
+     * {@link ParseErrors} instead of being emitted.
+     */
+    @Override
+    public void process(String input, Emitter<M> emitter) {
+      if (input == null || input.isEmpty()) {
+        return;
+      }
+      Builder b = msgInstance.newBuilderForType();
+      Iterator<String> iter = splitter.split(input).iterator();
+      boolean parseError = false;
+      for (FieldDescriptor fd : fields) {
+        if (!iter.hasNext()) {
+          // Fewer pieces than fields: the remaining fields stay unset.
+          break;
+        }
+        String value = iter.next();
+        if (value == null || value.isEmpty()) {
+          continue;
+        }
+        try {
+          Object parsedValue = parseValue(fd, value);
+          if (parsedValue == null) {
+            // Unknown enum name. Passing null to setField would fail with a
+            // NullPointerException, so treat it like any other bad value.
+            increment(ParseErrors.UNKNOWN_ENUM_VALUE);
+            parseError = true;
+            break;
+          }
+          b.setField(fd, parsedValue);
+        } catch (NumberFormatException nfe) {
+          increment(ParseErrors.NUMBER_FORMAT);
+          parseError = true;
+          break;
+        }
+      }
+
+      if (parseError) {
+        increment(ParseErrors.TOTAL);
+      } else {
+        emitter.emit((M) b.build());
+      }
+    }
+
+    /**
+     * Converts the text of one piece to the Java value expected by the field.
+     *
+     * @return the parsed value, or null when the field is an enum and
+     *         {@code value} is not the name of one of its values
+     * @throws NumberFormatException if a numeric field cannot be parsed
+     * @throws UnsupportedOperationException if the field's Java type cannot be
+     *         built from text by this function
+     */
+    private static Object parseValue(FieldDescriptor fd, String value) {
+      switch (fd.getJavaType()) {
+      case STRING:
+        return value;
+      case INT:
+        return Integer.valueOf(value);
+      case LONG:
+        return Long.valueOf(value);
+      case FLOAT:
+        return Float.valueOf(value);
+      case DOUBLE:
+        return Double.valueOf(value);
+      case BOOLEAN:
+        return Boolean.valueOf(value);
+      case ENUM:
+        return fd.getEnumType().findValueByName(value);
+      default:
+        throw new UnsupportedOperationException("Cannot parse text into field " + fd.getName()
+            + " of type " + fd.getJavaType());
+      }
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/TupleDeepCopier.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/TupleDeepCopier.java b/crunch-core/src/main/java/org/apache/crunch/types/TupleDeepCopier.java
new file mode 100644
index 0000000..a2ffae3
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/TupleDeepCopier.java
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types;
+
+import java.util.List;
+
+import org.apache.crunch.Tuple;
+import org.apache.hadoop.conf.Configuration;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Performs deep copies (based on underlying PType deep copying) of Tuple-based objects.
+ *
+ * @param <T> The type of Tuple implementation being copied
+ */
+public class TupleDeepCopier<T extends Tuple> implements DeepCopier<T> {
+
+  private final TupleFactory<T> tupleFactory;
+  private final List<PType> elementTypes;
+
+  /**
+   * @param tupleClass The Tuple implementation to produce copies of
+   * @param elementTypes The PTypes of the tuple's elements, in positional order
+   */
+  public TupleDeepCopier(Class<T> tupleClass, PType... elementTypes) {
+    tupleFactory = TupleFactory.getTupleFactory(tupleClass);
+    this.elementTypes = Lists.newArrayList(elementTypes);
+  }
+
+  /**
+   * Initializes the tuple factory and every element type.
+   */
+  @Override
+  public void initialize(Configuration conf) {
+    // The factory is used by deepCopy(), so it must be initialized here too;
+    // factories for custom tuple classes look up their constructor in initialize().
+    tupleFactory.initialize();
+    for (PType elementType : elementTypes) {
+      elementType.initialize(conf);
+    }
+  }
+
+  /**
+   * Copies each element through its PType's {@code getDetachedValue} and
+   * builds a new tuple from the copies.
+   *
+   * @param source The tuple to copy; may be null
+   * @return The new tuple, or null if {@code source} is null
+   */
+  @Override
+  public T deepCopy(T source) {
+
+    if (source == null) {
+      return null;
+    }
+
+    Object[] deepCopyValues = new Object[source.size()];
+
+    for (int valueIndex = 0; valueIndex < elementTypes.size(); valueIndex++) {
+      PType elementType = elementTypes.get(valueIndex);
+      deepCopyValues[valueIndex] = elementType.getDetachedValue(source.get(valueIndex));
+    }
+
+    return tupleFactory.makeTuple(deepCopyValues);
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/TupleFactory.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/TupleFactory.java b/crunch-core/src/main/java/org/apache/crunch/types/TupleFactory.java
new file mode 100644
index 0000000..73b47de
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/TupleFactory.java
@@ -0,0 +1,134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types;
+
+import java.io.Serializable;
+import java.lang.reflect.Constructor;
+import java.util.Map;
+
+import org.apache.crunch.CrunchRuntimeException;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Tuple;
+import org.apache.crunch.Tuple3;
+import org.apache.crunch.Tuple4;
+import org.apache.crunch.TupleN;
+
+import com.google.common.collect.Maps;
+
+/**
+ * Creates instances of a particular {@link Tuple} implementation from an
+ * array of element values.
+ */
+public abstract class TupleFactory<T extends Tuple> implements Serializable {
+
+  /**
+   * Prepares the factory for use. The default implementation does nothing.
+   */
+  public void initialize() {
+  }
+
+  /**
+   * Builds a tuple from the given element values.
+   *
+   * @param values
+   *          The element values, in positional order
+   * @return The new tuple
+   */
+  public abstract T makeTuple(Object... values);
+
+
+  // Factories registered through create(), keyed by tuple class. This is a
+  // plain HashMap with no synchronization.
+  private static final Map<Class, TupleFactory> customTupleFactories = Maps.newHashMap();
+
+  /**
+   * Get the {@link TupleFactory} for a given Tuple implementation.
+   *
+   * @param tupleClass
+   *          The class for which the factory is to be retrieved
+   * @return The appropriate TupleFactory
+   * @throws IllegalArgumentException
+   *           if the class is neither a built-in tuple type nor one registered
+   *           through {@link #create(Class, Class...)}
+   */
+  public static <T extends Tuple> TupleFactory<T> getTupleFactory(Class<T> tupleClass) {
+    if (tupleClass == Pair.class) {
+      return (TupleFactory<T>) PAIR;
+    } else if (tupleClass == Tuple3.class) {
+      return (TupleFactory<T>) TUPLE3;
+    } else if (tupleClass == Tuple4.class) {
+      return (TupleFactory<T>) TUPLE4;
+    } else if (tupleClass == TupleN.class) {
+      return (TupleFactory<T>) TUPLEN;
+    } else if (customTupleFactories.containsKey(tupleClass)) {
+      return (TupleFactory<T>) customTupleFactories.get(tupleClass);
+    } else {
+      throw new IllegalArgumentException("Can't create TupleFactory for " + tupleClass);
+    }
+  }
+
+  public static final TupleFactory<Pair> PAIR = new TupleFactory<Pair>() {
+    @Override
+    public Pair makeTuple(Object... values) {
+      return Pair.of(values[0], values[1]);
+    }
+  };
+
+  public static final TupleFactory<Tuple3> TUPLE3 = new TupleFactory<Tuple3>() {
+    @Override
+    public Tuple3 makeTuple(Object... values) {
+      return Tuple3.of(values[0], values[1], values[2]);
+    }
+  };
+
+  public static final TupleFactory<Tuple4> TUPLE4 = new TupleFactory<Tuple4>() {
+    @Override
+    public Tuple4 makeTuple(Object... values) {
+      return Tuple4.of(values[0], values[1], values[2], values[3]);
+    }
+  };
+
+  public static final TupleFactory<TupleN> TUPLEN = new TupleFactory<TupleN>() {
+    @Override
+    public TupleN makeTuple(Object... values) {
+      return new TupleN(values);
+    }
+  };
+
+  /**
+   * Returns the factory registered for a custom Tuple class, creating and
+   * registering one on first use.
+   *
+   * @param clazz
+   *          The custom Tuple implementation
+   * @param typeArgs
+   *          The parameter types of the public constructor used to build
+   *          instances
+   * @return The factory for {@code clazz}; if one is already registered it is
+   *         returned as-is and {@code typeArgs} is ignored
+   */
+  public static <T extends Tuple> TupleFactory<T> create(Class<T> clazz, Class... typeArgs) {
+    if (customTupleFactories.containsKey(clazz)) {
+      return (TupleFactory<T>) customTupleFactories.get(clazz);
+    }
+    TupleFactory<T> custom = new CustomTupleFactory<T>(clazz, typeArgs);
+    customTupleFactories.put(clazz, custom);
+    return custom;
+  }
+
+  private static class CustomTupleFactory<T extends Tuple> extends TupleFactory<T> {
+
+    private final Class<T> clazz;
+    private final Class[] typeArgs;
+
+    // Not serialized; resolved by initialize().
+    private transient Constructor<T> constructor;
+
+    public CustomTupleFactory(Class<T> clazz, Class[] typeArgs) {
+      this.clazz = clazz;
+      this.typeArgs = typeArgs;
+    }
+
+    @Override
+    public void initialize() {
+      try {
+        constructor = clazz.getConstructor(typeArgs);
+      } catch (Exception e) {
+        throw new CrunchRuntimeException(e);
+      }
+    }
+
+    @Override
+    public T makeTuple(Object... values) {
+      // The constructor is transient, so it is null on a freshly created or
+      // deserialized factory until initialize() runs. Resolve it on demand
+      // rather than failing with a NullPointerException.
+      if (constructor == null) {
+        initialize();
+      }
+      try {
+        return constructor.newInstance(values);
+      } catch (Exception e) {
+        throw new CrunchRuntimeException(e);
+      }
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroCapabilities.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroCapabilities.java b/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroCapabilities.java
new file mode 100644
index 0000000..cc1636c
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroCapabilities.java
@@ -0,0 +1,106 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.avro;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+import org.apache.avro.Schema;
+import org.apache.avro.io.BinaryDecoder;
+import org.apache.avro.io.BinaryEncoder;
+import org.apache.avro.io.DecoderFactory;
+import org.apache.avro.io.EncoderFactory;
+import org.apache.avro.reflect.ReflectDatumReader;
+import org.apache.avro.reflect.ReflectDatumWriter;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Determines the capabilities of the Avro version that is currently being used.
+ */
+class AvroCapabilities {
+
+ public static class Record extends org.apache.avro.specific.SpecificRecordBase implements
+ org.apache.avro.specific.SpecificRecord {
+ public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser()
+ .parse("{\"type\":\"record\",\"name\":\"Record\",\"namespace\":\"org.apache.crunch.types.avro\",\"fields\":[{\"name\":\"subrecords\",\"type\":{\"type\":\"array\",\"items\":\"string\"}}]}");
+ @Deprecated
+ public java.util.List<java.lang.CharSequence> subrecords;
+
+ public java.lang.Object get(int field$) {
+ switch (field$) {
+ case 0:
+ return subrecords;
+ default:
+ throw new org.apache.avro.AvroRuntimeException("Bad index");
+ }
+ }
+
+ // Used by DatumReader. Applications should not call.
+ @SuppressWarnings(value = "unchecked")
+ public void put(int field$, java.lang.Object value$) {
+ switch (field$) {
+ case 0:
+ subrecords = (java.util.List<java.lang.CharSequence>) value$;
+ break;
+ default:
+ throw new org.apache.avro.AvroRuntimeException("Bad index");
+ }
+ }
+
+ @Override
+ public Schema getSchema() {
+ return SCHEMA$;
+ }
+ }
+
+ /**
+ * Determine if the current Avro version can use the ReflectDatumReader to
+ * read SpecificData that includes an array. The inability to do this was a
+ * bug that was fixed in Avro 1.7.0.
+ *
+ * @return true if SpecificData can be properly read using a
+ * ReflectDatumReader
+ */
+ static boolean canDecodeSpecificSchemaWithReflectDatumReader() {
+ ReflectDatumReader<Record> datumReader = new ReflectDatumReader(Record.SCHEMA$);
+ ReflectDatumWriter<Record> datumWriter = new ReflectDatumWriter(Record.SCHEMA$);
+
+ Record record = new Record();
+ record.subrecords = Lists.<CharSequence> newArrayList("a", "b");
+
+ ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
+ BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(byteArrayOutputStream, null);
+
+ try {
+ datumWriter.write(record, encoder);
+ encoder.flush();
+ BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(
+ byteArrayOutputStream.toByteArray(), null);
+ datumReader.read(record, decoder);
+ } catch (IOException ioe) {
+ throw new RuntimeException("Error performing specific schema test", ioe);
+ } catch (ClassCastException cce) {
+ // This indicates that we're using a pre-1.7.0 version of Avro, as the
+ // ReflectDatumReader in those versions could not correctly handle an
+ // array in a SpecificData value
+ return false;
+ }
+ return true;
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroDeepCopier.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroDeepCopier.java b/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroDeepCopier.java
new file mode 100644
index 0000000..0fe9288
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroDeepCopier.java
@@ -0,0 +1,209 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.avro;
+
+import java.io.ByteArrayOutputStream;
+import java.io.Serializable;
+
+import org.apache.avro.Schema;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericData.Record;
+import org.apache.avro.generic.GenericDatumReader;
+import org.apache.avro.generic.GenericDatumWriter;
+import org.apache.avro.io.BinaryDecoder;
+import org.apache.avro.io.BinaryEncoder;
+import org.apache.avro.io.DatumReader;
+import org.apache.avro.io.DatumWriter;
+import org.apache.avro.io.DecoderFactory;
+import org.apache.avro.io.EncoderFactory;
+import org.apache.avro.specific.SpecificDatumReader;
+import org.apache.avro.specific.SpecificDatumWriter;
+import org.apache.crunch.CrunchRuntimeException;
+import org.apache.crunch.types.DeepCopier;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Performs deep copies of Avro-serializable objects.
+ * <p>
+ * <b>Warning:</b> Methods in this class are not thread-safe. This shouldn't be a problem when
+ * running in a map-reduce context where each mapper/reducer is running in its own JVM, but it may
+ * well be a problem in any other kind of multi-threaded context.
+ */
+abstract class AvroDeepCopier<T> implements DeepCopier<T>, Serializable {
+
+ private String jsonSchema;
+ private transient Configuration conf;
+ private transient Schema schema;
+ private BinaryEncoder binaryEncoder;
+ private BinaryDecoder binaryDecoder;
+
+ private transient DatumWriter<T> datumWriter;
+ private transient DatumReader<T> datumReader;
+
+ public AvroDeepCopier(Schema schema) {
+ this.jsonSchema = schema.toString();
+ }
+
+ protected Schema getSchema() {
+ if (schema == null) {
+ schema = new Schema.Parser().parse(jsonSchema);
+ }
+ return schema;
+ }
+
+ @Override
+ public void initialize(Configuration conf) {
+ this.conf = conf;
+ }
+
+ protected abstract T createCopyTarget();
+
+ protected abstract DatumWriter<T> createDatumWriter(Configuration conf);
+
+ protected abstract DatumReader<T> createDatumReader(Configuration conf);
+
+ /**
+ * Deep copier for Avro specific data objects.
+ */
+ public static class AvroSpecificDeepCopier<T> extends AvroDeepCopier<T> {
+
+ private Class<T> valueClass;
+
+ public AvroSpecificDeepCopier(Class<T> valueClass, Schema schema) {
+ super(schema);
+ this.valueClass = valueClass;
+ }
+
+ @Override
+ protected T createCopyTarget() {
+ return createNewInstance(valueClass);
+ }
+
+ @Override
+ protected DatumWriter<T> createDatumWriter(Configuration conf) {
+ return new SpecificDatumWriter<T>(getSchema());
+ }
+
+ @Override
+ protected DatumReader<T> createDatumReader(Configuration conf) {
+ return new SpecificDatumReader<T>(getSchema());
+ }
+
+ }
+
+ /**
+ * Deep copier for Avro generic data objects.
+ */
+ public static class AvroGenericDeepCopier extends AvroDeepCopier<Record> {
+
+ private transient Schema schema;
+
+ public AvroGenericDeepCopier(Schema schema) {
+ super(schema);
+ }
+
+ @Override
+ protected Record createCopyTarget() {
+ return new GenericData.Record(getSchema());
+ }
+
+ @Override
+ protected DatumReader<Record> createDatumReader(Configuration conf) {
+ return new GenericDatumReader<Record>(getSchema());
+ }
+
+ @Override
+ protected DatumWriter<Record> createDatumWriter(Configuration conf) {
+ return new GenericDatumWriter<Record>(getSchema());
+ }
+ }
+
+ /**
+ * Deep copier for Avro reflect data objects.
+ */
+ public static class AvroReflectDeepCopier<T> extends AvroDeepCopier<T> {
+
+ private Class<T> valueClass;
+
+ public AvroReflectDeepCopier(Class<T> valueClass, Schema schema) {
+ super(schema);
+ this.valueClass = valueClass;
+ }
+
+ @Override
+ protected T createCopyTarget() {
+ return createNewInstance(valueClass);
+ }
+
+ @Override
+ protected DatumReader<T> createDatumReader(Configuration conf) {
+ return Avros.getReflectDataFactory(conf).getReader(getSchema());
+ }
+
+ @Override
+ protected DatumWriter<T> createDatumWriter(Configuration conf) {
+ return Avros.getReflectDataFactory(conf).getWriter(getSchema());
+ }
+ }
+
+ /**
+ * Create a deep copy of an Avro value.
+ *
+ * @param source The value to be copied
+ * @return The deep copy of the value
+ */
+ @Override
+ public T deepCopy(T source) {
+
+ if (source == null) {
+ return null;
+ }
+
+ if (datumReader == null) {
+ datumReader = createDatumReader(conf);
+ }
+ if (datumWriter == null) {
+ datumWriter = createDatumWriter(conf);
+ }
+ ByteArrayOutputStream byteOutStream = new ByteArrayOutputStream();
+ binaryEncoder = EncoderFactory.get().binaryEncoder(byteOutStream, binaryEncoder);
+ T target = createCopyTarget();
+ try {
+ datumWriter.write(source, binaryEncoder);
+ binaryEncoder.flush();
+ binaryDecoder = DecoderFactory.get()
+ .binaryDecoder(byteOutStream.toByteArray(), binaryDecoder);
+ datumReader.read(target, binaryDecoder);
+ } catch (Exception e) {
+ throw new CrunchRuntimeException("Error while deep copying avro value " + source, e);
+ }
+
+ return target;
+ }
+
+ protected T createNewInstance(Class<T> targetClass) {
+ try {
+ return targetClass.newInstance();
+ } catch (InstantiationException e) {
+ throw new CrunchRuntimeException(e);
+ } catch (IllegalAccessException e) {
+ throw new CrunchRuntimeException(e);
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroGroupedTableType.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroGroupedTableType.java b/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroGroupedTableType.java
new file mode 100644
index 0000000..598868f
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroGroupedTableType.java
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.avro;
+
+import java.util.Collection;
+
+import org.apache.avro.mapred.AvroJob;
+import org.apache.avro.mapred.AvroKey;
+import org.apache.avro.mapred.AvroKeyComparator;
+import org.apache.avro.mapred.AvroValue;
+import org.apache.crunch.GroupingOptions;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.Pair;
+import org.apache.crunch.fn.PairMapFn;
+import org.apache.crunch.lib.PTables;
+import org.apache.crunch.types.Converter;
+import org.apache.crunch.types.PGroupedTableType;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.Job;
+
+/**
+ *
+ *
+ */
+class AvroGroupedTableType<K, V> extends PGroupedTableType<K, V> {
+
+  private static final AvroPairConverter CONVERTER = new AvroPairConverter();
+  private final MapFn inputFn;
+  private final MapFn outputFn;
+
+  /**
+   * Builds the grouped type for an Avro table type, combining the key and
+   * value map functions of the table type into pair-level functions.
+   */
+  public AvroGroupedTableType(AvroTableType<K, V> tableType) {
+    super(tableType);
+    final AvroType keys = (AvroType) tableType.getKeyType();
+    final AvroType values = (AvroType) tableType.getValueType();
+    inputFn = new PairIterableMapFn(keys.getInputMapFn(), values.getInputMapFn());
+    outputFn = new PairMapFn(keys.getOutputMapFn(), values.getOutputMapFn());
+  }
+
+  @Override
+  public Class<Pair<K, Iterable<V>>> getTypeClass() {
+    final Class<?> pairClass = Pair.of(null, null).getClass();
+    return (Class<Pair<K, Iterable<V>>>) pairClass;
+  }
+
+  @Override
+  public Converter getGroupingConverter() {
+    return CONVERTER;
+  }
+
+  @Override
+  public MapFn getInputMapFn() {
+    return inputFn;
+  }
+
+  @Override
+  public MapFn getOutputMapFn() {
+    return outputFn;
+  }
+
+  @Override
+  public void initialize(Configuration conf) {
+    getTableType().initialize(conf);
+  }
+
+  @Override
+  public Pair<K, Iterable<V>> getDetachedValue(Pair<K, Iterable<V>> value) {
+    return PTables.getGroupedDetachedValue(this, value);
+  }
+
+  /**
+   * Configures the job's shuffle for Avro keys and values: sets the map
+   * output schema, the sort comparator and the map output classes, applies
+   * the grouping options if any, and makes sure SafeAvroSerialization is
+   * listed under "io.serializations".
+   */
+  @Override
+  public void configureShuffle(Job job, GroupingOptions options) {
+    final AvroTableType<K, V> avroTableType = (AvroTableType<K, V>) tableType;
+    final String mapOutputSchema = avroTableType.getSchema().toString();
+    final Configuration conf = job.getConfiguration();
+
+    if (avroTableType.hasReflect()) {
+      if (avroTableType.hasSpecific()) {
+        Avros.checkCombiningSpecificAndReflectionSchemas();
+      }
+      conf.setBoolean(AvroJob.MAP_OUTPUT_IS_REFLECT, true);
+    }
+    conf.set(AvroJob.MAP_OUTPUT_SCHEMA, mapOutputSchema);
+    job.setSortComparatorClass(AvroKeyComparator.class);
+    job.setMapOutputKeyClass(AvroKey.class);
+    job.setMapOutputValueClass(AvroValue.class);
+    if (options != null) {
+      options.configure(job);
+    }
+
+    Avros.configureReflectDataFactory(conf);
+
+    final String safeAvro = SafeAvroSerialization.class.getName();
+    final Collection<String> registered = job.getConfiguration().getStringCollection(
+        "io.serializations");
+    if (!registered.contains(safeAvro)) {
+      registered.add(safeAvro);
+      job.getConfiguration().setStrings("io.serializations", registered.toArray(new String[0]));
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroInputFormat.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroInputFormat.java b/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroInputFormat.java
new file mode 100644
index 0000000..b8bbebd
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroInputFormat.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.avro;
+
+import java.io.IOException;
+
+import org.apache.avro.Schema;
+import org.apache.avro.mapred.AvroJob;
+import org.apache.avro.mapred.AvroWrapper;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+
+/** An {@link org.apache.hadoop.mapreduce.InputFormat} for Avro data files. */
+public class AvroInputFormat<T> extends FileInputFormat<AvroWrapper<T>, NullWritable> {
+
+  /**
+   * Sets the task status to the split's description and creates a record
+   * reader for the schema stored in the configuration under
+   * {@link AvroJob#INPUT_SCHEMA}.
+   */
+  @Override
+  public RecordReader<AvroWrapper<T>, NullWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
+      throws IOException, InterruptedException {
+    context.setStatus(split.toString());
+    final String schemaText = context.getConfiguration().get(AvroJob.INPUT_SCHEMA);
+    final Schema.Parser parser = new Schema.Parser();
+    final Schema inputSchema = parser.parse(schemaText);
+    return new AvroRecordReader<T>(inputSchema);
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroKeyConverter.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroKeyConverter.java b/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroKeyConverter.java
new file mode 100644
index 0000000..68b717d
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroKeyConverter.java
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.avro;
+
+import org.apache.avro.mapred.AvroWrapper;
+import org.apache.crunch.types.Converter;
+import org.apache.hadoop.io.NullWritable;
+
+/**
+ * Converter for data stored as an {@link AvroWrapper} key paired with a
+ * {@link NullWritable} value.
+ */
+class AvroKeyConverter<K> implements Converter<AvroWrapper<K>, NullWritable, K, Iterable<K>> {
+
+  // Single wrapper instance, created lazily and reused for every output key.
+  private transient AvroWrapper<K> wrapper = null;
+
+  @Override
+  public K convertInput(AvroWrapper<K> key, NullWritable value) {
+    return key.datum();
+  }
+
+  @Override
+  public AvroWrapper<K> outputKey(K value) {
+    final AvroWrapper<K> shared = getWrapper();
+    shared.datum(value);
+    return shared;
+  }
+
+  @Override
+  public NullWritable outputValue(K value) {
+    return NullWritable.get();
+  }
+
+  @Override
+  public Class<AvroWrapper<K>> getKeyClass() {
+    final Class<?> wrapperClass = getWrapper().getClass();
+    return (Class<AvroWrapper<K>>) wrapperClass;
+  }
+
+  @Override
+  public Class<NullWritable> getValueClass() {
+    return NullWritable.class;
+  }
+
+  /** Returns the shared wrapper, creating it on first use. */
+  private AvroWrapper<K> getWrapper() {
+    if (wrapper != null) {
+      return wrapper;
+    }
+    wrapper = new AvroWrapper<K>();
+    return wrapper;
+  }
+
+  @Override
+  public Iterable<K> convertIterableInput(AvroWrapper<K> key, Iterable<NullWritable> value) {
+    throw new UnsupportedOperationException("Should not be possible");
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroOutputFormat.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroOutputFormat.java b/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroOutputFormat.java
new file mode 100644
index 0000000..98d3f50
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroOutputFormat.java
@@ -0,0 +1,87 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.avro;
+
+import java.io.IOException;
+
+import org.apache.avro.Schema;
+import org.apache.avro.file.CodecFactory;
+import org.apache.avro.file.DataFileWriter;
+import org.apache.avro.mapred.AvroJob;
+import org.apache.avro.mapred.AvroWrapper;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapreduce.RecordWriter;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+
+/**
+ * An {@link org.apache.hadoop.mapreduce.OutputFormat} for Avro data files.
+ *
+ * <p>
+ * The output schema is read from the job configuration. When
+ * {@code crunch.namedoutput} holds a non-empty name, the schema JSON stored
+ * under {@code avro.output.schema.<name>} is parsed; otherwise the schema
+ * returned by {@code AvroJob.getOutputSchema} is used.
+ */
+public class AvroOutputFormat<T> extends FileOutputFormat<AvroWrapper<T>, NullWritable> {
+
+  /**
+   * Creates a record writer that appends the datum of every incoming
+   * {@link AvroWrapper} to an Avro data file in the task's default work file.
+   *
+   * @param context the task attempt whose configuration and work path are used
+   * @return a writer that ignores the {@link NullWritable} value side
+   * @throws IOException if a named output is configured without a schema, or
+   *           if the output file cannot be created
+   */
+  @Override
+  public RecordWriter<AvroWrapper<T>, NullWritable> getRecordWriter(TaskAttemptContext context) throws IOException,
+      InterruptedException {
+
+    Configuration conf = context.getConfiguration();
+    Schema schema = null;
+    String outputName = conf.get("crunch.namedoutput");
+    if (outputName != null && !outputName.isEmpty()) {
+      String schemaKey = "avro.output.schema." + outputName;
+      String schemaJson = conf.get(schemaKey);
+      if (schemaJson == null) {
+        // Fail with the offending key instead of an unexplained failure inside the parser.
+        throw new IOException("No Avro schema configured under '" + schemaKey + "' for named output '"
+            + outputName + "'");
+      }
+      schema = (new Schema.Parser()).parse(schemaJson);
+    } else {
+      schema = AvroJob.getOutputSchema(conf);
+    }
+
+    ReflectDataFactory factory = Avros.getReflectDataFactory(conf);
+    final DataFileWriter<T> writer = new DataFileWriter<T>(factory.<T> getWriter(schema));
+
+    JobConf jc = new JobConf(conf);
+    /* copied from org.apache.avro.mapred.AvroOutputFormat */
+
+    if (org.apache.hadoop.mapred.FileOutputFormat.getCompressOutput(jc)) {
+      int level = conf.getInt(org.apache.avro.mapred.AvroOutputFormat.DEFLATE_LEVEL_KEY,
+          org.apache.avro.mapred.AvroOutputFormat.DEFAULT_DEFLATE_LEVEL);
+      String codecName = conf.get(AvroJob.OUTPUT_CODEC,
+          org.apache.avro.file.DataFileConstants.DEFLATE_CODEC);
+      // The deflate codec is built with the configured level; any other codec is looked up by name.
+      CodecFactory codec = codecName.equals(org.apache.avro.file.DataFileConstants.DEFLATE_CODEC)
+          ? CodecFactory.deflateCodec(level)
+          : CodecFactory.fromString(codecName);
+      writer.setCodec(codec);
+    }
+
+    writer.setSyncInterval(jc.getInt(org.apache.avro.mapred.AvroOutputFormat.SYNC_INTERVAL_KEY,
+        org.apache.avro.file.DataFileConstants.DEFAULT_SYNC_INTERVAL));
+
+    Path path = getDefaultWorkFile(context, org.apache.avro.mapred.AvroOutputFormat.EXT);
+    writer.create(schema, path.getFileSystem(conf).create(path));
+
+    return new RecordWriter<AvroWrapper<T>, NullWritable>() {
+      @Override
+      public void write(AvroWrapper<T> wrapper, NullWritable ignore) throws IOException {
+        writer.append(wrapper.datum());
+      }
+
+      @Override
+      public void close(TaskAttemptContext context) throws IOException, InterruptedException {
+        writer.close();
+      }
+    };
+  }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroPairConverter.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroPairConverter.java b/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroPairConverter.java
new file mode 100644
index 0000000..d1d2627
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroPairConverter.java
@@ -0,0 +1,108 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.avro;
+
+import java.util.Iterator;
+
+import org.apache.avro.mapred.AvroKey;
+import org.apache.avro.mapred.AvroValue;
+import org.apache.crunch.Pair;
+import org.apache.crunch.types.Converter;
+
+/**
+ * A {@link Converter} between Crunch {@link Pair}s and the
+ * {@link AvroKey}/{@link AvroValue} wrappers used for keyed Avro data.
+ *
+ * <p>
+ * One key wrapper and one value wrapper are created lazily and reused, so the
+ * objects returned by {@link #outputKey} and {@link #outputValue} are
+ * overwritten by the next call to the same method.
+ */
+class AvroPairConverter<K, V> implements Converter<AvroKey<K>, AvroValue<V>, Pair<K, V>, Pair<K, Iterable<V>>> {
+
+  // Reusable output wrappers; transient, so they are rebuilt lazily when absent.
+  private transient AvroKey<K> keyWrapper = null;
+  private transient AvroValue<V> valueWrapper = null;
+
+  /** Unwraps both sides into a {@link Pair} of the key datum and the value datum. */
+  @Override
+  public Pair<K, V> convertInput(AvroKey<K> key, AvroValue<V> value) {
+    return Pair.of(key.datum(), value.datum());
+  }
+
+  /**
+   * Unwraps the key and exposes the grouped values as an {@link Iterable} that
+   * unwraps each {@link AvroValue} as it is iterated.
+   */
+  @Override
+  public Pair<K, Iterable<V>> convertIterableInput(AvroKey<K> key, Iterable<AvroValue<V>> iter) {
+    Iterable<V> it = new AvroWrappedIterable<V>(iter);
+    return Pair.of(key.datum(), it);
+  }
+
+  /** Stores the first element of {@code value} in the shared key wrapper and returns it. */
+  @Override
+  public AvroKey<K> outputKey(Pair<K, V> value) {
+    getKeyWrapper().datum(value.first());
+    return keyWrapper;
+  }
+
+  /** Stores the second element of {@code value} in the shared value wrapper and returns it. */
+  @Override
+  public AvroValue<V> outputValue(Pair<K, V> value) {
+    getValueWrapper().datum(value.second());
+    return valueWrapper;
+  }
+
+  /** Returns the runtime class of the shared key wrapper. */
+  @Override
+  @SuppressWarnings("unchecked")
+  public Class<AvroKey<K>> getKeyClass() {
+    // getClass() only yields Class<? extends AvroKey>; the wrapper is created here as AvroKey<K>.
+    return (Class<AvroKey<K>>) getKeyWrapper().getClass();
+  }
+
+  /** Returns the runtime class of the shared value wrapper. */
+  @Override
+  @SuppressWarnings("unchecked")
+  public Class<AvroValue<V>> getValueClass() {
+    // getClass() only yields Class<? extends AvroValue>; the wrapper is created here as AvroValue<V>.
+    return (Class<AvroValue<V>>) getValueWrapper().getClass();
+  }
+
+  /** Returns the shared key wrapper, creating it the first time it is needed. */
+  private AvroKey<K> getKeyWrapper() {
+    if (keyWrapper == null) {
+      keyWrapper = new AvroKey<K>();
+    }
+    return keyWrapper;
+  }
+
+  /** Returns the shared value wrapper, creating it the first time it is needed. */
+  private AvroValue<V> getValueWrapper() {
+    if (valueWrapper == null) {
+      valueWrapper = new AvroValue<V>();
+    }
+    return valueWrapper;
+  }
+
+  /** View over an {@code Iterable<AvroValue<V>>} that yields the wrapped datums. */
+  private static class AvroWrappedIterable<V> implements Iterable<V> {
+
+    private final Iterable<AvroValue<V>> iters;
+
+    public AvroWrappedIterable(Iterable<AvroValue<V>> iters) {
+      this.iters = iters;
+    }
+
+    /** Each call obtains a fresh iterator from the underlying iterable. */
+    @Override
+    public Iterator<V> iterator() {
+      return new Iterator<V>() {
+        private final Iterator<AvroValue<V>> it = iters.iterator();
+
+        @Override
+        public boolean hasNext() {
+          return it.hasNext();
+        }
+
+        @Override
+        public V next() {
+          return it.next().datum();
+        }
+
+        @Override
+        public void remove() {
+          it.remove();
+        }
+      };
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroRecordReader.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroRecordReader.java b/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroRecordReader.java
new file mode 100644
index 0000000..9c7578c
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroRecordReader.java
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.avro;
+
+import java.io.IOException;
+
+import org.apache.avro.Schema;
+import org.apache.avro.file.DataFileReader;
+import org.apache.avro.file.FileReader;
+import org.apache.avro.file.SeekableInput;
+import org.apache.avro.io.DatumReader;
+import org.apache.avro.mapred.AvroJob;
+import org.apache.avro.mapred.AvroWrapper;
+import org.apache.avro.mapred.FsInput;
+import org.apache.avro.specific.SpecificDatumReader;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+
+/**
+ * A {@link RecordReader} for Avro data files.
+ *
+ * <p>
+ * Each record is exposed as an {@link AvroWrapper} key with a
+ * {@link NullWritable} value. The key wrapper and its datum are passed back to
+ * the underlying reader for reuse, so the current key is overwritten by the
+ * next call to {@link #nextKeyValue()}.
+ */
+class AvroRecordReader<T> extends RecordReader<AvroWrapper<T>, NullWritable> {
+
+  // Null until initialize() has opened the split's file.
+  private FileReader<T> reader;
+  // Reader position after syncing to the split start; baseline for getProgress().
+  private long start;
+  // Split start offset plus split length.
+  private long end;
+  private AvroWrapper<T> key;
+  private NullWritable value;
+  private final Schema schema;
+
+  /**
+   * @param schema the schema handed to the datum reader when the file is opened
+   */
+  public AvroRecordReader(Schema schema) {
+    this.schema = schema;
+  }
+
+  /**
+   * Opens the file behind {@code genericSplit} and syncs the reader to the
+   * split's start offset. A reflection-based datum reader is used unless
+   * {@code AvroJob.INPUT_IS_REFLECT} is set to false in the configuration, in
+   * which case a {@link SpecificDatumReader} is used.
+   *
+   * @param genericSplit the split to read; must be a {@link FileSplit}
+   * @param context the task attempt supplying the configuration
+   */
+  @Override
+  public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException {
+    FileSplit split = (FileSplit) genericSplit;
+    Configuration conf = context.getConfiguration();
+    SeekableInput in = new FsInput(split.getPath(), conf);
+    DatumReader<T> datumReader = null;
+    if (conf.getBoolean(AvroJob.INPUT_IS_REFLECT, true)) {
+      ReflectDataFactory factory = Avros.getReflectDataFactory(conf);
+      datumReader = factory.getReader(schema);
+    } else {
+      datumReader = new SpecificDatumReader<T>(schema);
+    }
+    this.reader = DataFileReader.openReader(in, datumReader);
+    reader.sync(split.getStart()); // sync to start
+    this.start = reader.tell();
+    this.end = split.getStart() + split.getLength();
+  }
+
+  /**
+   * Advances to the next record.
+   *
+   * @return false, after clearing the current key and value, once the reader
+   *         has no more records or is past the sync point at the split end
+   */
+  @Override
+  public boolean nextKeyValue() throws IOException, InterruptedException {
+    if (!reader.hasNext() || reader.pastSync(end)) {
+      key = null;
+      value = null;
+      return false;
+    }
+    if (key == null) {
+      key = new AvroWrapper<T>();
+    }
+    if (value == null) {
+      value = NullWritable.get();
+    }
+    // Hand the previous datum back to the reader so it can be reused.
+    key.datum(reader.next(key.datum()));
+    return true;
+  }
+
+  @Override
+  public AvroWrapper<T> getCurrentKey() throws IOException, InterruptedException {
+    return key;
+  }
+
+  @Override
+  public NullWritable getCurrentValue() throws IOException, InterruptedException {
+    return value;
+  }
+
+  /**
+   * @return the fraction of the range between {@code start} and {@code end}
+   *         consumed so far, capped at 1.0; 0.0 when that range is empty
+   */
+  @Override
+  public float getProgress() throws IOException {
+    if (end == start) {
+      return 0.0f;
+    } else {
+      return Math.min(1.0f, (getPos() - start) / (float) (end - start));
+    }
+  }
+
+  /** Returns the current position reported by the underlying reader. */
+  public long getPos() throws IOException {
+    return reader.tell();
+  }
+
+  /**
+   * Closes the underlying reader. Does nothing if no reader was opened, for
+   * example when {@link #initialize} was never called or failed before the
+   * file was opened.
+   */
+  @Override
+  public void close() throws IOException {
+    if (reader != null) {
+      reader.close();
+    }
+  }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroTableType.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroTableType.java b/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroTableType.java
new file mode 100644
index 0000000..86613df
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroTableType.java
@@ -0,0 +1,151 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.avro;
+
+import org.apache.avro.Schema;
+import org.apache.avro.generic.IndexedRecord;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.Pair;
+import org.apache.crunch.lib.PTables;
+import org.apache.crunch.types.PGroupedTableType;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.TupleDeepCopier;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.TaskInputOutputContext;
+
+/**
+ * The implementation of the PTableType interface for Avro-based serialization.
+ *
+ * <p>
+ * A table entry is described by the schema that
+ * {@code org.apache.avro.mapred.Pair.getPairSchema} builds from the key and
+ * value schemas. Two private functions translate between that representation
+ * and Crunch {@link Pair}s.
+ */
+class AvroTableType<K, V> extends AvroType<Pair<K, V>> implements PTableType<K, V> {
+
+  /**
+   * Output function: turns a Crunch {@link Pair} into an Avro pair, passing the
+   * key and the value through the output functions of their respective types.
+   */
+  private static class PairToAvroPair extends MapFn<Pair, org.apache.avro.mapred.Pair> {
+    private final MapFn keyMapFn;
+    private final MapFn valueMapFn;
+    // Key and value schemas kept as JSON text.
+    private final String firstJson;
+    private final String secondJson;
+
+    // Pair schema as JSON, computed in initialize(); parsed on demand in map().
+    private String pairSchemaJson;
+    private transient Schema pairSchema;
+
+    public PairToAvroPair(AvroType keyType, AvroType valueType) {
+      this.keyMapFn = keyType.getOutputMapFn();
+      this.firstJson = keyType.getSchema().toString();
+      this.valueMapFn = valueType.getOutputMapFn();
+      this.secondJson = valueType.getSchema().toString();
+    }
+
+    /** Forwards the configuration to both component functions. */
+    @Override
+    public void configure(Configuration conf) {
+      keyMapFn.configure(conf);
+      valueMapFn.configure(conf);
+    }
+
+    /** Forwards the task context to both component functions. */
+    @Override
+    public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
+      keyMapFn.setContext(context);
+      valueMapFn.setContext(context);
+    }
+
+    /**
+     * Initializes both component functions, then derives the pair schema JSON
+     * from the stored key and value schema JSON.
+     */
+    @Override
+    public void initialize() {
+      keyMapFn.initialize();
+      valueMapFn.initialize();
+      Schema firstSchema = new Schema.Parser().parse(firstJson);
+      Schema secondSchema = new Schema.Parser().parse(secondJson);
+      pairSchemaJson = org.apache.avro.mapred.Pair.getPairSchema(firstSchema, secondSchema).toString();
+    }
+
+    /** Builds a new Avro pair holding the mapped key and the mapped value of {@code input}. */
+    @Override
+    public org.apache.avro.mapred.Pair map(Pair input) {
+      if (pairSchema == null) {
+        pairSchema = new Schema.Parser().parse(pairSchemaJson);
+      }
+      org.apache.avro.mapred.Pair result = new org.apache.avro.mapred.Pair(pairSchema);
+      Object mappedKey = keyMapFn.map(input.first());
+      result.key(mappedKey);
+      Object mappedValue = valueMapFn.map(input.second());
+      result.value(mappedValue);
+      return result;
+    }
+  }
+
+  /**
+   * Input function: turns an {@link IndexedRecord} into a Crunch {@link Pair}
+   * by mapping field 0 with the first function and field 1 with the second.
+   */
+  private static class IndexedRecordToPair extends MapFn<IndexedRecord, Pair> {
+
+    private final MapFn firstMapFn;
+    private final MapFn secondMapFn;
+
+    public IndexedRecordToPair(MapFn firstMapFn, MapFn secondMapFn) {
+      this.firstMapFn = firstMapFn;
+      this.secondMapFn = secondMapFn;
+    }
+
+    /** Forwards the configuration to both component functions. */
+    @Override
+    public void configure(Configuration conf) {
+      firstMapFn.configure(conf);
+      secondMapFn.configure(conf);
+    }
+
+    /** Forwards the task context to both component functions. */
+    @Override
+    public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
+      firstMapFn.setContext(context);
+      secondMapFn.setContext(context);
+    }
+
+    /** Initializes both component functions. */
+    @Override
+    public void initialize() {
+      firstMapFn.initialize();
+      secondMapFn.initialize();
+    }
+
+    @Override
+    public Pair map(IndexedRecord input) {
+      Object first = firstMapFn.map(input.get(0));
+      Object second = secondMapFn.map(input.get(1));
+      return Pair.of(first, second);
+    }
+  }
+
+  private final AvroType<K> keyType;
+  private final AvroType<V> valueType;
+
+  /**
+   * @param keyType Avro type of the table keys
+   * @param valueType Avro type of the table values
+   * @param pairClass class token for {@code Pair<K, V>}
+   */
+  public AvroTableType(AvroType<K> keyType, AvroType<V> valueType, Class<Pair<K, V>> pairClass) {
+    super(pairClass,
+        org.apache.avro.mapred.Pair.getPairSchema(keyType.getSchema(), valueType.getSchema()),
+        new IndexedRecordToPair(keyType.getInputMapFn(), valueType.getInputMapFn()),
+        new PairToAvroPair(keyType, valueType),
+        new TupleDeepCopier(Pair.class, keyType, valueType),
+        keyType, valueType);
+    this.keyType = keyType;
+    this.valueType = valueType;
+  }
+
+  @Override
+  public PType<K> getKeyType() {
+    return keyType;
+  }
+
+  @Override
+  public PType<V> getValueType() {
+    return valueType;
+  }
+
+  /** Returns a new grouped table type wrapping this table type. */
+  @Override
+  public PGroupedTableType<K, V> getGroupedTableType() {
+    return new AvroGroupedTableType<K, V>(this);
+  }
+
+  /** Delegates to {@link PTables#getDetachedValue}. */
+  @Override
+  public Pair<K, V> getDetachedValue(Pair<K, V> value) {
+    return PTables.getDetachedValue(this, value);
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroTextOutputFormat.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroTextOutputFormat.java b/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroTextOutputFormat.java
new file mode 100644
index 0000000..4930235
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroTextOutputFormat.java
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.avro;
+
+import java.io.IOException;
+
+import org.apache.avro.mapred.AvroWrapper;
+import org.apache.hadoop.mapreduce.RecordWriter;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+
+/**
+ * A {@link TextOutputFormat} whose record writer unwraps {@link AvroWrapper}
+ * keys and values, so that the wrapped datum rather than the wrapper is passed
+ * to the underlying text writer. Objects that are not wrappers pass through
+ * unchanged.
+ */
+public class AvroTextOutputFormat<K, V> extends TextOutputFormat<K, V> {
+
+  /** Record writer that unwraps each side and delegates to another writer. */
+  class DatumRecordTextWriter extends RecordWriter<K, V> {
+    // The writer that actually emits the text.
+    private RecordWriter lineRecordWriter;
+
+    public DatumRecordTextWriter(RecordWriter recordWriter) {
+      this.lineRecordWriter = recordWriter;
+    }
+
+    @Override
+    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
+      lineRecordWriter.close(context);
+    }
+
+    @Override
+    public void write(K arg0, V arg1) throws IOException, InterruptedException {
+      Object keyData = getData(arg0);
+      Object valueData = getData(arg1);
+      lineRecordWriter.write(keyData, valueData);
+    }
+
+    /** Returns the wrapped datum for an {@link AvroWrapper}, otherwise {@code o} itself. */
+    private Object getData(Object o) {
+      return (o instanceof AvroWrapper) ? ((AvroWrapper) o).datum() : o;
+    }
+  }
+
+  /** Wraps the writer produced by the superclass in a {@link DatumRecordTextWriter}. */
+  @Override
+  public RecordWriter<K, V> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
+    RecordWriter<K, V> delegate = super.getRecordWriter(context);
+    return new DatumRecordTextWriter(delegate);
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroType.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroType.java b/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroType.java
new file mode 100644
index 0000000..a92b0d0
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroType.java
@@ -0,0 +1,199 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.avro;
+
+import java.util.List;
+
+import org.apache.avro.Schema;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.specific.SpecificRecord;
+import org.apache.commons.lang.builder.HashCodeBuilder;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.fn.IdentityFn;
+import org.apache.crunch.io.ReadableSourceTarget;
+import org.apache.crunch.io.avro.AvroFileSourceTarget;
+import org.apache.crunch.types.Converter;
+import org.apache.crunch.types.DeepCopier;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+
+/**
+ * The implementation of the PType interface for Avro-based serialization.
+ *
+ * <p>
+ * The schema is kept both as a parsed {@link Schema} (transient) and as its
+ * JSON text, so {@link #getSchema()} can re-parse it whenever the parsed form
+ * is absent.
+ */
+public class AvroType<T> implements PType<T> {
+
+  // Single converter instance handed out by every AvroType.
+  private static final Converter AVRO_CONVERTER = new AvroKeyConverter();
+
+  private final Class<T> typeClass;
+  private final String schemaString;
+  private transient Schema schema;
+  private final MapFn baseInputMapFn;
+  private final MapFn baseOutputMapFn;
+  private final List<PType> subTypes;
+  private DeepCopier<T> deepCopier;
+  // Set by initialize(); getDetachedValue() refuses to run until then.
+  private boolean initialized = false;
+
+  /** Creates a type whose input and output functions are both the identity function. */
+  public AvroType(Class<T> typeClass, Schema schema, DeepCopier<T> deepCopier, PType... ptypes) {
+    this(typeClass, schema, IdentityFn.getInstance(), IdentityFn.getInstance(), deepCopier, ptypes);
+  }
+
+  /**
+   * @param typeClass the Java class this type represents
+   * @param schema the Avro schema; must not be null
+   * @param inputMapFn function returned by {@link #getInputMapFn()}
+   * @param outputMapFn function returned by {@link #getOutputMapFn()}
+   * @param deepCopier copier used by {@link #getDetachedValue}
+   * @param ptypes the sub-types of this type, if any
+   */
+  public AvroType(Class<T> typeClass, Schema schema, MapFn inputMapFn, MapFn outputMapFn,
+      DeepCopier<T> deepCopier, PType... ptypes) {
+    this.typeClass = typeClass;
+    this.schema = Preconditions.checkNotNull(schema);
+    this.schemaString = schema.toString();
+    this.baseInputMapFn = inputMapFn;
+    this.baseOutputMapFn = outputMapFn;
+    this.deepCopier = deepCopier;
+    this.subTypes = ImmutableList.<PType> builder().add(ptypes).build();
+  }
+
+  @Override
+  public Class<T> getTypeClass() {
+    return typeClass;
+  }
+
+  @Override
+  public PTypeFamily getFamily() {
+    return AvroTypeFamily.getInstance();
+  }
+
+  /** Returns a new list containing the sub-types of this type. */
+  @Override
+  public List<PType> getSubTypes() {
+    return Lists.<PType> newArrayList(subTypes);
+  }
+
+  /** Returns the schema, parsing it from the stored JSON text if it is not currently held. */
+  public Schema getSchema() {
+    if (schema != null) {
+      return schema;
+    }
+    schema = new Schema.Parser().parse(schemaString);
+    return schema;
+  }
+
+  /**
+   * Determine if the wrapped type is a specific data avro type or wraps one.
+   *
+   * @return true if the wrapped type is a specific data type or wraps one
+   */
+  public boolean hasSpecific() {
+    if (Avros.isPrimitive(this)) {
+      return false;
+    }
+
+    // Without sub-types the answer depends on this type's own class.
+    if (this.subTypes.isEmpty()) {
+      return SpecificRecord.class.isAssignableFrom(typeClass);
+    }
+
+    for (PType<?> subType : this.subTypes) {
+      AvroType<?> avroSubType = (AvroType<?>) subType;
+      if (avroSubType.hasSpecific()) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Determine if the wrapped type is a generic data avro type.
+   *
+   * @return true if the wrapped type is a generic type
+   */
+  public boolean isGeneric() {
+    return GenericData.Record.class.equals(typeClass);
+  }
+
+  /**
+   * Determine if the wrapped type is a reflection-based avro type or wraps one.
+   *
+   * @return true if the wrapped type is a reflection-based type or wraps one.
+   */
+  public boolean hasReflect() {
+    if (Avros.isPrimitive(this)) {
+      return false;
+    }
+
+    // Without sub-types, anything that is neither generic nor specific counts as reflect.
+    if (this.subTypes.isEmpty()) {
+      return !typeClass.equals(GenericData.Record.class)
+          && !SpecificRecord.class.isAssignableFrom(typeClass);
+    }
+
+    for (PType<?> subType : this.subTypes) {
+      AvroType<?> avroSubType = (AvroType<?>) subType;
+      if (avroSubType.hasReflect()) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  public MapFn<Object, T> getInputMapFn() {
+    return baseInputMapFn;
+  }
+
+  public MapFn<T, Object> getOutputMapFn() {
+    return baseOutputMapFn;
+  }
+
+  /** Returns the converter instance shared by all Avro types. */
+  @Override
+  public Converter getConverter() {
+    return AVRO_CONVERTER;
+  }
+
+  @Override
+  public ReadableSourceTarget<T> getDefaultFileSource(Path path) {
+    return new AvroFileSourceTarget<T>(path, this);
+  }
+
+  /** Initializes the deep copier and marks this type as ready for {@link #getDetachedValue}. */
+  @Override
+  public void initialize(Configuration conf) {
+    deepCopier.initialize(conf);
+    initialized = true;
+  }
+
+  /**
+   * Returns a deep copy of {@code value} made by this type's deep copier.
+   *
+   * @throws IllegalStateException if {@link #initialize} has not been called
+   */
+  @Override
+  public T getDetachedValue(T value) {
+    if (initialized) {
+      return deepCopier.deepCopy(value);
+    }
+    throw new IllegalStateException("Cannot call getDetachedValue on an uninitialized PType");
+  }
+
+  /**
+   * Two Avro types are equal when their type classes and sub-types are equal.
+   * The schema is not part of the comparison.
+   */
+  @Override
+  public boolean equals(Object other) {
+    // instanceof is false for null, so no separate null test is needed.
+    if (!(other instanceof AvroType)) {
+      return false;
+    }
+    AvroType that = (AvroType) other;
+    return typeClass.equals(that.typeClass) && subTypes.equals(that.subTypes);
+  }
+
+  /** Hashes the same two fields that {@link #equals} compares. */
+  @Override
+  public int hashCode() {
+    return new HashCodeBuilder().append(typeClass).append(subTypes).toHashCode();
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroTypeFamily.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroTypeFamily.java b/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroTypeFamily.java
new file mode 100644
index 0000000..e09e173
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroTypeFamily.java
@@ -0,0 +1,164 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.avro;
+
+import java.nio.ByteBuffer;
+import java.util.Collection;
+import java.util.Map;
+
+import org.apache.avro.Schema;
+import org.apache.avro.generic.GenericData;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Tuple;
+import org.apache.crunch.Tuple3;
+import org.apache.crunch.Tuple4;
+import org.apache.crunch.TupleN;
+import org.apache.crunch.types.PGroupedTableType;
+import org.apache.crunch.types.PTableType;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.PTypeFamily;
+import org.apache.crunch.types.PTypeUtils;
+
+/**
+ * The {@link PTypeFamily} for Avro-based types. It is a singleton obtained
+ * through {@link #getInstance()}, and its factory methods delegate to the
+ * static methods of the same name on {@code Avros}.
+ */
+public class AvroTypeFamily implements PTypeFamily {
+
+  private static final AvroTypeFamily INSTANCE = new AvroTypeFamily();
+
+  /** Returns the single instance of this family. */
+  public static AvroTypeFamily getInstance() {
+    return INSTANCE;
+  }
+
+  // There can only be one instance.
+  private AvroTypeFamily() {
+  }
+
+  @Override
+  public PType<Void> nulls() {
+    return Avros.nulls();
+  }
+
+  @Override
+  public PType<String> strings() {
+    return Avros.strings();
+  }
+
+  @Override
+  public PType<Long> longs() {
+    return Avros.longs();
+  }
+
+  @Override
+  public PType<Integer> ints() {
+    return Avros.ints();
+  }
+
+  @Override
+  public PType<Float> floats() {
+    return Avros.floats();
+  }
+
+  @Override
+  public PType<Double> doubles() {
+    return Avros.doubles();
+  }
+
+  @Override
+  public PType<Boolean> booleans() {
+    return Avros.booleans();
+  }
+
+  @Override
+  public PType<ByteBuffer> bytes() {
+    return Avros.bytes();
+  }
+
+  @Override
+  public <T> PType<T> records(Class<T> clazz) {
+    return Avros.records(clazz);
+  }
+
+  /** Avro-only addition: a type for generic records described by {@code schema}. */
+  public PType<GenericData.Record> generics(Schema schema) {
+    return Avros.generics(schema);
+  }
+
+  /** Avro-only addition: delegates to {@code Avros.containers}. */
+  public <T> PType<T> containers(Class<T> clazz) {
+    return Avros.containers(clazz);
+  }
+
+  @Override
+  public <T> PType<Collection<T>> collections(PType<T> ptype) {
+    return Avros.collections(ptype);
+  }
+
+  @Override
+  public <T> PType<Map<String, T>> maps(PType<T> ptype) {
+    return Avros.maps(ptype);
+  }
+
+  @Override
+  public <V1, V2> PType<Pair<V1, V2>> pairs(PType<V1> p1, PType<V2> p2) {
+    return Avros.pairs(p1, p2);
+  }
+
+  @Override
+  public <V1, V2, V3> PType<Tuple3<V1, V2, V3>> triples(PType<V1> p1, PType<V2> p2, PType<V3> p3) {
+    return Avros.triples(p1, p2, p3);
+  }
+
+  @Override
+  public <V1, V2, V3, V4> PType<Tuple4<V1, V2, V3, V4>> quads(PType<V1> p1, PType<V2> p2, PType<V3> p3, PType<V4> p4) {
+    return Avros.quads(p1, p2, p3, p4);
+  }
+
+  @Override
+  public PType<TupleN> tuples(PType<?>... ptypes) {
+    return Avros.tuples(ptypes);
+  }
+
+  @Override
+  public <K, V> PTableType<K, V> tableOf(PType<K> key, PType<V> value) {
+    return Avros.tableOf(key, value);
+  }
+
+  /**
+   * Returns the Avro counterpart of {@code ptype}.
+   *
+   * <p>
+   * Types that are already Avro types are returned as they are. A grouped
+   * table type is rebuilt around the converted form of its table type. For
+   * anything else the primitive Avro type registered for the type class is
+   * used when there is one, and {@link PTypeUtils#convert} otherwise.
+   */
+  @Override
+  public <T> PType<T> as(PType<T> ptype) {
+    if (ptype instanceof AvroType) {
+      return ptype;
+    }
+    if (ptype instanceof AvroGroupedTableType) {
+      return ptype;
+    }
+    if (ptype instanceof PGroupedTableType) {
+      PTableType tableType = ((PGroupedTableType) ptype).getTableType();
+      AvroTableType avroTableType = (AvroTableType) as(tableType);
+      return new AvroGroupedTableType(avroTableType);
+    }
+    Class<T> typeClass = ptype.getTypeClass();
+    PType<T> primitive = Avros.getPrimitiveType(typeClass);
+    if (primitive == null) {
+      return PTypeUtils.convert(ptype, this);
+    }
+    return primitive;
+  }
+
+  @Override
+  public <T extends Tuple> PType<T> tuples(Class<T> clazz, PType<?>... ptypes) {
+    return Avros.tuples(clazz, ptypes);
+  }
+
+  @Override
+  public <S, T> PType<T> derived(Class<T> clazz, MapFn<S, T> inputFn, MapFn<T, S> outputFn, PType<S> base) {
+    return Avros.derived(clazz, inputFn, outputFn, base);
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroUtf8InputFormat.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroUtf8InputFormat.java b/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroUtf8InputFormat.java
new file mode 100644
index 0000000..9460fa5
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/avro/AvroUtf8InputFormat.java
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.avro;
+
+import java.io.IOException;
+
+import org.apache.avro.mapred.AvroWrapper;
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.CompressionCodecFactory;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
+
+/**
+ * An {@link org.apache.hadoop.mapreduce.InputFormat} for text files. Each line is
+ * a {@link Utf8} key; values are null.
+ */
+public class AvroUtf8InputFormat extends FileInputFormat<AvroWrapper<Utf8>, NullWritable> {
+
+ static class Utf8LineRecordReader extends RecordReader<AvroWrapper<Utf8>, NullWritable> {
+
+ private LineRecordReader lineRecordReader;
+
+ private AvroWrapper<Utf8> currentKey = new AvroWrapper<Utf8>();
+
+ public Utf8LineRecordReader() throws IOException {
+ this.lineRecordReader = new LineRecordReader();
+ }
+
+ public void close() throws IOException {
+ lineRecordReader.close();
+ }
+
+ public float getProgress() throws IOException {
+ return lineRecordReader.getProgress();
+ }
+
+ @Override
+ public AvroWrapper<Utf8> getCurrentKey() throws IOException, InterruptedException {
+ Text txt = lineRecordReader.getCurrentValue();
+ currentKey.datum(new Utf8(txt.toString()));
+ return currentKey;
+ }
+
+ @Override
+ public NullWritable getCurrentValue() throws IOException, InterruptedException {
+ return NullWritable.get();
+ }
+
+ @Override
+ public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
+ lineRecordReader.initialize(split, context);
+ }
+
+ @Override
+ public boolean nextKeyValue() throws IOException, InterruptedException {
+ return lineRecordReader.nextKeyValue();
+ }
+ }
+
+ private CompressionCodecFactory compressionCodecs = null;
+
+ public void configure(Configuration conf) {
+ compressionCodecs = new CompressionCodecFactory(conf);
+ }
+
+ protected boolean isSplitable(FileSystem fs, Path file) {
+ return compressionCodecs.getCodec(file) == null;
+ }
+
+ @Override
+ public RecordReader<AvroWrapper<Utf8>, NullWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
+ throws IOException, InterruptedException {
+ return new Utf8LineRecordReader();
+ }
+}
[13/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/hadoop/mapreduce/lib/jobcontrol/CrunchControlledJob.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/hadoop/mapreduce/lib/jobcontrol/CrunchControlledJob.java b/crunch/src/main/java/org/apache/crunch/hadoop/mapreduce/lib/jobcontrol/CrunchControlledJob.java
deleted file mode 100644
index 93926c1..0000000
--- a/crunch/src/main/java/org/apache/crunch/hadoop/mapreduce/lib/jobcontrol/CrunchControlledJob.java
+++ /dev/null
@@ -1,325 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.hadoop.mapreduce.lib.jobcontrol;
-
-import java.io.IOException;
-import java.util.List;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.crunch.impl.mr.run.RuntimeParameters;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.JobID;
-import org.apache.hadoop.util.StringUtils;
-
-import com.google.common.base.Objects;
-import com.google.common.collect.Lists;
-
-/**
- * This class encapsulates a MapReduce job and its dependencies. It monitors the
- * states of the depending jobs and updates the state of this job. A job starts
- * in the WAITING state. If it does not have any depending jobs, or all of the
- * depending jobs are in the SUCCESS state, then the job state will become READY. If
- * any depending jobs fail, the job will fail too. When in READY state, the job
- * can be submitted to Hadoop for execution, with the state changing into
- * RUNNING state. From RUNNING state, the job can get into the SUCCESS or FAILED
- * state, depending on the status of the job execution.
- */
-public class CrunchControlledJob {
-
- // A job will be in one of the following states
- public static enum State {
- SUCCESS, WAITING, RUNNING, READY, FAILED, DEPENDENT_FAILED
- };
-
- public static interface Hook {
- public void run() throws IOException;
- }
-
- private static final Log LOG = LogFactory.getLog(CrunchControlledJob.class);
-
- private final int jobID;
- private final Job job; // mapreduce job to be executed.
- // the jobs the current job depends on
- private final List<CrunchControlledJob> dependingJobs;
- private final Hook prepareHook;
- private final Hook completionHook;
- private State state;
- // some info for human consumption, e.g. the reason why the job failed
- private String message;
- private String lastKnownProgress;
-
- /**
- * Construct a job.
- *
- * @param jobID
- * an ID used to match with its {@link org.apache.crunch.impl.mr.plan.JobPrototype}.
- * @param job
- * a mapreduce job to be executed.
- * @param prepareHook
- * a piece of code that will run before this job is submitted.
- * @param completionHook
- * a piece of code that will run after this job gets completed.
- */
- public CrunchControlledJob(int jobID, Job job, Hook prepareHook, Hook completionHook) {
- this.jobID = jobID;
- this.job = job;
- this.dependingJobs = Lists.newArrayList();
- this.prepareHook = prepareHook;
- this.completionHook = completionHook;
- this.state = State.WAITING;
- this.message = "just initialized";
- }
-
- @Override
- public String toString() {
- StringBuffer sb = new StringBuffer();
- sb.append("job name:\t").append(this.job.getJobName()).append("\n");
- sb.append("job id:\t").append(this.jobID).append("\n");
- sb.append("job state:\t").append(this.state).append("\n");
- sb.append("job mapred id:\t").append(this.job.getJobID()).append("\n");
- sb.append("job message:\t").append(this.message).append("\n");
-
- if (this.dependingJobs == null || this.dependingJobs.size() == 0) {
- sb.append("job has no depending job:\t").append("\n");
- } else {
- sb.append("job has ").append(this.dependingJobs.size())
- .append(" dependeng jobs:\n");
- for (int i = 0; i < this.dependingJobs.size(); i++) {
- sb.append("\t depending job ").append(i).append(":\t");
- sb.append((this.dependingJobs.get(i)).getJobName()).append("\n");
- }
- }
- return sb.toString();
- }
-
- /**
- * @return the job name of this job
- */
- public String getJobName() {
- return job.getJobName();
- }
-
- /**
- * Set the job name for this job.
- *
- * @param jobName
- * the job name
- */
- public void setJobName(String jobName) {
- job.setJobName(jobName);
- }
-
- /**
- * @return the job ID of this job
- */
- public int getJobID() {
- return this.jobID;
- }
-
- /**
- * @return the mapred ID of this job as assigned by the mapred framework.
- */
- public JobID getMapredJobID() {
- return this.job.getJobID();
- }
-
- /**
- * @return the mapreduce job
- */
- public synchronized Job getJob() {
- return this.job;
- }
-
- /**
- * @return the state of this job
- */
- public synchronized State getJobState() {
- return this.state;
- }
-
- /**
- * Set the state for this job.
- *
- * @param state
- * the new state for this job.
- */
- protected synchronized void setJobState(State state) {
- this.state = state;
- }
-
- /**
- * @return the message of this job
- */
- public synchronized String getMessage() {
- return this.message;
- }
-
- /**
- * Set the message for this job.
- *
- * @param message
- * the message for this job.
- */
- public synchronized void setMessage(String message) {
- this.message = message;
- }
-
- /**
- * @return the depending jobs of this job
- */
- public List<CrunchControlledJob> getDependentJobs() {
- return this.dependingJobs;
- }
-
- /**
- * Add a job to this job's dependency list. Dependent jobs can only be added
- * while a Job is waiting to run, not during or afterwards.
- *
- * @param dependingJob
- * Job that this Job depends on.
- * @return <tt>true</tt> if the Job was added.
- */
- public synchronized boolean addDependingJob(CrunchControlledJob dependingJob) {
- if (this.state == State.WAITING) { // only allowed to add jobs when waiting
- return this.dependingJobs.add(dependingJob);
- } else {
- return false;
- }
- }
-
- /**
- * @return true if this job is in a complete state
- */
- public synchronized boolean isCompleted() {
- return this.state == State.FAILED || this.state == State.DEPENDENT_FAILED
- || this.state == State.SUCCESS;
- }
-
- /**
- * @return true if this job is in READY state
- */
- public synchronized boolean isReady() {
- return this.state == State.READY;
- }
-
- public void killJob() throws IOException, InterruptedException {
- job.killJob();
- }
-
- /**
- * Check the state of this running job. The state may remain the same, become
- * SUCCESS or FAILED.
- */
- private void checkRunningState() throws IOException, InterruptedException {
- try {
- if (job.isComplete()) {
- if (job.isSuccessful()) {
- this.state = State.SUCCESS;
- } else {
- this.state = State.FAILED;
- this.message = "Job failed!";
- }
- } else {
- // still running
- if (job.getConfiguration().getBoolean(RuntimeParameters.LOG_JOB_PROGRESS, false)) {
- logJobProgress();
- }
- }
- } catch (IOException ioe) {
- this.state = State.FAILED;
- this.message = StringUtils.stringifyException(ioe);
- try {
- if (job != null) {
- job.killJob();
- }
- } catch (IOException e) {
- }
- }
- if (isCompleted()) {
- completionHook.run();
- }
- }
-
- /**
- * Check and update the state of this job. The state changes depending on its
- * current state and the states of the depending jobs.
- */
- synchronized State checkState() throws IOException, InterruptedException {
- if (this.state == State.RUNNING) {
- checkRunningState();
- }
- if (this.state != State.WAITING) {
- return this.state;
- }
- if (this.dependingJobs == null || this.dependingJobs.size() == 0) {
- this.state = State.READY;
- return this.state;
- }
- CrunchControlledJob pred = null;
- int n = this.dependingJobs.size();
- for (int i = 0; i < n; i++) {
- pred = this.dependingJobs.get(i);
- State s = pred.checkState();
- if (s == State.WAITING || s == State.READY || s == State.RUNNING) {
- break; // a pred is still not completed, continue in WAITING
- // state
- }
- if (s == State.FAILED || s == State.DEPENDENT_FAILED) {
- this.state = State.DEPENDENT_FAILED;
- this.message = "depending job " + i + " with jobID " + pred.getJobID()
- + " failed. " + pred.getMessage();
- break;
- }
- // pred must be in success state
- if (i == n - 1) {
- this.state = State.READY;
- }
- }
-
- return this.state;
- }
-
- /**
- * Submit this job to mapred. The state becomes RUNNING if submission is
- * successful, FAILED otherwise.
- */
- protected synchronized void submit() {
- try {
- prepareHook.run();
- job.submit();
- this.state = State.RUNNING;
- LOG.info("Running job \"" + getJobName() + "\"");
- LOG.info("Job status available at: " + job.getTrackingURL());
- } catch (Exception ioe) {
- this.state = State.FAILED;
- this.message = StringUtils.stringifyException(ioe);
- LOG.info("Error occurred starting job \"" + getJobName() + "\":");
- LOG.info(getMessage());
- }
- }
-
- private void logJobProgress() throws IOException, InterruptedException {
- String progress = String.format("map %.0f%% reduce %.0f%%",
- 100.0 * job.mapProgress(), 100.0 * job.reduceProgress());
- if (!Objects.equal(lastKnownProgress, progress)) {
- LOG.info(job.getJobName() + " progress: " + progress);
- lastKnownProgress = progress;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/hadoop/mapreduce/lib/jobcontrol/CrunchJobControl.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/hadoop/mapreduce/lib/jobcontrol/CrunchJobControl.java b/crunch/src/main/java/org/apache/crunch/hadoop/mapreduce/lib/jobcontrol/CrunchJobControl.java
deleted file mode 100644
index 727ab6f..0000000
--- a/crunch/src/main/java/org/apache/crunch/hadoop/mapreduce/lib/jobcontrol/CrunchJobControl.java
+++ /dev/null
@@ -1,211 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.hadoop.mapreduce.lib.jobcontrol;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Hashtable;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.crunch.hadoop.mapreduce.lib.jobcontrol.CrunchControlledJob.State;
-
-/**
- * This class encapsulates a set of MapReduce jobs and their dependencies.
- *
- * It tracks the states of the jobs by placing them into different tables
- * according to their states.
- *
- * This class provides APIs for the client app to add a job to the group and to
- * get the jobs in the group in different states. When a job is added, an ID
- * unique to the group is assigned to the job.
- */
-public class CrunchJobControl {
-
- private Map<Integer, CrunchControlledJob> waitingJobs;
- private Map<Integer, CrunchControlledJob> readyJobs;
- private Map<Integer, CrunchControlledJob> runningJobs;
- private Map<Integer, CrunchControlledJob> successfulJobs;
- private Map<Integer, CrunchControlledJob> failedJobs;
-
- private Log log = LogFactory.getLog(CrunchJobControl.class);
-
- private final String groupName;
-
- /**
- * Construct a job control for a group of jobs.
- *
- * @param groupName
- * a name identifying this group
- */
- public CrunchJobControl(String groupName) {
- this.waitingJobs = new Hashtable<Integer, CrunchControlledJob>();
- this.readyJobs = new Hashtable<Integer, CrunchControlledJob>();
- this.runningJobs = new Hashtable<Integer, CrunchControlledJob>();
- this.successfulJobs = new Hashtable<Integer, CrunchControlledJob>();
- this.failedJobs = new Hashtable<Integer, CrunchControlledJob>();
- this.groupName = groupName;
- }
-
- private static List<CrunchControlledJob> toList(Map<Integer, CrunchControlledJob> jobs) {
- ArrayList<CrunchControlledJob> retv = new ArrayList<CrunchControlledJob>();
- synchronized (jobs) {
- for (CrunchControlledJob job : jobs.values()) {
- retv.add(job);
- }
- }
- return retv;
- }
-
- /**
- * @return the jobs in the waiting state
- */
- public List<CrunchControlledJob> getWaitingJobList() {
- return toList(this.waitingJobs);
- }
-
- /**
- * @return the jobs in the running state
- */
- public List<CrunchControlledJob> getRunningJobList() {
- return toList(this.runningJobs);
- }
-
- /**
- * @return the jobs in the ready state
- */
- public List<CrunchControlledJob> getReadyJobsList() {
- return toList(this.readyJobs);
- }
-
- /**
- * @return the jobs in the success state
- */
- public List<CrunchControlledJob> getSuccessfulJobList() {
- return toList(this.successfulJobs);
- }
-
- public List<CrunchControlledJob> getFailedJobList() {
- return toList(this.failedJobs);
- }
-
- private static void addToQueue(CrunchControlledJob aJob,
- Map<Integer, CrunchControlledJob> queue) {
- synchronized (queue) {
- queue.put(aJob.getJobID(), aJob);
- }
- }
-
- private void addToQueue(CrunchControlledJob aJob) {
- Map<Integer, CrunchControlledJob> queue = getQueue(aJob.getJobState());
- addToQueue(aJob, queue);
- }
-
- private Map<Integer, CrunchControlledJob> getQueue(State state) {
- Map<Integer, CrunchControlledJob> retv = null;
- if (state == State.WAITING) {
- retv = this.waitingJobs;
- } else if (state == State.READY) {
- retv = this.readyJobs;
- } else if (state == State.RUNNING) {
- retv = this.runningJobs;
- } else if (state == State.SUCCESS) {
- retv = this.successfulJobs;
- } else if (state == State.FAILED || state == State.DEPENDENT_FAILED) {
- retv = this.failedJobs;
- }
- return retv;
- }
-
- /**
- * Add a new job.
- *
- * @param aJob
- * the new job
- */
- synchronized public void addJob(CrunchControlledJob aJob) {
- aJob.setJobState(State.WAITING);
- this.addToQueue(aJob);
- }
-
- synchronized private void checkRunningJobs() throws IOException,
- InterruptedException {
-
- Map<Integer, CrunchControlledJob> oldJobs = null;
- oldJobs = this.runningJobs;
- this.runningJobs = new Hashtable<Integer, CrunchControlledJob>();
-
- for (CrunchControlledJob nextJob : oldJobs.values()) {
- nextJob.checkState();
- this.addToQueue(nextJob);
- }
- }
-
- synchronized private void checkWaitingJobs() throws IOException,
- InterruptedException {
- Map<Integer, CrunchControlledJob> oldJobs = null;
- oldJobs = this.waitingJobs;
- this.waitingJobs = new Hashtable<Integer, CrunchControlledJob>();
-
- for (CrunchControlledJob nextJob : oldJobs.values()) {
- nextJob.checkState();
- this.addToQueue(nextJob);
- }
- }
-
- synchronized private void startReadyJobs() {
- Map<Integer, CrunchControlledJob> oldJobs = null;
- oldJobs = this.readyJobs;
- this.readyJobs = new Hashtable<Integer, CrunchControlledJob>();
-
- for (CrunchControlledJob nextJob : oldJobs.values()) {
- // Submitting Job to Hadoop
- nextJob.submit();
- this.addToQueue(nextJob);
- }
- }
-
- synchronized public void killAllRunningJobs() {
- for (CrunchControlledJob job : runningJobs.values()) {
- if (!job.isCompleted()) {
- try {
- job.killJob();
- } catch (Exception e) {
- log.error("Exception killing job: " + job.getJobName(), e);
- }
- }
- }
- }
-
- synchronized public boolean allFinished() {
- return this.waitingJobs.size() == 0 && this.readyJobs.size() == 0
- && this.runningJobs.size() == 0;
- }
-
- /**
- * Checks the states of the running jobs, updates the states of waiting jobs, and submits the jobs in
- * ready state (i.e. whose dependencies have all finished successfully).
- */
- public void pollJobStatusAndStartNewOnes() throws IOException, InterruptedException {
- checkRunningJobs();
- checkWaitingJobs();
- startReadyJobs();
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/SingleUseIterable.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/SingleUseIterable.java b/crunch/src/main/java/org/apache/crunch/impl/SingleUseIterable.java
deleted file mode 100644
index 98f982f..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/SingleUseIterable.java
+++ /dev/null
@@ -1,49 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl;
-
-import java.util.Iterator;
-
-/**
- * Wrapper around a Reducer's input Iterable. Ensures that the
- * {@link #iterator()} method is not called more than once.
- */
-public class SingleUseIterable<T> implements Iterable<T> {
-
- private boolean used = false;
- private Iterable<T> wrappedIterable;
-
- /**
- * Instantiate around an Iterable that may only be used once.
- *
- * @param toWrap iterable to wrap
- */
- public SingleUseIterable(Iterable<T> toWrap) {
- this.wrappedIterable = toWrap;
- }
-
- @Override
- public Iterator<T> iterator() {
- if (used) {
- throw new IllegalStateException("iterator() can only be called once on this Iterable");
- }
- used = true;
- return wrappedIterable.iterator();
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mem/MemPipeline.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mem/MemPipeline.java b/crunch/src/main/java/org/apache/crunch/impl/mem/MemPipeline.java
deleted file mode 100644
index 272b2af..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mem/MemPipeline.java
+++ /dev/null
@@ -1,275 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mem;
-
-import java.io.IOException;
-import java.util.List;
-import java.util.Set;
-import java.util.concurrent.TimeUnit;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.crunch.CrunchRuntimeException;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.PipelineExecution;
-import org.apache.crunch.PipelineResult;
-import org.apache.crunch.Source;
-import org.apache.crunch.TableSource;
-import org.apache.crunch.Target;
-import org.apache.crunch.Target.WriteMode;
-import org.apache.crunch.impl.mem.collect.MemCollection;
-import org.apache.crunch.impl.mem.collect.MemTable;
-import org.apache.crunch.io.At;
-import org.apache.crunch.io.PathTarget;
-import org.apache.crunch.io.ReadableSource;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataOutputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.Counters;
-
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Sets;
-
-public class MemPipeline implements Pipeline {
-
- private static final Log LOG = LogFactory.getLog(MemPipeline.class);
- private static Counters COUNTERS = new Counters();
- private static final MemPipeline INSTANCE = new MemPipeline();
-
- private int outputIndex = 0;
-
- public static Counters getCounters() {
- return COUNTERS;
- }
-
- public static void clearCounters() {
- COUNTERS = new Counters();
- }
-
- public static Pipeline getInstance() {
- return INSTANCE;
- }
-
- public static <T> PCollection<T> collectionOf(T... ts) {
- return new MemCollection<T>(ImmutableList.copyOf(ts));
- }
-
- public static <T> PCollection<T> collectionOf(Iterable<T> collect) {
- return new MemCollection<T>(collect);
- }
-
- public static <T> PCollection<T> typedCollectionOf(PType<T> ptype, T... ts) {
- return new MemCollection<T>(ImmutableList.copyOf(ts), ptype, null);
- }
-
- public static <T> PCollection<T> typedCollectionOf(PType<T> ptype, Iterable<T> collect) {
- return new MemCollection<T>(collect, ptype, null);
- }
-
- public static <S, T> PTable<S, T> tableOf(S s, T t, Object... more) {
- List<Pair<S, T>> pairs = Lists.newArrayList();
- pairs.add(Pair.of(s, t));
- for (int i = 0; i < more.length; i += 2) {
- pairs.add(Pair.of((S) more[i], (T) more[i + 1]));
- }
- return new MemTable<S, T>(pairs);
- }
-
- public static <S, T> PTable<S, T> typedTableOf(PTableType<S, T> ptype, S s, T t, Object... more) {
- List<Pair<S, T>> pairs = Lists.newArrayList();
- pairs.add(Pair.of(s, t));
- for (int i = 0; i < more.length; i += 2) {
- pairs.add(Pair.of((S) more[i], (T) more[i + 1]));
- }
- return new MemTable<S, T>(pairs, ptype, null);
- }
-
- public static <S, T> PTable<S, T> tableOf(Iterable<Pair<S, T>> pairs) {
- return new MemTable<S, T>(pairs);
- }
-
- public static <S, T> PTable<S, T> typedTableOf(PTableType<S, T> ptype, Iterable<Pair<S, T>> pairs) {
- return new MemTable<S, T>(pairs, ptype, null);
- }
-
- private Configuration conf = new Configuration();
- private Set<Target> activeTargets = Sets.newHashSet();
-
- private MemPipeline() {
- }
-
- @Override
- public void setConfiguration(Configuration conf) {
- this.conf = conf;
- }
-
- @Override
- public Configuration getConfiguration() {
- return conf;
- }
-
- @Override
- public <T> PCollection<T> read(Source<T> source) {
- if (source instanceof ReadableSource) {
- try {
- Iterable<T> iterable = ((ReadableSource<T>) source).read(conf);
- return new MemCollection<T>(iterable, source.getType(), source.toString());
- } catch (IOException e) {
- LOG.error("Exception reading source: " + source.toString(), e);
- throw new IllegalStateException(e);
- }
- }
- LOG.error("Source " + source + " is not readable");
- throw new IllegalStateException("Source " + source + " is not readable");
- }
-
- @Override
- public <K, V> PTable<K, V> read(TableSource<K, V> source) {
- if (source instanceof ReadableSource) {
- try {
- Iterable<Pair<K, V>> iterable = ((ReadableSource<Pair<K, V>>) source).read(conf);
- return new MemTable<K, V>(iterable, source.getTableType(), source.toString());
- } catch (IOException e) {
- LOG.error("Exception reading source: " + source.toString(), e);
- throw new IllegalStateException(e);
- }
- }
- LOG.error("Source " + source + " is not readable");
- throw new IllegalStateException("Source " + source + " is not readable");
- }
-
- @Override
- public void write(PCollection<?> collection, Target target) {
- write(collection, target, Target.WriteMode.DEFAULT);
- }
-
- @Override
- public void write(PCollection<?> collection, Target target,
- Target.WriteMode writeMode) {
- target.handleExisting(writeMode, getConfiguration());
- if (writeMode != WriteMode.APPEND && activeTargets.contains(target)) {
- throw new CrunchRuntimeException("Target " + target + " is already written in the current run." +
- " Use WriteMode.APPEND in order to write additional data to it.");
- }
- activeTargets.add(target);
- if (target instanceof PathTarget) {
- Path path = ((PathTarget) target).getPath();
- try {
- FileSystem fs = path.getFileSystem(conf);
- FSDataOutputStream os = fs.create(new Path(path, "out" + outputIndex));
- outputIndex++;
- if (collection instanceof PTable) {
- for (Object o : collection.materialize()) {
- Pair p = (Pair) o;
- os.writeBytes(p.first().toString());
- os.writeBytes("\t");
- os.writeBytes(p.second().toString());
- os.writeBytes("\r\n");
- }
- } else {
- for (Object o : collection.materialize()) {
- os.writeBytes(o.toString() + "\r\n");
- }
- }
- os.close();
- } catch (IOException e) {
- LOG.error("Exception writing target: " + target, e);
- }
- } else {
- LOG.error("Target " + target + " is not a PathTarget instance");
- }
- }
-
- @Override
- public PCollection<String> readTextFile(String pathName) {
- return read(At.textFile(pathName));
- }
-
- @Override
- public <T> void writeTextFile(PCollection<T> collection, String pathName) {
- write(collection, At.textFile(pathName));
- }
-
- @Override
- public <T> Iterable<T> materialize(PCollection<T> pcollection) {
- return pcollection.materialize();
- }
-
- @Override
- public PipelineExecution runAsync() {
- activeTargets.clear();
- return new PipelineExecution() {
- @Override
- public String getPlanDotFile() {
- return "";
- }
-
- @Override
- public void waitFor(long timeout, TimeUnit timeUnit) throws InterruptedException {
- // no-op
- }
-
- @Override
- public void waitUntilDone() throws InterruptedException {
- // no-op
- }
-
- @Override
- public Status getStatus() {
- return Status.SUCCEEDED;
- }
-
- @Override
- public PipelineResult getResult() {
- return new PipelineResult(ImmutableList.of(new PipelineResult.StageResult("MemPipelineStage", COUNTERS)));
- }
-
- @Override
- public void kill() {
- }
- };
- }
-
- @Override
- public PipelineResult run() {
- activeTargets.clear();
- return new PipelineResult(ImmutableList.of(new PipelineResult.StageResult("MemPipelineStage", COUNTERS)));
- }
-
- @Override
- public PipelineResult done() {
- return run();
- }
-
- @Override
- public void enableDebug() {
- LOG.info("Note: in-memory pipelines do not have debug logging");
- }
-
- @Override
- public String getName() {
- return "Memory Pipeline";
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mem/collect/MemCollection.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mem/collect/MemCollection.java b/crunch/src/main/java/org/apache/crunch/impl/mem/collect/MemCollection.java
deleted file mode 100644
index c97fac6..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mem/collect/MemCollection.java
+++ /dev/null
@@ -1,295 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mem.collect;
-
-import java.lang.reflect.Method;
-import java.util.Collection;
-
-import javassist.util.proxy.MethodFilter;
-import javassist.util.proxy.MethodHandler;
-import javassist.util.proxy.ProxyFactory;
-
-import org.apache.crunch.DoFn;
-import org.apache.crunch.FilterFn;
-import org.apache.crunch.MapFn;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PObject;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.ParallelDoOptions;
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.Target;
-import org.apache.crunch.fn.ExtractKeyFn;
-import org.apache.crunch.impl.mem.MemPipeline;
-import org.apache.crunch.impl.mem.emit.InMemoryEmitter;
-import org.apache.crunch.lib.Aggregate;
-import org.apache.crunch.materialize.pobject.CollectionPObject;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.mapreduce.OutputCommitter;
-import org.apache.hadoop.mapreduce.RecordWriter;
-import org.apache.hadoop.mapreduce.StatusReporter;
-import org.apache.hadoop.mapreduce.TaskAttemptID;
-import org.apache.hadoop.mapreduce.TaskInputOutputContext;
-
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.Lists;
-
-public class MemCollection<S> implements PCollection<S> {
-
- private final Collection<S> collect;
- private final PType<S> ptype;
- private String name;
-
- public MemCollection(Iterable<S> collect) {
- this(collect, null, null);
- }
-
- public MemCollection(Iterable<S> collect, PType<S> ptype) {
- this(collect, ptype, null);
- }
-
- public MemCollection(Iterable<S> collect, PType<S> ptype, String name) {
- this.collect = ImmutableList.copyOf(collect);
- this.ptype = ptype;
- this.name = name;
- }
-
- @Override
- public Pipeline getPipeline() {
- return MemPipeline.getInstance();
- }
-
- @Override
- public PCollection<S> union(PCollection<S> other) {
- return union(new PCollection[] { other });
- }
-
- @Override
- public PCollection<S> union(PCollection<S>... collections) {
- Collection<S> output = Lists.newArrayList();
- for (PCollection<S> pcollect : collections) {
- for (S s : pcollect.materialize()) {
- output.add(s);
- }
- }
- output.addAll(collect);
- return new MemCollection<S>(output, collections[0].getPType());
- }
-
- @Override
- public <T> PCollection<T> parallelDo(DoFn<S, T> doFn, PType<T> type) {
- return parallelDo(null, doFn, type);
- }
-
- @Override
- public <T> PCollection<T> parallelDo(String name, DoFn<S, T> doFn, PType<T> type) {
- return parallelDo(name, doFn, type, ParallelDoOptions.builder().build());
- }
-
- @Override
- public <T> PCollection<T> parallelDo(String name, DoFn<S, T> doFn, PType<T> type,
- ParallelDoOptions options) {
- InMemoryEmitter<T> emitter = new InMemoryEmitter<T>();
- doFn.setContext(getInMemoryContext(getPipeline().getConfiguration()));
- doFn.initialize();
- for (S s : collect) {
- doFn.process(s, emitter);
- }
- doFn.cleanup(emitter);
- return new MemCollection<T>(emitter.getOutput(), type, name);
- }
-
- @Override
- public <K, V> PTable<K, V> parallelDo(DoFn<S, Pair<K, V>> doFn, PTableType<K, V> type) {
- return parallelDo(null, doFn, type);
- }
-
- @Override
- public <K, V> PTable<K, V> parallelDo(String name, DoFn<S, Pair<K, V>> doFn, PTableType<K, V> type) {
- return parallelDo(name, doFn, type, ParallelDoOptions.builder().build());
- }
-
- @Override
- public <K, V> PTable<K, V> parallelDo(String name, DoFn<S, Pair<K, V>> doFn, PTableType<K, V> type,
- ParallelDoOptions options) {
- InMemoryEmitter<Pair<K, V>> emitter = new InMemoryEmitter<Pair<K, V>>();
- doFn.setContext(getInMemoryContext(getPipeline().getConfiguration()));
- doFn.initialize();
- for (S s : collect) {
- doFn.process(s, emitter);
- }
- doFn.cleanup(emitter);
- return new MemTable<K, V>(emitter.getOutput(), type, name);
- }
-
- @Override
- public PCollection<S> write(Target target) {
- getPipeline().write(this, target);
- return this;
- }
-
- @Override
- public PCollection<S> write(Target target, Target.WriteMode writeMode) {
- getPipeline().write(this, target, writeMode);
- return this;
- }
-
- @Override
- public Iterable<S> materialize() {
- return collect;
- }
-
- /** {@inheritDoc} */
- @Override
- public PObject<Collection<S>> asCollection() {
- return new CollectionPObject<S>(this);
- }
-
- public Collection<S> getCollection() {
- return collect;
- }
-
- @Override
- public PType<S> getPType() {
- return ptype;
- }
-
- @Override
- public PTypeFamily getTypeFamily() {
- if (ptype != null) {
- return ptype.getFamily();
- }
- return null;
- }
-
- @Override
- public long getSize() {
- return collect.isEmpty() ? 0 : 1; // getSize is only used for pipeline optimization in MR
- }
-
- @Override
- public String getName() {
- return name;
- }
-
- @Override
- public String toString() {
- return collect.toString();
- }
-
- @Override
- public PTable<S, Long> count() {
- return Aggregate.count(this);
- }
-
- @Override
- public PObject<Long> length() {
- return Aggregate.length(this);
- }
-
- @Override
- public PObject<S> max() {
- return Aggregate.max(this);
- }
-
- @Override
- public PObject<S> min() {
- return Aggregate.min(this);
- }
-
- @Override
- public PCollection<S> filter(FilterFn<S> filterFn) {
- return parallelDo(filterFn, getPType());
- }
-
- @Override
- public PCollection<S> filter(String name, FilterFn<S> filterFn) {
- return parallelDo(name, filterFn, getPType());
- }
-
- @Override
- public <K> PTable<K, S> by(MapFn<S, K> mapFn, PType<K> keyType) {
- return parallelDo(new ExtractKeyFn<K, S>(mapFn), getTypeFamily().tableOf(keyType, getPType()));
- }
-
- @Override
- public <K> PTable<K, S> by(String name, MapFn<S, K> mapFn, PType<K> keyType) {
- return parallelDo(name, new ExtractKeyFn<K, S>(mapFn), getTypeFamily().tableOf(keyType, getPType()));
- }
-
- /**
- * The method creates a {@link TaskInputOutputContext} that will just provide
- * {@linkplain Configuration}. The method has been implemented with Javassist
- * because the API differs between versions of Hadoop: in Hadoop 1.0.3
- * {@linkplain TaskInputOutputContext} is an abstract class, while in version 2
- * it is an interface.
- * <p>
- * Note: The intention of this is to provide the bare essentials that are
- * required to make the {@linkplain MemPipeline} work. It lacks even the basic
- * things that could provide some support for unit testing a pipeline.
- */
- private static TaskInputOutputContext<?, ?, ?, ?> getInMemoryContext(final Configuration conf) {
- ProxyFactory factory = new ProxyFactory();
- Class<TaskInputOutputContext> superType = TaskInputOutputContext.class;
- Class[] types = new Class[0];
- Object[] args = new Object[0];
- if (superType.isInterface()) {
- factory.setInterfaces(new Class[] { superType });
- } else {
- types = new Class[] { Configuration.class, TaskAttemptID.class, RecordWriter.class, OutputCommitter.class,
- StatusReporter.class };
- args = new Object[] { conf, new TaskAttemptID(), null, null, null };
- factory.setSuperclass(superType);
- }
- factory.setFilter(new MethodFilter() {
- @Override
- public boolean isHandled(Method m) {
- String name = m.getName();
- return "getConfiguration".equals(name) || "getCounter".equals(name) || "progress".equals(name);
- }
- });
- MethodHandler handler = new MethodHandler() {
- @Override
- public Object invoke(Object arg0, Method m, Method arg2, Object[] args) throws Throwable {
- String name = m.getName();
- if ("getConfiguration".equals(name)) {
- return conf;
- } else if ("progress".equals(name)) {
- // no-op
- return null;
- } else { // getCounter
- if (args.length == 1) {
- return MemPipeline.getCounters().findCounter((Enum<?>) args[0]);
- } else {
- return MemPipeline.getCounters().findCounter((String) args[0], (String) args[1]);
- }
- }
- }
- };
- try {
- Object newInstance = factory.create(types, args, handler);
- return (TaskInputOutputContext<?, ?, ?, ?>) newInstance;
- } catch (Exception e) {
- e.printStackTrace();
- throw new RuntimeException(e);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mem/collect/MemGroupedTable.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mem/collect/MemGroupedTable.java b/crunch/src/main/java/org/apache/crunch/impl/mem/collect/MemGroupedTable.java
deleted file mode 100644
index d105bb4..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mem/collect/MemGroupedTable.java
+++ /dev/null
@@ -1,113 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mem.collect;
-
-import java.util.Collection;
-import java.util.List;
-import java.util.Map;
-import java.util.TreeMap;
-
-import org.apache.crunch.Aggregator;
-import org.apache.crunch.CombineFn;
-import org.apache.crunch.GroupingOptions;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PGroupedTable;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.Target;
-import org.apache.crunch.fn.Aggregators;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.hadoop.io.RawComparator;
-import org.apache.hadoop.util.ReflectionUtils;
-
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-
-class MemGroupedTable<K, V> extends MemCollection<Pair<K, Iterable<V>>> implements PGroupedTable<K, V> {
-
- private final MemTable<K, V> parent;
-
- private static <S, T> Iterable<Pair<S, Iterable<T>>> buildMap(MemTable<S, T> parent, GroupingOptions options) {
- PType<S> keyType = parent.getKeyType();
- Shuffler<S, T> shuffler = Shuffler.create(keyType, options, parent.getPipeline());
-
- for (Pair<S, T> pair : parent.materialize()) {
- shuffler.add(pair);
- }
-
- return shuffler;
- }
-
- public MemGroupedTable(MemTable<K, V> parent, GroupingOptions options) {
- super(buildMap(parent, options));
- this.parent = parent;
- }
-
- @Override
- public PCollection<Pair<K, Iterable<V>>> union(PCollection<Pair<K, Iterable<V>>>... collections) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public PCollection<Pair<K, Iterable<V>>> write(Target target) {
- getPipeline().write(this.ungroup(), target);
- return this;
- }
-
- @Override
- public PType<Pair<K, Iterable<V>>> getPType() {
- PTableType<K, V> parentType = parent.getPTableType();
- if (parentType != null) {
- return parentType.getGroupedTableType();
- }
- return null;
- }
-
- @Override
- public PTypeFamily getTypeFamily() {
- return parent.getTypeFamily();
- }
-
- @Override
- public long getSize() {
- return 1; // getSize is only used for pipeline optimization in MR
- }
-
- @Override
- public String getName() {
- return "MemGrouped(" + parent.getName() + ")";
- }
-
- @Override
- public PTable<K, V> combineValues(CombineFn<K, V> combineFn) {
- return parallelDo(combineFn, parent.getPTableType());
- }
-
- @Override
- public PTable<K, V> combineValues(Aggregator<V> agg) {
- return combineValues(Aggregators.<K, V>toCombineFn(agg));
- }
-
- @Override
- public PTable<K, V> ungroup() {
- return parent;
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mem/collect/MemTable.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mem/collect/MemTable.java b/crunch/src/main/java/org/apache/crunch/impl/mem/collect/MemTable.java
deleted file mode 100644
index f8a5960..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mem/collect/MemTable.java
+++ /dev/null
@@ -1,177 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mem.collect;
-
-import java.util.Collection;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.crunch.FilterFn;
-import org.apache.crunch.GroupingOptions;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PGroupedTable;
-import org.apache.crunch.PObject;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.Target;
-import org.apache.crunch.lib.Aggregate;
-import org.apache.crunch.lib.Cogroup;
-import org.apache.crunch.lib.Join;
-import org.apache.crunch.lib.PTables;
-import org.apache.crunch.materialize.MaterializableMap;
-import org.apache.crunch.materialize.pobject.MapPObject;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-
-import com.google.common.collect.Lists;
-
-public class MemTable<K, V> extends MemCollection<Pair<K, V>> implements PTable<K, V> {
-
- private PTableType<K, V> ptype;
-
- public MemTable(Iterable<Pair<K, V>> collect) {
- this(collect, null, null);
- }
-
- public MemTable(Iterable<Pair<K, V>> collect, PTableType<K, V> ptype, String name) {
- super(collect, ptype, name);
- this.ptype = ptype;
- }
-
- @Override
- public PTable<K, V> union(PTable<K, V> other) {
- return union(new PTable[] { other });
- }
-
- @Override
- public PTable<K, V> union(PTable<K, V>... others) {
- List<Pair<K, V>> values = Lists.newArrayList();
- values.addAll(getCollection());
- for (PTable<K, V> ptable : others) {
- for (Pair<K, V> p : ptable.materialize()) {
- values.add(p);
- }
- }
- return new MemTable<K, V>(values, others[0].getPTableType(), null);
- }
-
- @Override
- public PGroupedTable<K, V> groupByKey() {
- return groupByKey(null);
- }
-
- @Override
- public PGroupedTable<K, V> groupByKey(int numPartitions) {
- return groupByKey(null);
- }
-
- @Override
- public PGroupedTable<K, V> groupByKey(GroupingOptions options) {
- return new MemGroupedTable<K, V>(this, options);
- }
-
- @Override
- public PTable<K, V> write(Target target) {
- super.write(target);
- return this;
- }
-
- @Override
- public PTable<K, V> write(Target target, Target.WriteMode writeMode) {
- getPipeline().write(this, target, writeMode);
- return this;
- }
-
- @Override
- public PTableType<K, V> getPTableType() {
- return ptype;
- }
-
- @Override
- public PType<K> getKeyType() {
- if (ptype != null) {
- return ptype.getKeyType();
- }
- return null;
- }
-
- @Override
- public PType<V> getValueType() {
- if (ptype != null) {
- return ptype.getValueType();
- }
- return null;
- }
-
- @Override
- public PTable<K, V> filter(FilterFn<Pair<K, V>> filterFn) {
- return parallelDo(filterFn, getPTableType());
- }
-
- @Override
- public PTable<K, V> filter(String name, FilterFn<Pair<K, V>> filterFn) {
- return parallelDo(name, filterFn, getPTableType());
- }
-
- @Override
- public PTable<K, V> top(int count) {
- return Aggregate.top(this, count, true);
- }
-
- @Override
- public PTable<K, V> bottom(int count) {
- return Aggregate.top(this, count, false);
- }
-
- @Override
- public PTable<K, Collection<V>> collectValues() {
- return Aggregate.collectValues(this);
- }
-
- @Override
- public <U> PTable<K, Pair<V, U>> join(PTable<K, U> other) {
- return Join.join(this, other);
- }
-
- @Override
- public <U> PTable<K, Pair<Collection<V>, Collection<U>>> cogroup(PTable<K, U> other) {
- return Cogroup.cogroup(this, other);
- }
-
- @Override
- public PCollection<K> keys() {
- return PTables.keys(this);
- }
-
- @Override
- public PCollection<V> values() {
- return PTables.values(this);
- }
-
- @Override
- public Map<K, V> materializeToMap() {
- return new MaterializableMap<K, V>(this.materialize());
- }
-
- /** {@inheritDoc} */
- @Override
- public PObject<Map<K, V>> asMap() {
- return new MapPObject<K, V>(this);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mem/collect/Shuffler.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mem/collect/Shuffler.java b/crunch/src/main/java/org/apache/crunch/impl/mem/collect/Shuffler.java
deleted file mode 100644
index 2e8f9eb..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mem/collect/Shuffler.java
+++ /dev/null
@@ -1,149 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mem.collect;
-
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Map.Entry;
-import java.util.TreeMap;
-
-import org.apache.crunch.GroupingOptions;
-import org.apache.crunch.Pair;
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.impl.SingleUseIterable;
-import org.apache.crunch.types.PType;
-import org.apache.hadoop.io.RawComparator;
-import org.apache.hadoop.util.ReflectionUtils;
-
-import com.google.common.base.Function;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Iterators;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-
-/**
- * In-memory versions of common MapReduce patterns for aggregating key-value data.
- */
-abstract class Shuffler<K, V> implements Iterable<Pair<K, Iterable<V>>> {
-
- public abstract void add(Pair<K, V> record);
-
- private static <K, V> Map<K, V> getMapForKeyType(PType<?> ptype) {
- if (ptype != null && Comparable.class.isAssignableFrom(ptype.getTypeClass())) {
- return new TreeMap<K, V>();
- } else {
- return Maps.newHashMap();
- }
- }
-
- public static <S, T> Shuffler<S, T> create(PType<S> keyType, GroupingOptions options,
- Pipeline pipeline) {
- Map<S, Collection<T>> map = getMapForKeyType(keyType);
-
- if (options != null) {
- if (Pair.class.equals(keyType.getTypeClass()) && options.getGroupingComparatorClass() != null) {
- PType<?> pairKey = keyType.getSubTypes().get(0);
- return new SecondarySortShuffler(getMapForKeyType(pairKey));
- } else if (options.getSortComparatorClass() != null) {
- RawComparator<S> rc = ReflectionUtils.newInstance(options.getSortComparatorClass(),
- pipeline.getConfiguration());
- map = new TreeMap<S, Collection<T>>(rc);
- }
- }
-
- return new MapShuffler<S, T>(map);
- }
-
- private static class HFunction<K, V> implements Function<Map.Entry<K, Collection<V>>, Pair<K, Iterable<V>>> {
- @Override
- public Pair<K, Iterable<V>> apply(Map.Entry<K, Collection<V>> input) {
- return Pair.<K, Iterable<V>>of(input.getKey(), new SingleUseIterable<V>(input.getValue()));
- }
- }
-
- private static class MapShuffler<K, V> extends Shuffler<K, V> {
- private final Map<K, Collection<V>> map;
-
- public MapShuffler(Map<K, Collection<V>> map) {
- this.map = map;
- }
-
- @Override
- public Iterator<Pair<K, Iterable<V>>> iterator() {
- return Iterators.transform(map.entrySet().iterator(),
- new HFunction<K, V>());
- }
-
- @Override
- public void add(Pair<K, V> record) {
- if (!map.containsKey(record.first())) {
- Collection<V> values = Lists.newArrayList();
- map.put(record.first(), values);
- }
- map.get(record.first()).add(record.second());
- }
- }
-
- private static class SSFunction<K, SK, V> implements
- Function<Map.Entry<K, List<Pair<SK, V>>>, Pair<Pair<K, SK>, Iterable<V>>> {
- @Override
- public Pair<Pair<K, SK>, Iterable<V>> apply(Entry<K, List<Pair<SK, V>>> input) {
- List<Pair<SK, V>> values = input.getValue();
- Collections.sort(values, new Comparator<Pair<SK, V>>() {
- @Override
- public int compare(Pair<SK, V> o1, Pair<SK, V> o2) {
- return ((Comparable) o1.first()).compareTo(o2.first());
- }
- });
- Pair<K, SK> key = Pair.of(input.getKey(), values.get(0).first());
- return Pair.of(key, Iterables.transform(values, new Function<Pair<SK, V>, V>() {
- @Override
- public V apply(Pair<SK, V> input) {
- return input.second();
- }
- }));
- }
- }
-
- private static class SecondarySortShuffler<K, SK, V> extends Shuffler<Pair<K, SK>, V> {
-
- private Map<K, List<Pair<SK, V>>> map;
-
- public SecondarySortShuffler(Map<K, List<Pair<SK, V>>> map) {
- this.map = map;
- }
-
- @Override
- public Iterator<Pair<Pair<K, SK>, Iterable<V>>> iterator() {
- return Iterators.transform(map.entrySet().iterator(), new SSFunction<K, SK, V>());
- }
-
- @Override
- public void add(Pair<Pair<K, SK>, V> record) {
- K primary = record.first().first();
- if (!map.containsKey(primary)) {
- map.put(primary, Lists.<Pair<SK, V>>newArrayList());
- }
- map.get(primary).add(Pair.of(record.first().second(), record.second()));
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mem/emit/InMemoryEmitter.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mem/emit/InMemoryEmitter.java b/crunch/src/main/java/org/apache/crunch/impl/mem/emit/InMemoryEmitter.java
deleted file mode 100644
index 6976615..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mem/emit/InMemoryEmitter.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mem.emit;
-
-import java.util.List;
-
-import org.apache.crunch.Emitter;
-
-import com.google.common.collect.Lists;
-
-/**
- * An {@code Emitter} instance that writes emitted records to a backing
- * {@code List}.
- *
- * @param <T>
- */
-public class InMemoryEmitter<T> implements Emitter<T> {
-
- private final List<T> output;
-
- public InMemoryEmitter() {
- this(Lists.<T> newArrayList());
- }
-
- public InMemoryEmitter(List<T> output) {
- this.output = output;
- }
-
- @Override
- public void emit(T emitted) {
- output.add(emitted);
- }
-
- @Override
- public void flush() {
-
- }
-
- public List<T> getOutput() {
- return output;
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mem/package-info.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mem/package-info.java b/crunch/src/main/java/org/apache/crunch/impl/mem/package-info.java
deleted file mode 100644
index a55b673..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mem/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * In-memory Pipeline implementation for rapid prototyping and testing.
- */
-package org.apache.crunch.impl.mem;
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/MRPipeline.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/MRPipeline.java b/crunch/src/main/java/org/apache/crunch/impl/mr/MRPipeline.java
deleted file mode 100644
index 00cf486..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/MRPipeline.java
+++ /dev/null
@@ -1,396 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr;
-
-import java.io.IOException;
-import java.util.Map;
-import java.util.Random;
-import java.util.Set;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.crunch.CrunchRuntimeException;
-import org.apache.crunch.MapFn;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.PipelineExecution;
-import org.apache.crunch.PipelineResult;
-import org.apache.crunch.Source;
-import org.apache.crunch.SourceTarget;
-import org.apache.crunch.TableSource;
-import org.apache.crunch.Target;
-import org.apache.crunch.Target.WriteMode;
-import org.apache.crunch.fn.IdentityFn;
-import org.apache.crunch.impl.mr.collect.InputCollection;
-import org.apache.crunch.impl.mr.collect.InputTable;
-import org.apache.crunch.impl.mr.collect.PCollectionImpl;
-import org.apache.crunch.impl.mr.collect.PGroupedTableImpl;
-import org.apache.crunch.impl.mr.collect.UnionCollection;
-import org.apache.crunch.impl.mr.collect.UnionTable;
-import org.apache.crunch.impl.mr.exec.MRExecutor;
-import org.apache.crunch.impl.mr.plan.MSCRPlanner;
-import org.apache.crunch.impl.mr.run.RuntimeParameters;
-import org.apache.crunch.io.From;
-import org.apache.crunch.io.ReadableSource;
-import org.apache.crunch.io.ReadableSourceTarget;
-import org.apache.crunch.io.To;
-import org.apache.crunch.materialize.MaterializableIterable;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.writable.Writables;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-
-import com.google.common.collect.Maps;
-import com.google.common.collect.Sets;
-
-/**
- * Pipeline implementation that is executed within Hadoop MapReduce.
- */
-public class MRPipeline implements Pipeline {
-
- private static final Log LOG = LogFactory.getLog(MRPipeline.class);
-
- private static final Random RANDOM = new Random();
-
- private final Class<?> jarClass;
- private final String name;
- private final Map<PCollectionImpl<?>, Set<Target>> outputTargets;
- private final Map<PCollectionImpl<?>, MaterializableIterable<?>> outputTargetsToMaterialize;
- private Path tempDirectory;
- private int tempFileIndex;
- private int nextAnonymousStageId;
-
- private Configuration conf;
-
- /**
- * Instantiate with a default Configuration and name.
- *
- * @param jarClass Class containing the main driver method for running the pipeline
- */
- public MRPipeline(Class<?> jarClass) {
- this(jarClass, new Configuration());
- }
-
- /**
- * Instantiate with a custom pipeline name. The name will be displayed in the Hadoop JobTracker.
- *
- * @param jarClass Class containing the main driver method for running the pipeline
- * @param name Display name of the pipeline
- */
- public MRPipeline(Class<?> jarClass, String name) {
- this(jarClass, name, new Configuration());
- }
-
- /**
- * Instantiate with a custom configuration and default naming.
- *
- * @param jarClass Class containing the main driver method for running the pipeline
- * @param conf Configuration to be used within all MapReduce jobs run in the pipeline
- */
- public MRPipeline(Class<?> jarClass, Configuration conf) {
- this(jarClass, jarClass.getName(), conf);
- }
-
- /**
- * Instantiate with a custom name and configuration. The name will be displayed in the Hadoop
- * JobTracker.
- *
- * @param jarClass Class containing the main driver method for running the pipeline
- * @param name Display name of the pipeline
- * @param conf Configuration to be used within all MapReduce jobs run in the pipeline
- */
- public MRPipeline(Class<?> jarClass, String name, Configuration conf) {
- this.jarClass = jarClass;
- this.name = name;
- this.outputTargets = Maps.newHashMap();
- this.outputTargetsToMaterialize = Maps.newHashMap();
- this.conf = conf;
- this.tempDirectory = createTempDirectory(conf);
- this.tempFileIndex = 0;
- this.nextAnonymousStageId = 0;
- }
-
- @Override
- public Configuration getConfiguration() {
- return conf;
- }
-
- @Override
- public void setConfiguration(Configuration conf) {
- this.conf = conf;
- this.tempDirectory = createTempDirectory(conf);
- }
-
- public MRExecutor plan() {
- Map<PCollectionImpl<?>, MaterializableIterable> toMaterialize = Maps.newHashMap();
- for (PCollectionImpl<?> c : outputTargets.keySet()) {
- if (outputTargetsToMaterialize.containsKey(c)) {
- toMaterialize.put(c, outputTargetsToMaterialize.get(c));
- outputTargetsToMaterialize.remove(c);
- }
- }
- MSCRPlanner planner = new MSCRPlanner(this, outputTargets, toMaterialize);
- try {
- return planner.plan(jarClass, conf);
- } catch (IOException e) {
- throw new CrunchRuntimeException(e);
- }
- }
-
- @Override
- public PipelineResult run() {
- try {
- PipelineExecution pipelineExecution = runAsync();
- pipelineExecution.waitUntilDone();
- return pipelineExecution.getResult();
- } catch (InterruptedException e) {
- // TODO: How to handle this without changing signature?
- LOG.error("Exception running pipeline", e);
- return PipelineResult.EMPTY;
- }
- }
-
- @Override
- public PipelineExecution runAsync() {
- PipelineExecution res = plan().execute();
- outputTargets.clear();
- return res;
- }
-
- @Override
- public PipelineResult done() {
- PipelineResult res = null;
- if (!outputTargets.isEmpty()) {
- res = run();
- }
- cleanup();
- return res;
- }
-
- public <S> PCollection<S> read(Source<S> source) {
- return new InputCollection<S>(source, this);
- }
-
- public <K, V> PTable<K, V> read(TableSource<K, V> source) {
- return new InputTable<K, V>(source, this);
- }
-
- public PCollection<String> readTextFile(String pathName) {
- return read(From.textFile(pathName));
- }
-
- public void write(PCollection<?> pcollection, Target target) {
- write(pcollection, target, Target.WriteMode.DEFAULT);
- }
-
- @SuppressWarnings("unchecked")
- public void write(PCollection<?> pcollection, Target target,
- Target.WriteMode writeMode) {
- if (pcollection instanceof PGroupedTableImpl) {
- pcollection = ((PGroupedTableImpl<?, ?>) pcollection).ungroup();
- } else if (pcollection instanceof UnionCollection || pcollection instanceof UnionTable) {
- pcollection = pcollection.parallelDo("UnionCollectionWrapper",
- (MapFn) IdentityFn.<Object> getInstance(), pcollection.getPType());
- }
- target.handleExisting(writeMode, getConfiguration());
- if (writeMode != WriteMode.APPEND && targetInCurrentRun(target)) {
- throw new CrunchRuntimeException("Target " + target + " is already written in current run." +
- " Use WriteMode.APPEND in order to write additional data to it.");
- }
- addOutput((PCollectionImpl<?>) pcollection, target);
- }
-
- private boolean targetInCurrentRun(Target target) {
- for (Set<Target> targets : outputTargets.values()) {
- if (targets.contains(target)) {
- return true;
- }
- }
- return false;
- }
-
- private void addOutput(PCollectionImpl<?> impl, Target target) {
- if (!outputTargets.containsKey(impl)) {
- outputTargets.put(impl, Sets.<Target> newHashSet());
- }
- outputTargets.get(impl).add(target);
- }
-
- @Override
- public <T> Iterable<T> materialize(PCollection<T> pcollection) {
-
- PCollectionImpl<T> pcollectionImpl = toPcollectionImpl(pcollection);
- ReadableSource<T> readableSrc = getMaterializeSourceTarget(pcollectionImpl);
-
- MaterializableIterable<T> c = new MaterializableIterable<T>(this, readableSrc);
- if (!outputTargetsToMaterialize.containsKey(pcollectionImpl)) {
- outputTargetsToMaterialize.put(pcollectionImpl, c);
- }
- return c;
- }
-
- /**
- * Retrieve a ReadableSourceTarget that provides access to the contents of a {@link PCollection}.
- * This is primarily intended as a helper method to {@link #materialize(PCollection)}. The
- * underlying data of the ReadableSourceTarget may not be actually present until the pipeline is
- * run.
- *
- * @param pcollection The collection for which the ReadableSourceTarget is to be retrieved
- * @return The ReadableSourceTarget
- * @throws IllegalArgumentException If no ReadableSourceTarget can be retrieved for the given
- * PCollection
- */
- public <T> ReadableSource<T> getMaterializeSourceTarget(PCollection<T> pcollection) {
- PCollectionImpl<T> impl = toPcollectionImpl(pcollection);
-
- // First, check to see if this is a readable input collection.
- if (impl instanceof InputCollection) {
- InputCollection<T> ic = (InputCollection<T>) impl;
- if (ic.getSource() instanceof ReadableSource) {
- return (ReadableSource) ic.getSource();
- } else {
- throw new IllegalArgumentException(
- "Cannot materialize non-readable input collection: " + ic);
- }
- } else if (impl instanceof InputTable) {
- InputTable it = (InputTable) impl;
- if (it.getSource() instanceof ReadableSource) {
- return (ReadableSource) it.getSource();
- } else {
- throw new IllegalArgumentException(
- "Cannot materialize non-readable input table: " + it);
- }
- }
-
- // Next, check to see if this pcollection has already been materialized.
- SourceTarget<T> matTarget = impl.getMaterializedAt();
- if (matTarget != null && matTarget instanceof ReadableSourceTarget) {
- return (ReadableSourceTarget<T>) matTarget;
- }
-
- // Check to see if we plan on materializing this collection on the
- // next run.
- ReadableSourceTarget<T> srcTarget = null;
- if (outputTargets.containsKey(pcollection)) {
- for (Target target : outputTargets.get(impl)) {
- if (target instanceof ReadableSourceTarget) {
- return (ReadableSourceTarget<T>) target;
- }
- }
- }
-
- // If we're not planning on materializing it already, create a temporary
- // output to hold the materialized records and return that.
- SourceTarget<T> st = createIntermediateOutput(pcollection.getPType());
- if (!(st instanceof ReadableSourceTarget)) {
- throw new IllegalArgumentException("The PType for the given PCollection is not readable"
- + " and cannot be materialized");
- } else {
- srcTarget = (ReadableSourceTarget<T>) st;
- addOutput(impl, srcTarget);
- return srcTarget;
- }
- }
-
- /**
- * Safely cast a PCollection into a PCollectionImpl, including handling the case of
- * UnionCollections.
- *
- * @param pcollection The PCollection to be cast/transformed
- * @return The PCollectionImpl representation
- */
- private <T> PCollectionImpl<T> toPcollectionImpl(PCollection<T> pcollection) {
- PCollectionImpl<T> pcollectionImpl = null;
- if (pcollection instanceof UnionCollection || pcollection instanceof UnionTable) {
- pcollectionImpl = (PCollectionImpl<T>) pcollection.parallelDo("UnionCollectionWrapper",
- (MapFn) IdentityFn.<Object> getInstance(), pcollection.getPType());
- } else {
- pcollectionImpl = (PCollectionImpl<T>) pcollection;
- }
- return pcollectionImpl;
- }
-
- public <T> SourceTarget<T> createIntermediateOutput(PType<T> ptype) {
- return ptype.getDefaultFileSource(createTempPath());
- }
-
- public Path createTempPath() {
- tempFileIndex++;
- return new Path(tempDirectory, "p" + tempFileIndex);
- }
-
- private static Path createTempDirectory(Configuration conf) {
- Path dir = createTemporaryPath(conf);
- try {
- dir.getFileSystem(conf).mkdirs(dir);
- } catch (IOException e) {
- throw new RuntimeException("Cannot create job output directory " + dir, e);
- }
- return dir;
- }
-
- private static Path createTemporaryPath(Configuration conf) {
- String baseDir = conf.get(RuntimeParameters.TMP_DIR, "/tmp");
- return new Path(baseDir, "crunch-" + (RANDOM.nextInt() & Integer.MAX_VALUE));
- }
-
- @Override
- public <T> void writeTextFile(PCollection<T> pcollection, String pathName) {
- pcollection.parallelDo("asText", new StringifyFn<T>(), Writables.strings())
- .write(To.textFile(pathName));
- }
-
- private static class StringifyFn<T> extends MapFn<T, String> {
- @Override
- public String map(T input) {
- return input.toString();
- }
- }
-
- private void cleanup() {
- if (!outputTargets.isEmpty()) {
- LOG.warn("Not running cleanup while output targets remain");
- return;
- }
- try {
- FileSystem fs = tempDirectory.getFileSystem(conf);
- if (fs.exists(tempDirectory)) {
- fs.delete(tempDirectory, true);
- }
- } catch (IOException e) {
- LOG.info("Exception during cleanup", e);
- }
- }
-
- public int getNextAnonymousStageId() {
- return nextAnonymousStageId++;
- }
-
- @Override
- public void enableDebug() {
- // Turn on Crunch runtime error catching.
- getConfiguration().setBoolean(RuntimeParameters.DEBUG, true);
- }
-
- @Override
- public String getName() {
- return name;
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/collect/DoCollectionImpl.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/collect/DoCollectionImpl.java b/crunch/src/main/java/org/apache/crunch/impl/mr/collect/DoCollectionImpl.java
deleted file mode 100644
index 7b8f2ea..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/collect/DoCollectionImpl.java
+++ /dev/null
@@ -1,74 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.collect;
-
-import java.util.List;
-import java.util.Set;
-
-import org.apache.crunch.DoFn;
-import org.apache.crunch.ParallelDoOptions;
-import org.apache.crunch.SourceTarget;
-import org.apache.crunch.impl.mr.plan.DoNode;
-import org.apache.crunch.types.PType;
-
-import com.google.common.collect.ImmutableList;
-import com.google.common.collect.ImmutableSet;
-
-public class DoCollectionImpl<S> extends PCollectionImpl<S> {
-
- private final PCollectionImpl<Object> parent;
- private final DoFn<Object, S> fn;
- private final PType<S> ntype;
-
- <T> DoCollectionImpl(String name, PCollectionImpl<T> parent, DoFn<T, S> fn, PType<S> ntype) {
- this(name, parent, fn, ntype, ParallelDoOptions.builder().build());
- }
-
- <T> DoCollectionImpl(String name, PCollectionImpl<T> parent, DoFn<T, S> fn, PType<S> ntype,
- ParallelDoOptions options) {
- super(name, options);
- this.parent = (PCollectionImpl<Object>) parent;
- this.fn = (DoFn<Object, S>) fn;
- this.ntype = ntype;
- }
-
- @Override
- protected long getSizeInternal() {
- return (long) (fn.scaleFactor() * parent.getSize());
- }
-
- @Override
- public PType<S> getPType() {
- return ntype;
- }
-
- @Override
- protected void acceptInternal(PCollectionImpl.Visitor visitor) {
- visitor.visitDoFnCollection(this);
- }
-
- @Override
- public List<PCollectionImpl<?>> getParents() {
- return ImmutableList.<PCollectionImpl<?>> of(parent);
- }
-
- @Override
- public DoNode createDoNode() {
- return DoNode.createFnNode(getName(), fn, ntype);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/collect/DoTableImpl.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/collect/DoTableImpl.java b/crunch/src/main/java/org/apache/crunch/impl/mr/collect/DoTableImpl.java
deleted file mode 100644
index 176643b..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/collect/DoTableImpl.java
+++ /dev/null
@@ -1,84 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.collect;
-
-import java.util.List;
-
-import org.apache.crunch.CombineFn;
-import org.apache.crunch.DoFn;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.ParallelDoOptions;
-import org.apache.crunch.impl.mr.plan.DoNode;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-
-import com.google.common.collect.ImmutableList;
-
-public class DoTableImpl<K, V> extends PTableBase<K, V> implements PTable<K, V> {
-
- private final PCollectionImpl<?> parent;
- private final DoFn<?, Pair<K, V>> fn;
- private final PTableType<K, V> type;
-
- <S> DoTableImpl(String name, PCollectionImpl<S> parent, DoFn<S, Pair<K, V>> fn, PTableType<K, V> ntype) {
- this(name, parent, fn, ntype, ParallelDoOptions.builder().build());
- }
-
- <S> DoTableImpl(String name, PCollectionImpl<S> parent, DoFn<S, Pair<K, V>> fn, PTableType<K, V> ntype,
- ParallelDoOptions options) {
- super(name, options);
- this.parent = parent;
- this.fn = fn;
- this.type = ntype;
- }
-
- @Override
- protected long getSizeInternal() {
- return (long) (fn.scaleFactor() * parent.getSize());
- }
-
- @Override
- public PTableType<K, V> getPTableType() {
- return type;
- }
-
- @Override
- protected void acceptInternal(PCollectionImpl.Visitor visitor) {
- visitor.visitDoTable(this);
- }
-
- @Override
- public PType<Pair<K, V>> getPType() {
- return type;
- }
-
- @Override
- public List<PCollectionImpl<?>> getParents() {
- return ImmutableList.<PCollectionImpl<?>> of(parent);
- }
-
- @Override
- public DoNode createDoNode() {
- return DoNode.createFnNode(getName(), fn, type);
- }
-
- public boolean hasCombineFn() {
- return fn instanceof CombineFn;
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/collect/InputCollection.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/collect/InputCollection.java b/crunch/src/main/java/org/apache/crunch/impl/mr/collect/InputCollection.java
deleted file mode 100644
index ace5cc1..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/collect/InputCollection.java
+++ /dev/null
@@ -1,85 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.collect;
-
-import java.util.List;
-
-import org.apache.commons.lang.builder.HashCodeBuilder;
-import org.apache.crunch.Source;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.impl.mr.plan.DoNode;
-import org.apache.crunch.types.PType;
-
-import com.google.common.collect.ImmutableList;
-
-public class InputCollection<S> extends PCollectionImpl<S> {
-
- private final Source<S> source;
-
- public InputCollection(Source<S> source, MRPipeline pipeline) {
- super(source.toString());
- this.source = source;
- this.pipeline = pipeline;
- }
-
- @Override
- public PType<S> getPType() {
- return source.getType();
- }
-
- public Source<S> getSource() {
- return source;
- }
-
- @Override
- protected long getSizeInternal() {
- long sz = source.getSize(pipeline.getConfiguration());
- if (sz < 0) {
- throw new IllegalStateException("Input source " + source + " does not exist!");
- }
- return sz;
- }
-
- @Override
- protected void acceptInternal(PCollectionImpl.Visitor visitor) {
- visitor.visitInputCollection(this);
- }
-
- @Override
- public List<PCollectionImpl<?>> getParents() {
- return ImmutableList.of();
- }
-
- @Override
- public DoNode createDoNode() {
- return DoNode.createInputNode(source);
- }
-
- @Override
- public boolean equals(Object obj) {
- if (obj == null || !(obj instanceof InputCollection)) {
- return false;
- }
- return source.equals(((InputCollection) obj).source);
- }
-
- @Override
- public int hashCode() {
- return new HashCodeBuilder().append(source).toHashCode();
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/impl/mr/collect/InputTable.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/impl/mr/collect/InputTable.java b/crunch/src/main/java/org/apache/crunch/impl/mr/collect/InputTable.java
deleted file mode 100644
index 71f11c5..0000000
--- a/crunch/src/main/java/org/apache/crunch/impl/mr/collect/InputTable.java
+++ /dev/null
@@ -1,86 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.impl.mr.collect;
-
-import java.util.List;
-
-import org.apache.crunch.Pair;
-import org.apache.crunch.TableSource;
-import org.apache.crunch.impl.mr.MRPipeline;
-import org.apache.crunch.impl.mr.plan.DoNode;
-import org.apache.crunch.types.PTableType;
-import org.apache.crunch.types.PType;
-
-import com.google.common.collect.ImmutableList;
-
-public class InputTable<K, V> extends PTableBase<K, V> {
-
- private final TableSource<K, V> source;
- private final InputCollection<Pair<K, V>> asCollection;
-
- public InputTable(TableSource<K, V> source, MRPipeline pipeline) {
- super(source.toString());
- this.source = source;
- this.pipeline = pipeline;
- this.asCollection = new InputCollection<Pair<K, V>>(source, pipeline);
- }
-
- public TableSource<K, V> getSource() {
- return source;
- }
-
- @Override
- protected long getSizeInternal() {
- return asCollection.getSizeInternal();
- }
-
- @Override
- public PTableType<K, V> getPTableType() {
- return source.getTableType();
- }
-
- @Override
- public PType<Pair<K, V>> getPType() {
- return source.getType();
- }
-
- @Override
- public List<PCollectionImpl<?>> getParents() {
- return ImmutableList.of();
- }
-
- @Override
- protected void acceptInternal(PCollectionImpl.Visitor visitor) {
- visitor.visitInputCollection(asCollection);
- }
-
- @Override
- public DoNode createDoNode() {
- return DoNode.createInputNode(source);
- }
-
- @Override
- public int hashCode() {
- return asCollection.hashCode();
- }
-
- @Override
- public boolean equals(Object other) {
- return asCollection.equals(other);
- }
-}
[07/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/lib/join/InnerJoinFn.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/lib/join/InnerJoinFn.java b/crunch/src/main/java/org/apache/crunch/lib/join/InnerJoinFn.java
deleted file mode 100644
index a3d30d2..0000000
--- a/crunch/src/main/java/org/apache/crunch/lib/join/InnerJoinFn.java
+++ /dev/null
@@ -1,78 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib.join;
-
-import java.util.List;
-
-import org.apache.crunch.Emitter;
-import org.apache.crunch.Pair;
-import org.apache.crunch.types.PType;
-
-import com.google.common.collect.Lists;
-
-/**
- * Used to perform the last step of an inner join.
- *
- * @param <K> Type of the keys.
- * @param <U> Type of the first {@link org.apache.crunch.PTable}'s values
- * @param <V> Type of the second {@link org.apache.crunch.PTable}'s values
- */
-public class InnerJoinFn<K, U, V> extends JoinFn<K, U, V> {
-
- private transient K lastKey;
- private transient List<U> leftValues;
-
- public InnerJoinFn(PType<K> keyType, PType<U> leftValueType) {
- super(keyType, leftValueType);
- }
-
- /** {@inheritDoc} */
- @Override
- public void initialize() {
- super.initialize();
- lastKey = null;
- this.leftValues = Lists.newArrayList();
- }
-
- /** {@inheritDoc} */
- @Override
- public void join(K key, int id, Iterable<Pair<U, V>> pairs, Emitter<Pair<K, Pair<U, V>>> emitter) {
- if (!key.equals(lastKey)) {
- lastKey = keyType.getDetachedValue(key);
- leftValues.clear();
- }
- if (id == 0) { // from left
- for (Pair<U, V> pair : pairs) {
- if (pair.first() != null)
- leftValues.add(leftValueType.getDetachedValue(pair.first()));
- }
- } else { // from right
- for (Pair<U, V> pair : pairs) {
- for (U u : leftValues) {
- emitter.emit(Pair.of(lastKey, Pair.of(u, pair.second())));
- }
- }
- }
- }
-
- /** {@inheritDoc} */
- @Override
- public String getJoinType() {
- return "innerJoin";
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/lib/join/JoinFn.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/lib/join/JoinFn.java b/crunch/src/main/java/org/apache/crunch/lib/join/JoinFn.java
deleted file mode 100644
index 99aea5a..0000000
--- a/crunch/src/main/java/org/apache/crunch/lib/join/JoinFn.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib.join;
-
-import org.apache.crunch.DoFn;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.Pair;
-import org.apache.crunch.types.PType;
-
-/**
- * Represents a {@link org.apache.crunch.DoFn} for performing joins.
- *
- * @param <K> Type of the keys.
- * @param <U> Type of the first {@link org.apache.crunch.PTable}'s values
- * @param <V> Type of the second {@link org.apache.crunch.PTable}'s values
- */
-public abstract class JoinFn<K, U, V> extends
- DoFn<Pair<Pair<K, Integer>, Iterable<Pair<U, V>>>, Pair<K, Pair<U, V>>> {
-
- protected PType<K> keyType;
- protected PType<U> leftValueType;
-
- /**
- * Instantiate with the PType of the value of the left side of the join (used for creating deep
- * copies of values).
- *
- * @param keyType The PType of the value used as the key of the join
- * @param leftValueType The PType of the value type of the left side of the join
- */
- public JoinFn(PType<K> keyType, PType<U> leftValueType) {
- this.keyType = keyType;
- this.leftValueType = leftValueType;
- }
-
- @Override
- public void initialize() {
- this.keyType.initialize(getConfiguration());
- this.leftValueType.initialize(getConfiguration());
- }
-
- /** @return The name of this join type (e.g. innerJoin, leftOuterJoin). */
- public abstract String getJoinType();
-
- /**
- * Performs the actual joining.
- *
- * @param key The key for this grouping of values.
- * @param id The side that this group of values is from (0 -> left, 1 -> right).
- * @param pairs The group of values associated with this key and id pair.
- * @param emitter The emitter to send the output to.
- */
- public abstract void join(K key, int id, Iterable<Pair<U, V>> pairs,
- Emitter<Pair<K, Pair<U, V>>> emitter);
-
- /**
- * Split up the input record to make coding a bit more manageable.
- *
- * @param input The input record.
- * @param emitter The emitter to send the output to.
- */
- @Override
- public void process(Pair<Pair<K, Integer>, Iterable<Pair<U, V>>> input,
- Emitter<Pair<K, Pair<U, V>>> emitter) {
- join(input.first().first(), input.first().second(), input.second(), emitter);
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/lib/join/JoinUtils.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/lib/join/JoinUtils.java b/crunch/src/main/java/org/apache/crunch/lib/join/JoinUtils.java
deleted file mode 100644
index 6efeccb..0000000
--- a/crunch/src/main/java/org/apache/crunch/lib/join/JoinUtils.java
+++ /dev/null
@@ -1,126 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib.join;
-
-import org.apache.avro.Schema;
-import org.apache.avro.generic.IndexedRecord;
-import org.apache.avro.io.BinaryData;
-import org.apache.avro.mapred.AvroJob;
-import org.apache.avro.mapred.AvroKey;
-import org.apache.avro.mapred.AvroValue;
-import org.apache.avro.mapred.AvroWrapper;
-import org.apache.avro.reflect.ReflectData;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.crunch.types.writable.TupleWritable;
-import org.apache.crunch.types.writable.WritableTypeFamily;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.io.DataInputBuffer;
-import org.apache.hadoop.io.RawComparator;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.io.WritableComparable;
-import org.apache.hadoop.mapreduce.Partitioner;
-
-/**
- * Utilities that are useful in joining multiple data sets via a MapReduce.
- *
- */
-public class JoinUtils {
-
- public static Class<? extends Partitioner> getPartitionerClass(PTypeFamily typeFamily) {
- if (typeFamily == WritableTypeFamily.getInstance()) {
- return TupleWritablePartitioner.class;
- } else {
- return AvroIndexedRecordPartitioner.class;
- }
- }
-
- public static Class<? extends RawComparator> getGroupingComparator(PTypeFamily typeFamily) {
- if (typeFamily == WritableTypeFamily.getInstance()) {
- return TupleWritableComparator.class;
- } else {
- return AvroPairGroupingComparator.class;
- }
- }
-
- public static class TupleWritablePartitioner extends Partitioner<TupleWritable, Writable> {
- @Override
- public int getPartition(TupleWritable key, Writable value, int numPartitions) {
- return (Math.abs(key.get(0).hashCode()) & Integer.MAX_VALUE) % numPartitions;
- }
- }
-
- public static class TupleWritableComparator implements RawComparator<TupleWritable> {
-
- private DataInputBuffer buffer = new DataInputBuffer();
- private TupleWritable key1 = new TupleWritable();
- private TupleWritable key2 = new TupleWritable();
-
- @Override
- public int compare(TupleWritable o1, TupleWritable o2) {
- return ((WritableComparable) o1.get(0)).compareTo((WritableComparable) o2.get(0));
- }
-
- @Override
- public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
- try {
- buffer.reset(b1, s1, l1);
- key1.readFields(buffer);
-
- buffer.reset(b2, s2, l2);
- key2.readFields(buffer);
- } catch (Exception e) {
- throw new RuntimeException(e);
- }
-
- return compare(key1, key2);
- }
- }
-
- public static class AvroIndexedRecordPartitioner<K, V> extends Partitioner<AvroKey<K>, AvroValue<V>> {
- @Override
- public int getPartition(AvroKey<K> key, AvroValue<V> value, int numPartitions) {
- IndexedRecord record = (IndexedRecord) key.datum();
- return (Math.abs(record.get(0).hashCode()) & Integer.MAX_VALUE) % numPartitions;
- }
- }
-
- public static class AvroPairGroupingComparator<T> extends Configured implements RawComparator<AvroWrapper<T>> {
- private Schema schema;
-
- @Override
- public void setConf(Configuration conf) {
- super.setConf(conf);
- if (conf != null) {
- Schema mapOutputSchema = AvroJob.getMapOutputSchema(conf);
- Schema keySchema = org.apache.avro.mapred.Pair.getKeySchema(mapOutputSchema);
- schema = keySchema.getFields().get(0).schema();
- }
- }
-
- @Override
- public int compare(AvroWrapper<T> x, AvroWrapper<T> y) {
- return ReflectData.get().compare(x.datum(), y.datum(), schema);
- }
-
- @Override
- public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
- return BinaryData.compare(b1, s1, l1, b2, s2, l2, schema);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/lib/join/LeftOuterJoinFn.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/lib/join/LeftOuterJoinFn.java b/crunch/src/main/java/org/apache/crunch/lib/join/LeftOuterJoinFn.java
deleted file mode 100644
index 731c496..0000000
--- a/crunch/src/main/java/org/apache/crunch/lib/join/LeftOuterJoinFn.java
+++ /dev/null
@@ -1,98 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib.join;
-
-import java.util.List;
-
-import org.apache.crunch.Emitter;
-import org.apache.crunch.Pair;
-import org.apache.crunch.types.PType;
-
-import com.google.common.collect.Lists;
-
-/**
- * Used to perform the last step of an left outer join.
- *
- * @param <K> Type of the keys.
- * @param <U> Type of the first {@link org.apache.crunch.PTable}'s values
- * @param <V> Type of the second {@link org.apache.crunch.PTable}'s values
- */
-public class LeftOuterJoinFn<K, U, V> extends JoinFn<K, U, V> {
-
- private transient int lastId;
- private transient K lastKey;
- private transient List<U> leftValues;
-
- public LeftOuterJoinFn(PType<K> keyType, PType<U> leftValueType) {
- super(keyType, leftValueType);
- }
-
- /** {@inheritDoc} */
- @Override
- public void initialize() {
- super.initialize();
- lastId = 1;
- lastKey = null;
- this.leftValues = Lists.newArrayList();
- }
-
- /** {@inheritDoc} */
- @Override
- public void join(K key, int id, Iterable<Pair<U, V>> pairs, Emitter<Pair<K, Pair<U, V>>> emitter) {
- if (!key.equals(lastKey)) {
- // Make sure that left side always gets emitted.
- if (0 == lastId) {
- for (U u : leftValues) {
- emitter.emit(Pair.of(lastKey, Pair.of(u, (V) null)));
- }
- }
- lastKey = keyType.getDetachedValue(key);
- leftValues.clear();
- }
- if (id == 0) {
- for (Pair<U, V> pair : pairs) {
- if (pair.first() != null)
- leftValues.add(leftValueType.getDetachedValue(pair.first()));
- }
- } else {
- for (Pair<U, V> pair : pairs) {
- for (U u : leftValues) {
- emitter.emit(Pair.of(lastKey, Pair.of(u, pair.second())));
- }
- }
- }
-
- lastId = id;
- }
-
- /** {@inheritDoc} */
- @Override
- public void cleanup(Emitter<Pair<K, Pair<U, V>>> emitter) {
- if (0 == lastId) {
- for (U u : leftValues) {
- emitter.emit(Pair.of(lastKey, Pair.of(u, (V) null)));
- }
- }
- }
-
- /** {@inheritDoc} */
- @Override
- public String getJoinType() {
- return "leftOuterJoin";
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/lib/join/MapsideJoin.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/lib/join/MapsideJoin.java b/crunch/src/main/java/org/apache/crunch/lib/join/MapsideJoin.java
deleted file mode 100644
index 56476c1..0000000
--- a/crunch/src/main/java/org/apache/crunch/lib/join/MapsideJoin.java
+++ /dev/null
@@ -1,164 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib.join;
-
-import java.io.IOException;
-
-import org.apache.crunch.CrunchRuntimeException;
-import org.apache.crunch.DoFn;
-import org.apache.crunch.Emitter;
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.ParallelDoOptions;
-import org.apache.crunch.SourceTarget;
-import org.apache.crunch.io.ReadableSourceTarget;
-import org.apache.crunch.materialize.MaterializableIterable;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.crunch.util.DistCache;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-
-import com.google.common.collect.ArrayListMultimap;
-import com.google.common.collect.HashMultimap;
-import com.google.common.collect.Multimap;
-
-/**
- * Utility for doing map side joins on a common key between two {@link PTable}s.
- * <p>
- * A map side join is an optimized join which doesn't use a reducer; instead,
- * the right side of the join is loaded into memory and the join is performed in
- * a mapper. This style of join has the important implication that the output of
- * the join is not sorted, which is the case with a conventional (reducer-based)
- * join.
- * <p>
- * <b>Note:</b>This utility is only supported when running with a
- * {@link MRPipeline} as the pipeline.
- */
-public class MapsideJoin {
-
- /**
- * Join two tables using a map side join. The right-side table will be loaded
- * fully in memory, so this method should only be used if the right side
- * table's contents can fit in the memory allocated to mappers. The join
- * performed by this method is an inner join.
- *
- * @param left
- * The left-side table of the join
- * @param right
- * The right-side table of the join, whose contents will be fully
- * read into memory
- * @return A table keyed on the join key, containing pairs of joined values
- */
- public static <K, U, V> PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right) {
- PTypeFamily tf = left.getTypeFamily();
- Iterable<Pair<K, V>> iterable = right.materialize();
-
- if (iterable instanceof MaterializableIterable) {
- MaterializableIterable<Pair<K, V>> mi = (MaterializableIterable<Pair<K, V>>) iterable;
- MapsideJoinDoFn<K, U, V> mapJoinDoFn = new MapsideJoinDoFn<K, U, V>(mi.getPath().toString(),
- right.getPType());
- ParallelDoOptions.Builder optionsBuilder = ParallelDoOptions.builder();
- if (mi.isSourceTarget()) {
- optionsBuilder.sourceTargets((SourceTarget) mi.getSource());
- }
- return left.parallelDo("mapjoin", mapJoinDoFn,
- tf.tableOf(left.getKeyType(), tf.pairs(left.getValueType(), right.getValueType())),
- optionsBuilder.build());
- } else { // in-memory pipeline
- return left.parallelDo(new InMemoryJoinFn<K, U, V>(iterable),
- tf.tableOf(left.getKeyType(), tf.pairs(left.getValueType(), right.getValueType())));
- }
- }
-
- static class InMemoryJoinFn<K, U, V> extends DoFn<Pair<K, U>, Pair<K, Pair<U, V>>> {
-
- private Multimap<K, V> joinMap;
-
- public InMemoryJoinFn(Iterable<Pair<K, V>> iterable) {
- joinMap = HashMultimap.create();
- for (Pair<K, V> joinPair : iterable) {
- joinMap.put(joinPair.first(), joinPair.second());
- }
- }
-
- @Override
- public void process(Pair<K, U> input, Emitter<Pair<K, Pair<U, V>>> emitter) {
- K key = input.first();
- U value = input.second();
- for (V joinValue : joinMap.get(key)) {
- Pair<U, V> valuePair = Pair.of(value, joinValue);
- emitter.emit(Pair.of(key, valuePair));
- }
- }
- }
-
- static class MapsideJoinDoFn<K, U, V> extends DoFn<Pair<K, U>, Pair<K, Pair<U, V>>> {
-
- private String inputPath;
- private PType<Pair<K, V>> ptype;
- private Multimap<K, V> joinMap;
-
- public MapsideJoinDoFn(String inputPath, PType<Pair<K, V>> ptype) {
- this.inputPath = inputPath;
- this.ptype = ptype;
- }
-
- private Path getCacheFilePath() {
- Path local = DistCache.getPathToCacheFile(new Path(inputPath), getConfiguration());
- if (local == null) {
- throw new CrunchRuntimeException("Can't find local cache file for '" + inputPath + "'");
- }
- return local;
- }
-
- @Override
- public void configure(Configuration conf) {
- DistCache.addCacheFile(new Path(inputPath), conf);
- }
-
- @Override
- public void initialize() {
- super.initialize();
-
- ReadableSourceTarget<Pair<K, V>> sourceTarget = ptype.getDefaultFileSource(
- getCacheFilePath());
- Iterable<Pair<K, V>> iterable = null;
- try {
- iterable = sourceTarget.read(getConfiguration());
- } catch (IOException e) {
- throw new CrunchRuntimeException("Error reading right-side of map side join: ", e);
- }
-
- joinMap = ArrayListMultimap.create();
- for (Pair<K, V> joinPair : iterable) {
- joinMap.put(joinPair.first(), joinPair.second());
- }
- }
-
- @Override
- public void process(Pair<K, U> input, Emitter<Pair<K, Pair<U, V>>> emitter) {
- K key = input.first();
- U value = input.second();
- for (V joinValue : joinMap.get(key)) {
- Pair<U, V> valuePair = Pair.of(value, joinValue);
- emitter.emit(Pair.of(key, valuePair));
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/lib/join/RightOuterJoinFn.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/lib/join/RightOuterJoinFn.java b/crunch/src/main/java/org/apache/crunch/lib/join/RightOuterJoinFn.java
deleted file mode 100644
index 2789d40..0000000
--- a/crunch/src/main/java/org/apache/crunch/lib/join/RightOuterJoinFn.java
+++ /dev/null
@@ -1,83 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib.join;
-
-import java.util.List;
-
-import org.apache.crunch.Emitter;
-import org.apache.crunch.Pair;
-import org.apache.crunch.types.PType;
-
-import com.google.common.collect.Lists;
-
-/**
- * Used to perform the last step of an right outer join.
- *
- * @param <K> Type of the keys.
- * @param <U> Type of the first {@link org.apache.crunch.PTable}'s values
- * @param <V> Type of the second {@link org.apache.crunch.PTable}'s values
- */
-public class RightOuterJoinFn<K, U, V> extends JoinFn<K, U, V> {
-
- private transient K lastKey;
- private transient List<U> leftValues;
-
- public RightOuterJoinFn(PType<K> keyType, PType<U> leftValueType) {
- super(keyType, leftValueType);
- }
-
- /** {@inheritDoc} */
- @Override
- public void initialize() {
- super.initialize();
- lastKey = null;
- this.leftValues = Lists.newArrayList();
- }
-
- /** {@inheritDoc} */
- @Override
- public void join(K key, int id, Iterable<Pair<U, V>> pairs, Emitter<Pair<K, Pair<U, V>>> emitter) {
- if (!key.equals(lastKey)) {
- lastKey = keyType.getDetachedValue(key);
- leftValues.clear();
- }
- if (id == 0) {
- for (Pair<U, V> pair : pairs) {
- if (pair.first() != null)
- leftValues.add(leftValueType.getDetachedValue(pair.first()));
- }
- } else {
- for (Pair<U, V> pair : pairs) {
- // Make sure that right side gets emitted.
- if (leftValues.isEmpty()) {
- leftValues.add(null);
- }
-
- for (U u : leftValues) {
- emitter.emit(Pair.of(lastKey, Pair.of(u, pair.second())));
- }
- }
- }
- }
-
- /** {@inheritDoc} */
- @Override
- public String getJoinType() {
- return "rightOuterJoin";
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/lib/join/package-info.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/lib/join/package-info.java b/crunch/src/main/java/org/apache/crunch/lib/join/package-info.java
deleted file mode 100644
index f1ad9f1..0000000
--- a/crunch/src/main/java/org/apache/crunch/lib/join/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Inner and outer joins on collections.
- */
-package org.apache.crunch.lib.join;
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/lib/package-info.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/lib/package-info.java b/crunch/src/main/java/org/apache/crunch/lib/package-info.java
deleted file mode 100644
index 2695787..0000000
--- a/crunch/src/main/java/org/apache/crunch/lib/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Joining, sorting, aggregating, and other commonly used functionality.
- */
-package org.apache.crunch.lib;
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/lib/sort/Comparators.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/lib/sort/Comparators.java b/crunch/src/main/java/org/apache/crunch/lib/sort/Comparators.java
deleted file mode 100644
index ae7f49a..0000000
--- a/crunch/src/main/java/org/apache/crunch/lib/sort/Comparators.java
+++ /dev/null
@@ -1,187 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib.sort;
-
-import java.util.Arrays;
-
-import org.apache.avro.Schema;
-import org.apache.avro.io.BinaryData;
-import org.apache.avro.mapred.AvroKey;
-import org.apache.avro.reflect.ReflectData;
-import org.apache.crunch.lib.Sort.ColumnOrder;
-import org.apache.crunch.lib.Sort.Order;
-import org.apache.crunch.types.writable.TupleWritable;
-import org.apache.hadoop.conf.Configurable;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.io.RawComparator;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.io.WritableComparable;
-import org.apache.hadoop.io.WritableComparator;
-import org.apache.hadoop.mapred.JobConf;
-
-import com.google.common.base.Function;
-import com.google.common.base.Joiner;
-import com.google.common.collect.Iterables;
-
-/**
- * A collection of {@code RawComparator<T>} implementations that are used by Crunch's {@code Sort} library.
- */
-public class Comparators {
-
-  /**
-   * Reverses the ordering of the {@link WritableComparator} registered for the job's
-   * map output key class.
-   */
-  public static class ReverseWritableComparator<T> extends Configured implements RawComparator<T> {
-
-    private RawComparator<T> comparator;
-
-    @SuppressWarnings("unchecked")
-    @Override
-    public void setConf(Configuration conf) {
-      super.setConf(conf);
-      if (conf != null) {
-        JobConf jobConf = new JobConf(conf);
-        comparator = WritableComparator.get(jobConf.getMapOutputKeyClass().asSubclass(WritableComparable.class));
-      }
-    }
-
-    @Override
-    public int compare(byte[] arg0, int arg1, int arg2, byte[] arg3, int arg4, int arg5) {
-      return -comparator.compare(arg0, arg1, arg2, arg3, arg4, arg5);
-    }
-
-    @Override
-    public int compare(T o1, T o2) {
-      return -comparator.compare(o1, o2);
-    }
-  }
-
-  /**
-   * Reverses the Avro ordering defined by the schema stored under the
-   * {@code crunch.schema} configuration property.
-   */
-  public static class ReverseAvroComparator<T> extends Configured implements RawComparator<AvroKey<T>> {
-
-    private Schema schema;
-
-    @Override
-    public void setConf(Configuration conf) {
-      super.setConf(conf);
-      if (conf != null) {
-        schema = (new Schema.Parser()).parse(conf.get("crunch.schema"));
-      }
-    }
-
-    @Override
-    public int compare(AvroKey<T> o1, AvroKey<T> o2) {
-      return -ReflectData.get().compare(o1.datum(), o2.datum(), schema);
-    }
-
-    @Override
-    public int compare(byte[] arg0, int arg1, int arg2, byte[] arg3, int arg4, int arg5) {
-      return -BinaryData.compare(arg0, arg1, arg2, arg3, arg4, arg5, schema);
-    }
-  }
-
-  /**
-   * Compares {@link TupleWritable} instances column by column, using the per-column
-   * ordering stored under the {@code crunch.ordering} configuration property.
-   */
-  public static class TupleWritableComparator extends WritableComparator implements Configurable {
-
-    private static final String CRUNCH_ORDERING_PROPERTY = "crunch.ordering";
-
-    private Configuration conf;
-    private ColumnOrder[] columnOrders;
-
-    public TupleWritableComparator() {
-      super(TupleWritable.class, true);
-    }
-
-    /**
-     * Stores the given orders in {@code conf} as a comma-separated list of order names.
-     *
-     * @param conf the configuration to update
-     * @param orders the order of each tuple column, in column order
-     */
-    public static void configureOrdering(Configuration conf, Order... orders) {
-      conf.set(CRUNCH_ORDERING_PROPERTY,
-          Joiner.on(",").join(Iterables.transform(Arrays.asList(orders), new Function<Order, String>() {
-            @Override
-            public String apply(Order o) {
-              return o.name();
-            }
-          })));
-    }
-
-    /**
-     * Stores the given column orders in {@code conf} as a comma-separated list of
-     * {@code column;ORDER} entries.
-     *
-     * @param conf the configuration to update
-     * @param columnOrders the column/order pairs to store
-     */
-    public static void configureOrdering(Configuration conf, ColumnOrder... columnOrders) {
-      conf.set(CRUNCH_ORDERING_PROPERTY,
-          Joiner.on(",").join(Iterables.transform(Arrays.asList(columnOrders), new Function<ColumnOrder, String>() {
-            @Override
-            public String apply(ColumnOrder o) {
-              return o.column() + ";" + o.order().name();
-            }
-          })));
-    }
-
-    /**
-     * Compares two tuples using the configured column orders. Columns whose order is
-     * neither ascending nor descending are skipped, as are columns missing from both
-     * tuples. If only one tuple has the column, that tuple counts as the greater one
-     * before the column's order is applied.
-     *
-     * @return a negative value, zero, or a positive value; zero when no configured
-     *         column distinguishes the two tuples
-     */
-    @Override
-    public int compare(WritableComparable a, WritableComparable b) {
-      TupleWritable ta = (TupleWritable) a;
-      TupleWritable tb = (TupleWritable) b;
-      for (int index = 0; index < columnOrders.length; index++) {
-        int order;
-        if (columnOrders[index].order() == Order.ASCENDING) {
-          order = 1;
-        } else if (columnOrders[index].order() == Order.DESCENDING) {
-          order = -1;
-        } else { // ignore
-          continue;
-        }
-
-        boolean inA = ta.has(index);
-        boolean inB = tb.has(index);
-        if (!inA && !inB) {
-          continue;
-        }
-        if (inA != inB) {
-          return inA ? order : -order;
-        }
-
-        Writable v1 = ta.get(index);
-        Writable v2 = tb.get(index);
-        if (v1 != v2 && (v1 != null && !v1.equals(v2))) {
-          int cmp;
-          if (v1 instanceof WritableComparable && v2 instanceof WritableComparable) {
-            cmp = ((WritableComparable) v1).compareTo((WritableComparable) v2);
-          } else {
-            // Compare the hash codes explicitly: subtracting them can overflow and
-            // report the wrong sign.
-            int h1 = v1.hashCode();
-            int h2 = v2.hashCode();
-            cmp = (h1 < h2) ? -1 : ((h1 > h2) ? 1 : 0);
-          }
-          if (cmp != 0) {
-            // Only the sign of cmp is used, so negating for descending order cannot
-            // overflow (order * Integer.MIN_VALUE would stay negative).
-            return (cmp < 0) ? -order : order;
-          }
-        }
-      }
-      return 0; // ordering using specified cols found no differences
-    }
-
-    @Override
-    public Configuration getConf() {
-      return conf;
-    }
-
-    /**
-     * Reads the column ordering from the {@code crunch.ordering} property. Both formats
-     * written by the {@code configureOrdering} overloads are accepted: {@code column;ORDER}
-     * entries, and bare {@code ORDER} entries, which are assigned the 1-based position of
-     * the entry as their column.
-     *
-     * @throws IllegalStateException if {@code conf} is non-null but the ordering
-     *         property has not been set
-     */
-    @Override
-    public void setConf(Configuration conf) {
-      this.conf = conf;
-      if (conf == null) {
-        return;
-      }
-      String ordering = conf.get(CRUNCH_ORDERING_PROPERTY);
-      if (ordering == null) {
-        throw new IllegalStateException("Configuration property '" + CRUNCH_ORDERING_PROPERTY
-            + "' is not set; call configureOrdering before using this comparator");
-      }
-      if (ordering.isEmpty()) {
-        // No orders were configured: nothing to compare on.
-        columnOrders = new ColumnOrder[0];
-        return;
-      }
-      String[] columnOrderNames = ordering.split(",");
-      columnOrders = new ColumnOrder[columnOrderNames.length];
-      for (int i = 0; i < columnOrders.length; i++) {
-        String[] split = columnOrderNames[i].split(";");
-        int column;
-        String orderName;
-        if (split.length == 1) {
-          // Entry written by configureOrdering(Configuration, Order...): no column part.
-          column = i + 1;
-          orderName = split[0];
-        } else {
-          column = Integer.parseInt(split[0]);
-          orderName = split[1];
-        }
-        columnOrders[i] = ColumnOrder.by(column, Order.valueOf(orderName));
-      }
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/lib/sort/SortFns.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/lib/sort/SortFns.java b/crunch/src/main/java/org/apache/crunch/lib/sort/SortFns.java
deleted file mode 100644
index be218f6..0000000
--- a/crunch/src/main/java/org/apache/crunch/lib/sort/SortFns.java
+++ /dev/null
@@ -1,210 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib.sort;
-
-import java.util.List;
-import java.util.UUID;
-
-import org.apache.avro.Schema;
-import org.apache.avro.generic.GenericData;
-import org.apache.avro.generic.GenericRecord;
-import org.apache.crunch.MapFn;
-import org.apache.crunch.Tuple;
-import org.apache.crunch.lib.Sort.ColumnOrder;
-import org.apache.crunch.lib.Sort.Order;
-import org.apache.crunch.types.PType;
-import org.apache.crunch.types.PTypeFamily;
-import org.apache.crunch.types.TupleFactory;
-import org.apache.crunch.types.avro.AvroType;
-import org.apache.crunch.types.avro.AvroTypeFamily;
-import org.apache.crunch.types.avro.Avros;
-
-import com.google.common.collect.Lists;
-
-/**
- * A set of {@code DoFn}s that are used by Crunch's {@code Sort} library.
- */
-public class SortFns {
-
- /**
- * Extracts a single indexed key from a {@code Tuple} instance.
- */
- public static class SingleKeyFn<V extends Tuple, K> extends MapFn<V, K> {
- private final int index;
-
- public SingleKeyFn(int index) {
- this.index = index;
- }
-
- @Override
- public K map(V input) {
- return (K) input.get(index);
- }
- }
-
- /**
- * Extracts a composite key from a {@code Tuple} instance.
- */
- public static class TupleKeyFn<V extends Tuple, K extends Tuple> extends MapFn<V, K> {
- private final int[] indices;
- private final TupleFactory tupleFactory;
-
- public TupleKeyFn(int[] indices, TupleFactory tupleFactory) {
- this.indices = indices;
- this.tupleFactory = tupleFactory;
- }
-
- @Override
- public K map(V input) {
- Object[] values = new Object[indices.length];
- for (int i = 0; i < indices.length; i++) {
- values[i] = input.get(indices[i]);
- }
- return (K) tupleFactory.makeTuple(values);
- }
- }
-
- /**
- * Pulls a composite set of keys from an Avro {@code GenericRecord} instance.
- */
- public static class AvroGenericFn<V extends Tuple> extends MapFn<V, GenericRecord> {
-
- private final int[] indices;
- private final String schemaJson;
- private transient Schema schema;
-
- public AvroGenericFn(int[] indices, Schema schema) {
- this.indices = indices;
- this.schemaJson = schema.toString();
- }
-
- @Override
- public void initialize() {
- this.schema = (new Schema.Parser()).parse(schemaJson);
- }
-
- @Override
- public GenericRecord map(V input) {
- GenericRecord rec = new GenericData.Record(schema);
- for (int i = 0; i < indices.length; i++) {
- rec.put(i, input.get(indices[i]));
- }
- return rec;
- }
- }
-
- /**
- * Constructs an Avro schema for the given {@code PType<S>} that respects the given column
- * orderings.
- */
- public static <S> Schema createOrderedTupleSchema(PType<S> ptype, ColumnOrder[] orders) {
- // Guarantee each tuple schema has a globally unique name
- String tupleName = "tuple" + UUID.randomUUID().toString().replace('-', 'x');
- Schema schema = Schema.createRecord(tupleName, "", "crunch", false);
- List<Schema.Field> fields = Lists.newArrayList();
- AvroType<S> parentAvroType = (AvroType<S>) ptype;
- Schema parentAvroSchema = parentAvroType.getSchema();
-
- for (int index = 0; index < orders.length; index++) {
- ColumnOrder columnOrder = orders[index];
- AvroType<?> atype = (AvroType<?>) ptype.getSubTypes().get(index);
- Schema fieldSchema = atype.getSchema();
- String fieldName = parentAvroSchema.getFields().get(index).name();
- // Note: avro sorting of strings is inverted relative to how sorting works for WritableComparable
- // Text instances: making this consistent
- Schema.Field.Order order = columnOrder.order() == Order.DESCENDING ? Schema.Field.Order.DESCENDING :
- Schema.Field.Order.ASCENDING;
- fields.add(new Schema.Field(fieldName, fieldSchema, "", null, order));
- }
- schema.setFields(fields);
- return schema;
- }
-
- /**
- * Utility class for encapsulating key extraction logic and serialization information about
- * key extraction.
- */
- public static class KeyExtraction<V extends Tuple> {
-
- private PType<V> ptype;
- private final ColumnOrder[] columnOrder;
- private final int[] cols;
-
- private MapFn<V, Object> byFn;
- private PType<Object> keyPType;
-
- public KeyExtraction(PType<V> ptype, ColumnOrder[] columnOrder) {
- this.ptype = ptype;
- this.columnOrder = columnOrder;
- this.cols = new int[columnOrder.length];
- for (int i = 0; i < columnOrder.length; i++) {
- cols[i] = columnOrder[i].column() - 1;
- }
- init();
- }
-
- private void init() {
- List<PType> pt = ptype.getSubTypes();
- PTypeFamily ptf = ptype.getFamily();
- if (cols.length == 1) {
- byFn = new SingleKeyFn(cols[0]);
- keyPType = pt.get(cols[0]);
- } else {
- TupleFactory tf = null;
- switch (cols.length) {
- case 2:
- tf = TupleFactory.PAIR;
- keyPType = ptf.pairs(pt.get(cols[0]), pt.get(cols[1]));
- break;
- case 3:
- tf = TupleFactory.TUPLE3;
- keyPType = ptf.triples(pt.get(cols[0]), pt.get(cols[1]), pt.get(cols[2]));
- break;
- case 4:
- tf = TupleFactory.TUPLE4;
- keyPType = ptf.quads(pt.get(cols[0]), pt.get(cols[1]), pt.get(cols[2]), pt.get(cols[3]));
- break;
- default:
- PType[] pts = new PType[cols.length];
- for (int i = 0; i < pts.length; i++) {
- pts[i] = pt.get(cols[i]);
- }
- tf = TupleFactory.TUPLEN;
- keyPType = (PType<Object>) (PType<?>) ptf.tuples(pts);
- }
-
- if (ptf == AvroTypeFamily.getInstance()) {
- Schema s = createOrderedTupleSchema(keyPType, columnOrder);
- keyPType = (PType<Object>) (PType<?>) Avros.generics(s);
- byFn = new AvroGenericFn(cols, s);
- } else {
- byFn = new TupleKeyFn(cols, tf);
- }
- }
-
- }
-
- public MapFn<V, Object> getByFn() {
- return byFn;
- }
-
- public PType<Object> getKeyType() {
- return keyPType;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/lib/sort/TotalOrderPartitioner.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/lib/sort/TotalOrderPartitioner.java b/crunch/src/main/java/org/apache/crunch/lib/sort/TotalOrderPartitioner.java
deleted file mode 100644
index 94fbdbe..0000000
--- a/crunch/src/main/java/org/apache/crunch/lib/sort/TotalOrderPartitioner.java
+++ /dev/null
@@ -1,145 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.lib.sort;
-
-import java.io.IOException;
-import java.lang.reflect.Array;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.Iterator;
-
-import org.apache.avro.Schema;
-import org.apache.avro.mapred.AvroKey;
-import org.apache.crunch.io.CompositePathIterable;
-import org.apache.crunch.io.avro.AvroFileReaderFactory;
-import org.apache.crunch.io.seq.SeqFileReaderFactory;
-import org.apache.crunch.types.writable.WritableDeepCopier;
-import org.apache.hadoop.conf.Configurable;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.RawComparator;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.Partitioner;
-
-/**
- * A partition-aware {@code Partitioner} instance that can work with either Avro or Writable-formatted
- * keys.
- */
-public class TotalOrderPartitioner<K, V> extends Partitioner<K, V> implements Configurable {
-
-  /** Partition file name used when {@link #PARTITIONER_PATH} is not set. */
-  public static final String DEFAULT_PATH = "_partition.lst";
-
-  /** Configuration key that holds the location of the partition file. */
-  public static final String PARTITIONER_PATH =
-      "crunch.totalorderpartitioner.path";
-
-  private Configuration conf;
-  private Node<K> partitions;
-
-  @Override
-  public Configuration getConf() {
-    return conf;
-  }
-
-  /**
-   * Stores the configuration and loads the split points from the partition file.
-   *
-   * @throws IllegalArgumentException if the partition file cannot be read or does
-   *           not contain exactly (number of reduce tasks - 1) split points
-   */
-  @Override
-  public void setConf(Configuration conf) {
-    try {
-      this.conf = conf;
-      final String partitionFile = getPartitionFile(conf);
-      final Path partitionPath = new Path(partitionFile);
-      final FileSystem fs;
-      if (DEFAULT_PATH.equals(partitionFile)) {
-        // assume in DistributedCache
-        fs = FileSystem.getLocal(conf);
-      } else {
-        fs = partitionPath.getFileSystem(conf);
-      }
-
-      Job job = new Job(conf);
-      Class<K> keyClass = (Class<K>) job.getMapOutputKeyClass();
-      RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
-      K[] splitPoints = readPartitions(fs, partitionPath, keyClass, conf, comparator);
-
-      // n reducers are separated by n - 1 split points.
-      int expectedSplits = job.getNumReduceTasks() - 1;
-      if (splitPoints.length != expectedSplits) {
-        throw new IOException("Wrong number of partitions in keyset");
-      }
-      partitions = new BinarySearchNode(splitPoints, comparator);
-    } catch (IOException e) {
-      throw new IllegalArgumentException("Can't read partitions file", e);
-    }
-  }
-
-  /**
-   * Looks the key up in the loaded split points. The {@code value} and
-   * {@code modulo} arguments are not used.
-   */
-  @Override
-  public int getPartition(K key, V value, int modulo) {
-    return partitions.findPartition(key);
-  }
-
-  /** Records the partition file location under {@link #PARTITIONER_PATH}. */
-  public static void setPartitionFile(Configuration conf, Path p) {
-    conf.set(PARTITIONER_PATH, p.toString());
-  }
-
-  /** Returns the configured partition file, or {@link #DEFAULT_PATH} if unset. */
-  public static String getPartitionFile(Configuration conf) {
-    return conf.get(PARTITIONER_PATH, DEFAULT_PATH);
-  }
-
-  /**
-   * Reads every split point from {@code p}, sorts them with {@code comparator}
-   * and returns them as an array of {@code keyClass}. When the
-   * {@code crunch.schema} property is set the file is read as Avro and each
-   * record is wrapped in an {@code AvroKey}; otherwise it is read as a sequence
-   * file and each key is deep-copied.
-   */
-  @SuppressWarnings("unchecked") // map output key class
-  private K[] readPartitions(FileSystem fs, Path p, Class<K> keyClass,
-      Configuration conf, final RawComparator<K> comparator) throws IOException {
-    ArrayList<K> splits = new ArrayList<K>();
-    String schemaJson = conf.get("crunch.schema");
-    if (schemaJson == null) {
-      WritableDeepCopier copier = new WritableDeepCopier(keyClass);
-      SeqFileReaderFactory<K> seqFactory = new SeqFileReaderFactory<K>(keyClass);
-      for (Iterator<K> it = CompositePathIterable.create(fs, p, seqFactory).iterator(); it.hasNext();) {
-        splits.add((K) copier.deepCopy((Writable) it.next()));
-      }
-    } else {
-      Schema avroSchema = (new Schema.Parser()).parse(schemaJson);
-      AvroFileReaderFactory<K> avroFactory = new AvroFileReaderFactory<K>(avroSchema);
-      for (Iterator<K> it = CompositePathIterable.create(fs, p, avroFactory).iterator(); it.hasNext();) {
-        splits.add((K) new AvroKey<K>(it.next()));
-      }
-    }
-    Collections.sort(splits, comparator);
-    K[] target = (K[]) Array.newInstance(keyClass, splits.size());
-    return splits.toArray(target);
-  }
-
-  /**
-   * Interface to the partitioner to locate a key in the partition keyset.
-   */
-  interface Node<T> {
-    /**
-     * Locate partition in keyset K, st [Ki..Ki+1) defines a partition,
-     * with implicit K0 = -inf, Kn = +inf, and |K| = #partitions - 1.
-     */
-    int findPartition(T key);
-  }
-
-  /** {@link Node} that binary-searches a sorted array of split points. */
-  class BinarySearchNode implements Node<K> {
-    private final K[] splitPoints;
-    private final RawComparator<K> comparator;
-
-    BinarySearchNode(K[] splitPoints, RawComparator<K> comparator) {
-      this.splitPoints = splitPoints;
-      this.comparator = comparator;
-    }
-
-    public int findPartition(K key) {
-      // binarySearch yields the index on a hit and -(insertionPoint) - 1 on a
-      // miss; after adding one, a hit at i maps to i + 1 and a miss maps to
-      // -(insertionPoint), whose negation is the partition number.
-      final int shifted = Arrays.binarySearch(splitPoints, key, comparator) + 1;
-      return (shifted >= 0) ? shifted : -shifted;
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/materialize/MaterializableIterable.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/materialize/MaterializableIterable.java b/crunch/src/main/java/org/apache/crunch/materialize/MaterializableIterable.java
deleted file mode 100644
index 2dcc64f..0000000
--- a/crunch/src/main/java/org/apache/crunch/materialize/MaterializableIterable.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.materialize;
-
-import java.io.IOException;
-import java.util.Iterator;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.crunch.CrunchRuntimeException;
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.SourceTarget;
-import org.apache.crunch.io.PathTarget;
-import org.apache.crunch.io.ReadableSource;
-import org.apache.crunch.io.impl.FileSourceImpl;
-import org.apache.hadoop.fs.Path;
-
-public class MaterializableIterable<E> implements Iterable<E> {
-
- private static final Log LOG = LogFactory.getLog(MaterializableIterable.class);
-
- private final Pipeline pipeline;
- private final ReadableSource<E> source;
- private Iterable<E> materialized;
-
- public MaterializableIterable(Pipeline pipeline, ReadableSource<E> source) {
- this.pipeline = pipeline;
- this.source = source;
- this.materialized = null;
- }
-
- public ReadableSource<E> getSource() {
- return source;
- }
-
- public boolean isSourceTarget() {
- return (source instanceof SourceTarget);
- }
-
- public Path getPath() {
- if (source instanceof FileSourceImpl) {
- return ((FileSourceImpl) source).getPath();
- } else if (source instanceof PathTarget) {
- return ((PathTarget) source).getPath();
- }
- return null;
- }
-
- @Override
- public Iterator<E> iterator() {
- if (materialized == null) {
- pipeline.run();
- materialize();
- }
- return materialized.iterator();
- }
-
- public void materialize() {
- try {
- materialized = source.read(pipeline.getConfiguration());
- } catch (IOException e) {
- LOG.error("Could not materialize: " + source, e);
- throw new CrunchRuntimeException(e);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/materialize/MaterializableMap.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/materialize/MaterializableMap.java b/crunch/src/main/java/org/apache/crunch/materialize/MaterializableMap.java
deleted file mode 100644
index 69082e2..0000000
--- a/crunch/src/main/java/org/apache/crunch/materialize/MaterializableMap.java
+++ /dev/null
@@ -1,50 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.materialize;
-
-import java.util.AbstractMap;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.crunch.Pair;
-
-/**
- * A {@link Map} view over an {@link Iterable} of {@link Pair}s. The pairs are
- * copied into a {@link HashMap} the first time {@link #entrySet()} is called,
- * and that entry set is reused afterwards.
- */
-public class MaterializableMap<K, V> extends AbstractMap<K, V> {
-
-  private Iterable<Pair<K, V>> iterable;
-
-  // Built lazily by entrySet(); null until then.
-  private Set<Map.Entry<K, V>> entrySet;
-
-  public MaterializableMap(Iterable<Pair<K, V>> iterable) {
-    this.iterable = iterable;
-  }
-
-  /**
-   * Copies every pair into a new map; a later pair replaces an earlier one that
-   * has the same first element.
-   */
-  private Set<Map.Entry<K, V>> toMapEntries(Iterable<Pair<K, V>> xs) {
-    HashMap<K, V> collected = new HashMap<K, V>();
-    for (Pair<K, V> pair : xs) {
-      collected.put(pair.first(), pair.second());
-    }
-    return collected.entrySet();
-  }
-
-  @Override
-  public Set<Map.Entry<K, V>> entrySet() {
-    if (entrySet != null) {
-      return entrySet;
-    }
-    entrySet = toMapEntries(iterable);
-    return entrySet;
-  }
-
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/materialize/pobject/CollectionPObject.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/materialize/pobject/CollectionPObject.java b/crunch/src/main/java/org/apache/crunch/materialize/pobject/CollectionPObject.java
deleted file mode 100644
index 60e64b1..0000000
--- a/crunch/src/main/java/org/apache/crunch/materialize/pobject/CollectionPObject.java
+++ /dev/null
@@ -1,55 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.materialize.pobject;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Iterator;
-
-import org.apache.crunch.PCollection;
-
-/**
- * A concrete implementation of {@link org.apache.crunch.materialize.pobject.PObjectImpl} whose
- * value is a Java {@link java.util.Collection} containing the elements of the underlying {@link
- * PCollection} for this {@link org.apache.crunch.PObject}.
- *
- * @param <S> The value type for elements contained in the {@code Collection} value encapsulated
- * by this {@code PObject}.
- */
-public class CollectionPObject<S> extends PObjectImpl<S, Collection<S>> {
-
-  /**
-   * Creates a {@code PObject} whose value is the contents of {@code collect}.
-   *
-   * @param collect The backing {@code PCollection} for this {@code PObject}.
-   */
-  public CollectionPObject(PCollection<S> collect) {
-    super(collect);
-  }
-
-  /**
-   * Copies the elements of {@code input}, in iteration order, into a new
-   * {@link ArrayList}.
-   */
-  @Override
-  public Collection<S> process(Iterable<S> input) {
-    Collection<S> copied = new ArrayList<S>();
-    for (S element : input) {
-      copied.add(element);
-    }
-    return copied;
-  }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/materialize/pobject/FirstElementPObject.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/materialize/pobject/FirstElementPObject.java b/crunch/src/main/java/org/apache/crunch/materialize/pobject/FirstElementPObject.java
deleted file mode 100644
index aa5fd9e..0000000
--- a/crunch/src/main/java/org/apache/crunch/materialize/pobject/FirstElementPObject.java
+++ /dev/null
@@ -1,50 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.materialize.pobject;
-
-import java.util.Iterator;
-
-import org.apache.crunch.PCollection;
-
-/**
- * A concrete implementation of {@link PObjectImpl} that uses the first element in the backing
- * {@link PCollection} as the {@link org.apache.crunch.PObject} value.
- *
- * @param <T> The value type of this {@code PObject}.
- */
-public class FirstElementPObject<T> extends PObjectImpl<T, T> {
-
-  /**
-   * Creates a {@code PObject} whose value is the first element of {@code collect}.
-   *
-   * @param collect The backing {@code PCollection} for this {@code PObject}.
-   */
-  public FirstElementPObject(PCollection<T> collect) {
-    super(collect);
-  }
-
-  /**
-   * Returns the first element produced by {@code input}, or {@code null} when
-   * {@code input} yields no elements.
-   */
-  @Override
-  public T process(Iterable<T> input) {
-    Iterator<T> elements = input.iterator();
-    return elements.hasNext() ? elements.next() : null;
-  }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/materialize/pobject/MapPObject.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/materialize/pobject/MapPObject.java b/crunch/src/main/java/org/apache/crunch/materialize/pobject/MapPObject.java
deleted file mode 100644
index 243997f..0000000
--- a/crunch/src/main/java/org/apache/crunch/materialize/pobject/MapPObject.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.materialize.pobject;
-
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Map;
-
-import org.apache.crunch.PCollection;
-import org.apache.crunch.Pair;
-
-/**
- * A concrete implementation of {@link PObjectImpl} whose
- * value is a Java {@link Map}. The underlying {@link PCollection} for this
- * {@link org.apache.crunch.PObject} must contain {@link Pair}s of values. The
- * first element of the pair will be used as the map key, while the second element will be used
- * as the map value. Note that the contents of the underlying {@code PCollection} may not be
- * reflected in the returned {@code Map}, since a single key may be mapped to several values in
- * the underlying {@code PCollection}, and only one of those values will appear in the {@code
- * Map} encapsulated by this {@code PObject}.
- *
- * @param <K> The type of keys for the Map.
- * @param <V> The type of values for the Map.
- */
-public class MapPObject<K, V> extends PObjectImpl<Pair<K, V>, Map<K, V>> {
-
-  /**
-   * Creates a {@code PObject} whose value is a map built from {@code collect}.
-   *
-   * @param collect The backing {@code PCollection} for this {@code PObject}.
-   */
-  public MapPObject(PCollection<Pair<K, V>> collect) {
-    super(collect);
-  }
-
-  /**
-   * Puts every pair of {@code input} into a new {@link HashMap}, keyed by the
-   * pair's first element; a later pair overwrites an earlier one with the same key.
-   */
-  @Override
-  public Map<K, V> process(Iterable<Pair<K, V>> input) {
-    Map<K, V> result = new HashMap<K, V>();
-    for (Pair<K, V> entry : input) {
-      result.put(entry.first(), entry.second());
-    }
-    return result;
-  }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/materialize/pobject/PObjectImpl.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/materialize/pobject/PObjectImpl.java b/crunch/src/main/java/org/apache/crunch/materialize/pobject/PObjectImpl.java
deleted file mode 100644
index 59c2ba2..0000000
--- a/crunch/src/main/java/org/apache/crunch/materialize/pobject/PObjectImpl.java
+++ /dev/null
@@ -1,85 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.materialize.pobject;
-
-import org.apache.crunch.PCollection;
-import org.apache.crunch.PObject;
-import org.apache.crunch.Pipeline;
-import org.apache.crunch.Target;
-
-/**
- * An abstract implementation of {@link PObject} that is backed by a {@link PCollection}.
- * Clients creating a concrete implementation should override the method
- * {@link PObjectImpl#process(Iterable)}, which transforms the backing PCollection into the
- * singleton value encapsulated by the PObject. Once this {@code PObject}'s value has been
- * calculated, the value is cached to prevent subsequent materializations of the backing
- * {@code PCollection}.
- *
- * @param <S> The type contained in the underlying PCollection.
- * @param <T> The type encapsulated by this PObject.
- */
-public abstract class PObjectImpl<S, T> implements PObject<T> {
-
-  // Backing PCollection from which the value is computed.
-  private PCollection<S> collection;
-
-  // Result of process(), kept so the backing PCollection is materialized only once.
-  private T cachedValue;
-
-  // True once cachedValue has been computed; needed because the value itself may be null.
-  private boolean isCached;
-
-  /**
-   * Creates a {@code PObject} over the given collection with no value cached yet.
-   *
-   * @param collect The backing {@code PCollection} for this {@code PObject}.
-   */
-  public PObjectImpl(PCollection<S> collect) {
-    // cachedValue and isCached keep their defaults (null / false).
-    this.collection = collect;
-  }
-
-  /** Returns the string form of the backing {@code PCollection}. */
-  @Override
-  public String toString() {
-    return collection.toString();
-  }
-
-  /**
-   * Returns the cached value, first computing it by materializing the backing
-   * {@code PCollection} and passing it to {@link #process(Iterable)} if it has
-   * not been computed yet.
-   */
-  @Override
-  public final T getValue() {
-    if (isCached) {
-      return cachedValue;
-    }
-    cachedValue = process(collection.materialize());
-    isCached = true;
-    return cachedValue;
-  }
-
-  /**
-   * Turns the materialized contents of the backing {@link PCollection} into the
-   * value held by this {@code PObject}.
-   *
-   * @param input An Iterable whose elements correspond to those of the backing {@code
-   *        PCollection}.
-   * @return The value of this {@code PObject}.
-   */
-  protected abstract T process(Iterable<S> input);
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/package-info.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/package-info.java b/crunch/src/main/java/org/apache/crunch/package-info.java
deleted file mode 100644
index 38f11bc..0000000
--- a/crunch/src/main/java/org/apache/crunch/package-info.java
+++ /dev/null
@@ -1,25 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Client-facing API and core abstractions.
- *
- * @see <a href="http://crunch.apache.org/intro.html">Introduction to
- * Apache Crunch</a>
- */
-package org.apache.crunch;
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/CollectionDeepCopier.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/CollectionDeepCopier.java b/crunch/src/main/java/org/apache/crunch/types/CollectionDeepCopier.java
deleted file mode 100644
index 151ab82..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/CollectionDeepCopier.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types;
-
-import java.util.Collection;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-
-import com.google.common.collect.Lists;
-
-/**
- * Performs deep copies (based on underlying PType deep copying) of Collections.
- *
- * @param <T> The type of the elements in the Collection being copied
- */
-public class CollectionDeepCopier<T> implements DeepCopier<Collection<T>> {
-
-  // PType used to detach (copy) each element.
-  private PType<T> elementType;
-
-  public CollectionDeepCopier(PType<T> elementType) {
-    this.elementType = elementType;
-  }
-
-  /** Passes the configuration on to the element type. */
-  @Override
-  public void initialize(Configuration conf) {
-    elementType.initialize(conf);
-  }
-
-  /**
-   * Returns a new list holding the detached value of every element of
-   * {@code source}, in iteration order, or {@code null} when {@code source} is
-   * {@code null}.
-   */
-  @Override
-  public Collection<T> deepCopy(Collection<T> source) {
-    if (source != null) {
-      List<T> detached = Lists.newArrayListWithCapacity(source.size());
-      for (T element : source) {
-        detached.add(elementType.getDetachedValue(element));
-      }
-      return detached;
-    }
-    return null;
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/Converter.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/Converter.java b/crunch/src/main/java/org/apache/crunch/types/Converter.java
deleted file mode 100644
index a0dbb16..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/Converter.java
+++ /dev/null
@@ -1,41 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types;
-
-import java.io.Serializable;
-
-import org.apache.crunch.DoFn;
-
-/**
- * Converts the input key/value from a MapReduce task into the input to a
- * {@link DoFn}, or takes the output of a {@code DoFn} and writes it to the
- * output key/values.
- */
-public interface Converter<K, V, S, T> extends Serializable {
- /**
- * Converts a single key/value pair into a value of type {@code S}.
- */
- S convertInput(K key, V value);
-
- /**
- * Converts a key together with an iterable of values into a value of type
- * {@code T}.
- */
- T convertIterableInput(K key, Iterable<V> value);
-
- /**
- * Returns the output key to write for the given value.
- */
- K outputKey(S value);
-
- /**
- * Returns the output value to write for the given value.
- */
- V outputValue(S value);
-
- /**
- * Returns the class of the key type {@code K}.
- */
- Class<K> getKeyClass();
-
- /**
- * Returns the class of the value type {@code V}.
- */
- Class<V> getValueClass();
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/DeepCopier.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/DeepCopier.java b/crunch/src/main/java/org/apache/crunch/types/DeepCopier.java
deleted file mode 100644
index f146e86..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/DeepCopier.java
+++ /dev/null
@@ -1,60 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types;
-
-import java.io.Serializable;
-
-import org.apache.hadoop.conf.Configuration;
-
-/**
- * Performs deep copies of values.
- *
- * @param <T> The type of value that will be copied
- */
-public interface DeepCopier<T> extends Serializable {
-
-  /**
-   * Prepares this copier for use with a job-specific configuration.
-   *
-   * @param conf Job-specific configuration
-   */
-  void initialize(Configuration conf);
-
-  /**
-   * Produces a deep copy of the given value.
-   *
-   * @param source The value to be copied
-   * @return The deep copy of the value
-   */
-  T deepCopy(T source);
-
-  /**
-   * A {@code DeepCopier} that performs no copying: {@link #deepCopy(Object)}
-   * returns its argument unchanged.
-   */
-  static class NoOpDeepCopier<V> implements DeepCopier<V> {
-
-    @Override
-    public void initialize(Configuration conf) {
-      // No initialization needed
-    }
-
-    @Override
-    public V deepCopy(V source) {
-      return source;
-    }
-
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/MapDeepCopier.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/MapDeepCopier.java b/crunch/src/main/java/org/apache/crunch/types/MapDeepCopier.java
deleted file mode 100644
index de8903b..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/MapDeepCopier.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types;
-
-import java.util.Map;
-import java.util.Map.Entry;
-
-import org.apache.hadoop.conf.Configuration;
-
-import com.google.common.collect.Maps;
-
-/**
- * Performs deep copies of string-keyed maps, using a {@link PType} to detach
- * each value.
- *
- * @param <T> The type of the map values
- */
-public class MapDeepCopier<T> implements DeepCopier<Map<String, T>> {
-
-  private final PType<T> ptype;
-
-  public MapDeepCopier(PType<T> ptype) {
-    this.ptype = ptype;
-  }
-
-  /** Passes the configuration on to the value type. */
-  @Override
-  public void initialize(Configuration conf) {
-    ptype.initialize(conf);
-  }
-
-  /**
-   * Returns a new map with the same keys as {@code source} and the detached
-   * form of each value, or {@code null} when {@code source} is {@code null}.
-   */
-  @Override
-  public Map<String, T> deepCopy(Map<String, T> source) {
-    if (source == null) {
-      return null;
-    }
-
-    Map<String, T> copy = Maps.newHashMap();
-    for (Entry<String, T> original : source.entrySet()) {
-      String key = original.getKey();
-      T detachedValue = ptype.getDetachedValue(original.getValue());
-      copy.put(key, detachedValue);
-    }
-    return copy;
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/PGroupedTableType.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/PGroupedTableType.java b/crunch/src/main/java/org/apache/crunch/types/PGroupedTableType.java
deleted file mode 100644
index d276cd6..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/PGroupedTableType.java
+++ /dev/null
@@ -1,141 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types;
-
-import java.util.Iterator;
-import java.util.List;
-
-import org.apache.crunch.GroupingOptions;
-import org.apache.crunch.MapFn;
-import org.apache.crunch.PGroupedTable;
-import org.apache.crunch.Pair;
-import org.apache.crunch.io.ReadableSourceTarget;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.TaskInputOutputContext;
-
-import com.google.common.collect.Iterables;
-
-/**
- * The {@code PType} instance for {@link PGroupedTable} instances. Its settings
- * are derived from the {@code PTableType} that was grouped to create the
- * {@code PGroupedTable} instance.
- *
- */
-public abstract class PGroupedTableType<K, V> implements PType<Pair<K, Iterable<V>>> {
-
- protected static class PTypeIterable<V> implements Iterable<V> {
- private final Iterable<Object> iterable;
- private final MapFn<Object, V> mapFn;
-
- public PTypeIterable(MapFn<Object, V> mapFn, Iterable<Object> iterable) {
- this.mapFn = mapFn;
- this.iterable = iterable;
- }
-
- public Iterator<V> iterator() {
- return new Iterator<V>() {
- Iterator<Object> iter = iterable.iterator();
-
- public boolean hasNext() {
- return iter.hasNext();
- }
-
- public V next() {
- return mapFn.map(iter.next());
- }
-
- public void remove() {
- iter.remove();
- }
- };
- }
-
- @Override
- public String toString() {
- return Iterables.toString(this);
- }
- }
-
- public static class PairIterableMapFn<K, V> extends MapFn<Pair<Object, Iterable<Object>>, Pair<K, Iterable<V>>> {
- private final MapFn<Object, K> keys;
- private final MapFn<Object, V> values;
-
- public PairIterableMapFn(MapFn<Object, K> keys, MapFn<Object, V> values) {
- this.keys = keys;
- this.values = values;
- }
-
- @Override
- public void configure(Configuration conf) {
- keys.configure(conf);
- values.configure(conf);
- }
-
- public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
- keys.setContext(context);
- values.setContext(context);
- }
-
- @Override
- public void initialize() {
- keys.initialize();
- values.initialize();
- }
-
- @Override
- public Pair<K, Iterable<V>> map(Pair<Object, Iterable<Object>> input) {
- return Pair.<K, Iterable<V>> of(keys.map(input.first()), new PTypeIterable(values, input.second()));
- }
- }
-
- protected final PTableType<K, V> tableType;
-
- public PGroupedTableType(PTableType<K, V> tableType) {
- this.tableType = tableType;
- }
-
- public PTableType<K, V> getTableType() {
- return tableType;
- }
-
- @Override
- public PTypeFamily getFamily() {
- return tableType.getFamily();
- }
-
- @Override
- public List<PType> getSubTypes() {
- return tableType.getSubTypes();
- }
-
- @Override
- public Converter getConverter() {
- return tableType.getConverter();
- }
-
- public abstract Converter getGroupingConverter();
-
- public abstract void configureShuffle(Job job, GroupingOptions options);
-
- @Override
- public ReadableSourceTarget<Pair<K, Iterable<V>>> getDefaultFileSource(Path path) {
- throw new UnsupportedOperationException("Grouped tables cannot be written out directly");
- }
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/PTableType.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/PTableType.java b/crunch/src/main/java/org/apache/crunch/types/PTableType.java
deleted file mode 100644
index 3d06f8b..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/PTableType.java
+++ /dev/null
@@ -1,44 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types;
-
-import org.apache.crunch.PTable;
-import org.apache.crunch.Pair;
-
-/**
- * An extension of {@code PType} specifically for {@link PTable} objects. It
- * allows separate access to the {@code PType}s of the key and value for the
- * {@code PTable}.
- *
- */
-public interface PTableType<K, V> extends PType<Pair<K, V>> {
- /**
- * Returns the key type for the table.
- */
- PType<K> getKeyType();
-
- /**
- * Returns the value type for the table.
- */
- PType<V> getValueType();
-
- /**
- * Returns the grouped table version of this type.
- */
- PGroupedTableType<K, V> getGroupedTableType();
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/PType.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/PType.java b/crunch/src/main/java/org/apache/crunch/types/PType.java
deleted file mode 100644
index ebddf84..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/PType.java
+++ /dev/null
@@ -1,86 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types;
-
-import java.io.Serializable;
-import java.util.List;
-
-import org.apache.crunch.DoFn;
-import org.apache.crunch.MapFn;
-import org.apache.crunch.PCollection;
-import org.apache.crunch.io.ReadableSourceTarget;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-
-/**
- * A {@code PType} defines a mapping between a data type that is used in a Crunch pipeline and a
- * serialization and storage format that is used to read/write data from/to HDFS. Every
- * {@link PCollection} has an associated {@code PType} that tells Crunch how to read/write data from
- * that {@code PCollection}.
- *
- */
-public interface PType<T> extends Serializable {
- /**
- * Returns the Java type represented by this {@code PType}.
- */
- Class<T> getTypeClass();
-
- /**
- * Returns the {@code PTypeFamily} that this {@code PType} belongs to.
- */
- PTypeFamily getFamily();
-
- MapFn<Object, T> getInputMapFn();
-
- MapFn<T, Object> getOutputMapFn();
-
- Converter getConverter();
-
- /**
- * Initialize this PType for use within a DoFn. This generally only needs to be called when using
- * a PType for {@link #getDetachedValue(Object)}.
- *
- * @param conf Configuration object
- * @see PType#getDetachedValue(Object)
- */
- void initialize(Configuration conf);
-
- /**
- * Returns a copy of a value (or the value itself) that can safely be retained.
- * <p>
- * This is useful when iterable values being processed in a DoFn (via a reducer) need to be held
- * on to for more than the scope of a single iteration, as a reducer (and therefore also a DoFn
- * that has an Iterable as input) re-use deserialized values. More information on object reuse is
- * available in the {@link DoFn} class documentation.
- *
- * @param value The value to be deep-copied
- * @return A deep copy of the input value
- */
- T getDetachedValue(T value);
-
- /**
- * Returns a {@code SourceTarget} that is able to read/write data using the serialization format
- * specified by this {@code PType}.
- */
- ReadableSourceTarget<T> getDefaultFileSource(Path path);
-
- /**
- * Returns the sub-types that make up this PType if it is a composite instance, such as a tuple.
- */
- List<PType> getSubTypes();
-}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch/src/main/java/org/apache/crunch/types/PTypeFamily.java
----------------------------------------------------------------------
diff --git a/crunch/src/main/java/org/apache/crunch/types/PTypeFamily.java b/crunch/src/main/java/org/apache/crunch/types/PTypeFamily.java
deleted file mode 100644
index 9458f14..0000000
--- a/crunch/src/main/java/org/apache/crunch/types/PTypeFamily.java
+++ /dev/null
@@ -1,77 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.crunch.types;
-
-import java.nio.ByteBuffer;
-import java.util.Collection;
-import java.util.Map;
-
-import org.apache.crunch.MapFn;
-import org.apache.crunch.Pair;
-import org.apache.crunch.Tuple;
-import org.apache.crunch.Tuple3;
-import org.apache.crunch.Tuple4;
-import org.apache.crunch.TupleN;
-
-/**
- * An abstract factory for creating {@code PType} instances that have the same
- * serialization/storage backing format.
- *
- */
-public interface PTypeFamily {
- PType<Void> nulls();
-
- PType<String> strings();
-
- PType<Long> longs();
-
- PType<Integer> ints();
-
- PType<Float> floats();
-
- PType<Double> doubles();
-
- PType<Boolean> booleans();
-
- PType<ByteBuffer> bytes();
-
- <T> PType<T> records(Class<T> clazz);
-
- <T> PType<Collection<T>> collections(PType<T> ptype);
-
- <T> PType<Map<String, T>> maps(PType<T> ptype);
-
- <V1, V2> PType<Pair<V1, V2>> pairs(PType<V1> p1, PType<V2> p2);
-
- <V1, V2, V3> PType<Tuple3<V1, V2, V3>> triples(PType<V1> p1, PType<V2> p2, PType<V3> p3);
-
- <V1, V2, V3, V4> PType<Tuple4<V1, V2, V3, V4>> quads(PType<V1> p1, PType<V2> p2, PType<V3> p3, PType<V4> p4);
-
- PType<TupleN> tuples(PType<?>... ptypes);
-
- <T extends Tuple> PType<T> tuples(Class<T> clazz, PType<?>... ptypes);
-
- <S, T> PType<T> derived(Class<T> clazz, MapFn<S, T> inputFn, MapFn<T, S> outputFn, PType<S> base);
-
- <K, V> PTableType<K, V> tableOf(PType<K> key, PType<V> value);
-
- /**
- * Returns the equivalent of the given ptype for this family, if it exists.
- */
- <T> PType<T> as(PType<T> ptype);
-}
[25/43] CRUNCH-196: crunch -> crunch-core rename to fix build issues
Posted by jw...@apache.org.
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/writable/Writables.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/writable/Writables.java b/crunch-core/src/main/java/org/apache/crunch/types/writable/Writables.java
new file mode 100644
index 0000000..78cf3ae
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/writable/Writables.java
@@ -0,0 +1,588 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.types.writable;
+
+import java.nio.ByteBuffer;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.crunch.MapFn;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Tuple;
+import org.apache.crunch.Tuple3;
+import org.apache.crunch.Tuple4;
+import org.apache.crunch.TupleN;
+import org.apache.crunch.fn.CompositeMapFn;
+import org.apache.crunch.fn.IdentityFn;
+import org.apache.crunch.types.PType;
+import org.apache.crunch.types.PTypes;
+import org.apache.crunch.types.TupleFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.BooleanWritable;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.FloatWritable;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.MapWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.TaskInputOutputContext;
+
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
+/**
+ * Defines static methods that are analogous to the methods defined in
+ * {@link WritableTypeFamily} for convenient static importing.
+ *
+ */
+public class Writables {
+ private static final MapFn<NullWritable, Void> NULL_WRITABLE_TO_VOID = new MapFn<NullWritable, Void>() {
+ @Override
+ public Void map(NullWritable input) {
+ return null;
+ }
+ };
+
+ private static final MapFn<Void, NullWritable> VOID_TO_NULL_WRITABLE = new MapFn<Void, NullWritable>() {
+ @Override
+ public NullWritable map(Void input) {
+ return NullWritable.get();
+ }
+ };
+
+ private static final MapFn<Text, String> TEXT_TO_STRING = new MapFn<Text, String>() {
+ @Override
+ public String map(Text input) {
+ return input.toString();
+ }
+ };
+
+ private static final MapFn<String, Text> STRING_TO_TEXT = new MapFn<String, Text>() {
+ @Override
+ public Text map(String input) {
+ return new Text(input);
+ }
+ };
+
+ private static final MapFn<IntWritable, Integer> IW_TO_INT = new MapFn<IntWritable, Integer>() {
+ @Override
+ public Integer map(IntWritable input) {
+ return input.get();
+ }
+ };
+
+ private static final MapFn<Integer, IntWritable> INT_TO_IW = new MapFn<Integer, IntWritable>() {
+ @Override
+ public IntWritable map(Integer input) {
+ return new IntWritable(input);
+ }
+ };
+
+ private static final MapFn<LongWritable, Long> LW_TO_LONG = new MapFn<LongWritable, Long>() {
+ @Override
+ public Long map(LongWritable input) {
+ return input.get();
+ }
+ };
+
+ private static final MapFn<Long, LongWritable> LONG_TO_LW = new MapFn<Long, LongWritable>() {
+ @Override
+ public LongWritable map(Long input) {
+ return new LongWritable(input);
+ }
+ };
+
+ private static final MapFn<FloatWritable, Float> FW_TO_FLOAT = new MapFn<FloatWritable, Float>() {
+ @Override
+ public Float map(FloatWritable input) {
+ return input.get();
+ }
+ };
+
+ private static final MapFn<Float, FloatWritable> FLOAT_TO_FW = new MapFn<Float, FloatWritable>() {
+ @Override
+ public FloatWritable map(Float input) {
+ return new FloatWritable(input);
+ }
+ };
+
+ private static final MapFn<DoubleWritable, Double> DW_TO_DOUBLE = new MapFn<DoubleWritable, Double>() {
+ @Override
+ public Double map(DoubleWritable input) {
+ return input.get();
+ }
+ };
+
+ private static final MapFn<Double, DoubleWritable> DOUBLE_TO_DW = new MapFn<Double, DoubleWritable>() {
+ @Override
+ public DoubleWritable map(Double input) {
+ return new DoubleWritable(input);
+ }
+ };
+
+ private static final MapFn<BooleanWritable, Boolean> BW_TO_BOOLEAN = new MapFn<BooleanWritable, Boolean>() {
+ @Override
+ public Boolean map(BooleanWritable input) {
+ return input.get();
+ }
+ };
+
+ private static final BooleanWritable TRUE = new BooleanWritable(true);
+ private static final BooleanWritable FALSE = new BooleanWritable(false);
+ private static final MapFn<Boolean, BooleanWritable> BOOLEAN_TO_BW = new MapFn<Boolean, BooleanWritable>() {
+ @Override
+ public BooleanWritable map(Boolean input) {
+ return Boolean.TRUE.equals(input) ? TRUE : FALSE; // equals(), not ==: a boxed true need not be the Boolean.TRUE instance
+ }
+ };
+
+ private static final MapFn<BytesWritable, ByteBuffer> BW_TO_BB = new MapFn<BytesWritable, ByteBuffer>() {
+ @Override
+ public ByteBuffer map(BytesWritable input) {
+ return ByteBuffer.wrap(input.getBytes(), 0, input.getLength());
+ }
+ };
+
+ private static final MapFn<ByteBuffer, BytesWritable> BB_TO_BW = new MapFn<ByteBuffer, BytesWritable>() {
+ @Override
+ public BytesWritable map(ByteBuffer input) {
+ BytesWritable bw = new BytesWritable();
+ bw.set(input.array(), input.arrayOffset() + input.position(), input.remaining()); // readable window [position, limit) only
+ return bw;
+ }
+ };
+
+ private static <S, W extends Writable> WritableType<S, W> create(Class<S> typeClass, Class<W> writableClass,
+ MapFn<W, S> inputDoFn, MapFn<S, W> outputDoFn) {
+ return new WritableType<S, W>(typeClass, writableClass, inputDoFn, outputDoFn);
+ }
+
+ private static final WritableType<Void, NullWritable> nulls = create(Void.class, NullWritable.class,
+ NULL_WRITABLE_TO_VOID, VOID_TO_NULL_WRITABLE);
+ private static final WritableType<String, Text> strings = create(String.class, Text.class, TEXT_TO_STRING,
+ STRING_TO_TEXT);
+ private static final WritableType<Long, LongWritable> longs = create(Long.class, LongWritable.class, LW_TO_LONG,
+ LONG_TO_LW);
+ private static final WritableType<Integer, IntWritable> ints = create(Integer.class, IntWritable.class, IW_TO_INT,
+ INT_TO_IW);
+ private static final WritableType<Float, FloatWritable> floats = create(Float.class, FloatWritable.class,
+ FW_TO_FLOAT, FLOAT_TO_FW);
+ private static final WritableType<Double, DoubleWritable> doubles = create(Double.class, DoubleWritable.class,
+ DW_TO_DOUBLE, DOUBLE_TO_DW);
+ private static final WritableType<Boolean, BooleanWritable> booleans = create(Boolean.class, BooleanWritable.class,
+ BW_TO_BOOLEAN, BOOLEAN_TO_BW);
+ private static final WritableType<ByteBuffer, BytesWritable> bytes = create(ByteBuffer.class, BytesWritable.class,
+ BW_TO_BB, BB_TO_BW);
+
+ private static final Map<Class<?>, PType<?>> PRIMITIVES = ImmutableMap.<Class<?>, PType<?>> builder()
+ .put(String.class, strings).put(Long.class, longs).put(Integer.class, ints).put(Float.class, floats)
+ .put(Double.class, doubles).put(Boolean.class, booleans).put(ByteBuffer.class, bytes).build();
+
+ private static final Map<Class<?>, WritableType<?, ?>> EXTENSIONS = Maps.newHashMap();
+
+ public static <T> PType<T> getPrimitiveType(Class<T> clazz) {
+ return (PType<T>) PRIMITIVES.get(clazz);
+ }
+
+ public static <T> void register(Class<T> clazz, WritableType<T, ? extends Writable> ptype) {
+ EXTENSIONS.put(clazz, ptype);
+ }
+
+ public static final WritableType<Void, NullWritable> nulls() {
+ return nulls;
+ }
+
+ public static final WritableType<String, Text> strings() {
+ return strings;
+ }
+
+ public static final WritableType<Long, LongWritable> longs() {
+ return longs;
+ }
+
+ public static final WritableType<Integer, IntWritable> ints() {
+ return ints;
+ }
+
+ public static final WritableType<Float, FloatWritable> floats() {
+ return floats;
+ }
+
+ public static final WritableType<Double, DoubleWritable> doubles() {
+ return doubles;
+ }
+
+ public static final WritableType<Boolean, BooleanWritable> booleans() {
+ return booleans;
+ }
+
+ public static final WritableType<ByteBuffer, BytesWritable> bytes() {
+ return bytes;
+ }
+
+ public static final <T, W extends Writable> WritableType<T, W> records(Class<T> clazz) {
+ // Types registered through register() take precedence over the generic Writable handling below.
+ if (EXTENSIONS.containsKey(clazz)) {
+ return (WritableType<T, W>) EXTENSIONS.get(clazz);
+ }
+ if (Writable.class.isAssignableFrom(clazz)) {
+ return (WritableType<T, W>) writables(clazz.asSubclass(Writable.class));
+ }
+ throw new IllegalArgumentException(
+ "Cannot create Writable records from non-Writable class " + clazz.getCanonicalName());
+ }
+
+ public static <W extends Writable> WritableType<W, W> writables(Class<W> clazz) {
+ MapFn wIdentity = IdentityFn.getInstance();
+ return new WritableType<W, W>(clazz, clazz, wIdentity, wIdentity);
+ }
+
+ public static <K, V> WritableTableType<K, V> tableOf(PType<K> key, PType<V> value) {
+ if (key instanceof WritableTableType) {
+ WritableTableType wtt = (WritableTableType) key;
+ key = pairs(wtt.getKeyType(), wtt.getValueType());
+ } else if (!(key instanceof WritableType)) {
+ throw new IllegalArgumentException("Key type must be of class WritableType");
+ }
+ if (value instanceof WritableTableType) {
+ WritableTableType wtt = (WritableTableType) value;
+ value = pairs(wtt.getKeyType(), wtt.getValueType());
+ } else if (!(value instanceof WritableType)) {
+ throw new IllegalArgumentException("Value type must be of class WritableType");
+ }
+ return new WritableTableType((WritableType) key, (WritableType) value);
+ }
+
+ /**
+ * For mapping from {@link TupleWritable} instances to {@link Tuple}s.
+ *
+ */
+ private static class TWTupleMapFn extends MapFn<TupleWritable, Tuple> {
+ private final TupleFactory<?> tupleFactory;
+ private final List<MapFn> fns;
+
+ private transient Object[] values;
+
+ public TWTupleMapFn(TupleFactory<?> tupleFactory, PType<?>... ptypes) {
+ this.tupleFactory = tupleFactory;
+ this.fns = Lists.newArrayList();
+ for (PType ptype : ptypes) {
+ fns.add(ptype.getInputMapFn());
+ }
+ }
+
+ @Override
+ public void configure(Configuration conf) {
+ for (MapFn fn : fns) {
+ fn.configure(conf);
+ }
+ }
+
+ @Override
+ public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
+ for (MapFn fn : fns) {
+ fn.setContext(context);
+ }
+ }
+
+ @Override
+ public void initialize() {
+ for (MapFn fn : fns) {
+ fn.initialize();
+ }
+ // The rest of the methods allocate new
+ // objects each time. However this one
+ // uses Tuple.tuplify which does a copy
+ this.values = new Object[fns.size()];
+ tupleFactory.initialize();
+ }
+
+ @Override
+ public Tuple map(TupleWritable in) {
+ for (int i = 0; i < values.length; i++) {
+ if (in.has(i)) {
+ values[i] = fns.get(i).map(in.get(i));
+ } else {
+ values[i] = null;
+ }
+ }
+ return tupleFactory.makeTuple(values);
+ }
+ }
+
+ /**
+ * For mapping from {@code Tuple}s to {@code TupleWritable}s.
+ *
+ */
+ private static class TupleTWMapFn extends MapFn<Tuple, TupleWritable> {
+
+ private transient TupleWritable writable;
+ private transient Writable[] values;
+
+ private final List<MapFn> fns;
+
+ public TupleTWMapFn(PType<?>... ptypes) {
+ this.fns = Lists.newArrayList();
+ for (PType<?> ptype : ptypes) {
+ fns.add(ptype.getOutputMapFn());
+ }
+ }
+
+ @Override
+ public void configure(Configuration conf) {
+ for (MapFn fn : fns) {
+ fn.configure(conf);
+ }
+ }
+
+ @Override
+ public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
+ for (MapFn fn : fns) {
+ fn.setContext(context);
+ }
+ }
+
+ @Override
+ public void initialize() {
+ this.values = new Writable[fns.size()];
+ this.writable = new TupleWritable(values);
+ for (MapFn fn : fns) {
+ fn.initialize();
+ }
+ }
+
+ @Override
+ public TupleWritable map(Tuple input) {
+ writable.clearWritten();
+ for (int i = 0; i < input.size(); i++) {
+ Object value = input.get(i);
+ if (value != null) {
+ writable.setWritten(i);
+ values[i] = (Writable) fns.get(i).map(value);
+ }
+ }
+ return writable;
+ }
+ }
+
+ public static <V1, V2> WritableType<Pair<V1, V2>, TupleWritable> pairs(PType<V1> p1, PType<V2> p2) {
+ TWTupleMapFn input = new TWTupleMapFn(TupleFactory.PAIR, p1, p2);
+ TupleTWMapFn output = new TupleTWMapFn(p1, p2);
+ return new WritableType(Pair.class, TupleWritable.class, input, output, p1, p2);
+ }
+
+ public static <V1, V2, V3> WritableType<Tuple3<V1, V2, V3>, TupleWritable> triples(PType<V1> p1, PType<V2> p2,
+ PType<V3> p3) {
+ TWTupleMapFn input = new TWTupleMapFn(TupleFactory.TUPLE3, p1, p2, p3);
+ TupleTWMapFn output = new TupleTWMapFn(p1, p2, p3);
+ return new WritableType(Tuple3.class, TupleWritable.class, input, output, p1, p2, p3);
+ }
+
+ public static <V1, V2, V3, V4> WritableType<Tuple4<V1, V2, V3, V4>, TupleWritable> quads(PType<V1> p1, PType<V2> p2,
+ PType<V3> p3, PType<V4> p4) {
+ TWTupleMapFn input = new TWTupleMapFn(TupleFactory.TUPLE4, p1, p2, p3, p4);
+ TupleTWMapFn output = new TupleTWMapFn(p1, p2, p3, p4);
+ return new WritableType(Tuple4.class, TupleWritable.class, input, output, p1, p2, p3, p4);
+ }
+
+ public static WritableType<TupleN, TupleWritable> tuples(PType... ptypes) {
+ TWTupleMapFn input = new TWTupleMapFn(TupleFactory.TUPLEN, ptypes);
+ TupleTWMapFn output = new TupleTWMapFn(ptypes);
+ return new WritableType(TupleN.class, TupleWritable.class, input, output, ptypes);
+ }
+
+ public static <T extends Tuple> PType<T> tuples(Class<T> clazz, PType... ptypes) {
+ Class[] typeArgs = new Class[ptypes.length];
+ for (int i = 0; i < typeArgs.length; i++) {
+ typeArgs[i] = ptypes[i].getTypeClass();
+ }
+ TupleFactory<T> factory = TupleFactory.create(clazz, typeArgs);
+ TWTupleMapFn input = new TWTupleMapFn(factory, ptypes);
+ TupleTWMapFn output = new TupleTWMapFn(ptypes);
+ return new WritableType(clazz, TupleWritable.class, input, output, ptypes);
+ }
+
+ public static <S, T> PType<T> derived(Class<T> clazz, MapFn<S, T> inputFn, MapFn<T, S> outputFn, PType<S> base) {
+ WritableType<S, ?> wt = (WritableType<S, ?>) base;
+ MapFn input = new CompositeMapFn(wt.getInputMapFn(), inputFn);
+ MapFn output = new CompositeMapFn(outputFn, wt.getOutputMapFn());
+ return new WritableType(clazz, wt.getSerializationClass(), input, output, base.getSubTypes().toArray(new PType[0]));
+ }
+
+ private static class ArrayCollectionMapFn<T> extends MapFn<GenericArrayWritable, Collection<T>> {
+ private final MapFn<Object, T> mapFn;
+
+ public ArrayCollectionMapFn(MapFn<Object, T> mapFn) {
+ this.mapFn = mapFn;
+ }
+
+ @Override
+ public void configure(Configuration conf) {
+ mapFn.configure(conf);
+ }
+
+ @Override
+ public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
+ mapFn.setContext(context);
+ }
+
+ @Override
+ public void initialize() {
+ mapFn.initialize();
+ }
+
+ @Override
+ public Collection<T> map(GenericArrayWritable input) {
+ Collection<T> collection = Lists.newArrayList();
+ for (Writable writable : input.get()) {
+ collection.add(mapFn.map(writable));
+ }
+ return collection;
+ }
+ }
+
+ private static class CollectionArrayMapFn<T> extends MapFn<Collection<T>, GenericArrayWritable> {
+
+ private final Class<? extends Writable> clazz;
+ private final MapFn<T, Object> mapFn;
+
+ public CollectionArrayMapFn(Class<? extends Writable> clazz, MapFn<T, Object> mapFn) {
+ this.clazz = clazz;
+ this.mapFn = mapFn;
+ }
+
+ @Override
+ public void configure(Configuration conf) {
+ mapFn.configure(conf);
+ }
+
+ @Override
+ public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
+ mapFn.setContext(context);
+ }
+
+ @Override
+ public void initialize() {
+ mapFn.initialize();
+ }
+
+ @Override
+ public GenericArrayWritable map(Collection<T> input) {
+ GenericArrayWritable arrayWritable = new GenericArrayWritable(clazz);
+ Writable[] w = new Writable[input.size()];
+ int index = 0;
+ for (T in : input) {
+ w[index++] = ((Writable) mapFn.map(in));
+ }
+ arrayWritable.set(w);
+ return arrayWritable;
+ }
+ }
+
+ public static <T> WritableType<Collection<T>, GenericArrayWritable<T>> collections(PType<T> ptype) {
+ WritableType<T, ?> wt = (WritableType<T, ?>) ptype;
+ return new WritableType(Collection.class, GenericArrayWritable.class, new ArrayCollectionMapFn(wt.getInputMapFn()),
+ new CollectionArrayMapFn(wt.getSerializationClass(), wt.getOutputMapFn()), ptype);
+ }
+
+ private static class MapInputMapFn<T> extends MapFn<TextMapWritable<Writable>, Map<String, T>> {
+ private final MapFn<Writable, T> mapFn;
+
+ public MapInputMapFn(MapFn<Writable, T> mapFn) {
+ this.mapFn = mapFn;
+ }
+
+ @Override
+ public void configure(Configuration conf) {
+ mapFn.configure(conf);
+ }
+
+ @Override
+ public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
+ mapFn.setContext(context);
+ }
+
+ @Override
+ public void initialize() {
+ mapFn.initialize();
+ }
+
+ @Override
+ public Map<String, T> map(TextMapWritable<Writable> input) {
+ Map<String, T> out = Maps.newHashMap();
+ for (Map.Entry<Text, Writable> e : input.entrySet()) {
+ out.put(e.getKey().toString(), mapFn.map(e.getValue()));
+ }
+ return out;
+ }
+ }
+
+ private static class MapOutputMapFn<T> extends MapFn<Map<String, T>, TextMapWritable<Writable>> {
+
+ private final Class<Writable> clazz;
+ private final MapFn<T, Writable> mapFn;
+
+ public MapOutputMapFn(Class<Writable> clazz, MapFn<T, Writable> mapFn) {
+ this.clazz = clazz;
+ this.mapFn = mapFn;
+ }
+
+ @Override
+ public void configure(Configuration conf) {
+ mapFn.configure(conf);
+ }
+
+ @Override
+ public void setContext(TaskInputOutputContext<?, ?, ?, ?> context) {
+ mapFn.setContext(context);
+ }
+
+ @Override
+ public void initialize() {
+ mapFn.initialize();
+ }
+
+ @Override
+ public TextMapWritable<Writable> map(Map<String, T> input) {
+ TextMapWritable<Writable> tmw = new TextMapWritable<Writable>(clazz);
+ for (Map.Entry<String, T> e : input.entrySet()) {
+ tmw.put(new Text(e.getKey()), mapFn.map(e.getValue()));
+ }
+ return tmw;
+ }
+ }
+
+ public static <T> WritableType<Map<String, T>, MapWritable> maps(PType<T> ptype) {
+ WritableType<T, ?> wt = (WritableType<T, ?>) ptype;
+ return new WritableType(Map.class, TextMapWritable.class, new MapInputMapFn(wt.getInputMapFn()),
+ new MapOutputMapFn(wt.getSerializationClass(), wt.getOutputMapFn()), ptype);
+ }
+
+ public static <T> PType<T> jsons(Class<T> clazz) {
+ return PTypes.jsonString(clazz, WritableTypeFamily.getInstance());
+ }
+
+ // Not instantiable
+ private Writables() {
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/types/writable/package-info.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/types/writable/package-info.java b/crunch-core/src/main/java/org/apache/crunch/types/writable/package-info.java
new file mode 100644
index 0000000..7d54743
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/types/writable/package-info.java
@@ -0,0 +1,22 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Business object serialization using Hadoop's Writables framework.
+ */
+package org.apache.crunch.types.writable;
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/util/CrunchTool.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/util/CrunchTool.java b/crunch-core/src/main/java/org/apache/crunch/util/CrunchTool.java
new file mode 100644
index 0000000..ea66291
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/util/CrunchTool.java
@@ -0,0 +1,118 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.util;
+
+import java.io.Serializable;
+
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.PipelineExecution;
+import org.apache.crunch.PipelineResult;
+import org.apache.crunch.Source;
+import org.apache.crunch.TableSource;
+import org.apache.crunch.Target;
+import org.apache.crunch.impl.mem.MemPipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.io.At;
+import org.apache.crunch.io.From;
+import org.apache.crunch.io.To;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.util.Tool;
+
+/**
+ * An extension of the {@code Tool} interface that creates a {@code Pipeline}
+ * instance and provides methods for working with the Pipeline from inside of
+ * the Tool's run method.
+ *
+ */
+public abstract class CrunchTool extends Configured implements Tool, Serializable {
+
+ protected static final From from = new From();
+ protected static final To to = new To();
+ protected static final At at = new At();
+
+ // Pipeline object itself isn't necessarily serializable.
+ private transient Pipeline pipeline;
+
+ public CrunchTool() {
+ this(false);
+ }
+
+ public CrunchTool(boolean inMemory) {
+ this.pipeline = inMemory ? MemPipeline.getInstance() : new MRPipeline(getClass());
+ }
+
+ @Override
+ public void setConf(Configuration conf) {
+ super.setConf(conf);
+ if (conf != null && pipeline != null) {
+ pipeline.setConfiguration(conf);
+ }
+ }
+
+ @Override
+ public Configuration getConf() {
+ return pipeline.getConfiguration();
+ }
+
+ public void enableDebug() {
+ pipeline.enableDebug();
+ }
+
+ public <T> PCollection<T> read(Source<T> source) {
+ return pipeline.read(source);
+ }
+
+ public <K, V> PTable<K, V> read(TableSource<K, V> tableSource) {
+ return pipeline.read(tableSource);
+ }
+
+ public PCollection<String> readTextFile(String pathName) {
+ return pipeline.readTextFile(pathName);
+ }
+
+ public void write(PCollection<?> pcollection, Target target) {
+ pipeline.write(pcollection, target);
+ }
+
+ public void writeTextFile(PCollection<?> pcollection, String pathName) {
+ pipeline.writeTextFile(pcollection, pathName);
+ }
+
+ public <T> Iterable<T> materialize(PCollection<T> pcollection) {
+ return pipeline.materialize(pcollection);
+ }
+
+ public PipelineResult run() {
+ return pipeline.run();
+ }
+
+ public PipelineExecution runAsync() {
+ return pipeline.runAsync();
+ }
+
+ public PipelineResult done() {
+ return pipeline.done();
+ }
+
+ protected Pipeline getPipeline() {
+ return pipeline;
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/util/DistCache.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/util/DistCache.java b/crunch-core/src/main/java/org/apache/crunch/util/DistCache.java
new file mode 100644
index 0000000..3e49930
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/util/DistCache.java
@@ -0,0 +1,231 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.util;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.net.URI;
+import java.net.URL;
+import java.net.URLDecoder;
+import java.util.Enumeration;
+
+import org.apache.crunch.CrunchRuntimeException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.filecache.DistributedCache;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * Provides functions for working with Hadoop's distributed cache. These
+ * include:
+ * <ul>
+ * <li>
+ * Functions for working with a job-specific distributed cache of objects, like
+ * the serialized runtime nodes in a MapReduce.</li>
+ * <li>
+ * Functions for adding library jars to the distributed cache, which will be
+ * added to the classpath of MapReduce tasks.</li>
+ * </ul>
+ */
+public class DistCache {
+
+ // Configuration key holding the paths of jars to export to the distributed
+ // cache.
+ private static final String TMPJARS_KEY = "tmpjars";
+
+ public static void write(Configuration conf, Path path, Object value) throws IOException {
+ ObjectOutputStream oos = new ObjectOutputStream(path.getFileSystem(conf).create(path));
+ oos.writeObject(value);
+ oos.close();
+
+ DistributedCache.addCacheFile(path.toUri(), conf);
+ }
+
+ public static Object read(Configuration conf, Path path) throws IOException {
+ URI target = null;
+ for (URI uri : DistributedCache.getCacheFiles(conf)) {
+ if (uri.toString().equals(path.toString())) {
+ target = uri;
+ break;
+ }
+ }
+ Object value = null;
+ if (target != null) {
+ Path targetPath = new Path(target.toString());
+ ObjectInputStream ois = new ObjectInputStream(targetPath.getFileSystem(conf).open(targetPath));
+ try {
+ value = ois.readObject();
+ } catch (ClassNotFoundException e) {
+ throw new CrunchRuntimeException(e);
+ }
+ ois.close();
+ }
+ return value;
+ }
+
+ public static void addCacheFile(Path path, Configuration conf) {
+ DistributedCache.addCacheFile(path.toUri(), conf);
+ }
+
+ public static Path getPathToCacheFile(Path path, Configuration conf) {
+ try {
+ for (Path localPath : DistributedCache.getLocalCacheFiles(conf)) {
+ if (localPath.toString().endsWith(path.getName())) {
+ return localPath.makeQualified(FileSystem.getLocal(conf));
+ }
+ }
+ } catch (IOException e) {
+ throw new CrunchRuntimeException(e);
+ }
+ return null;
+ }
+
+ /**
+ * Adds the specified jar to the distributed cache of jobs using the provided
+ * configuration. The jar will be placed on the classpath of tasks run by the
+ * job.
+ *
+ * @param conf
+ * The configuration used to add the jar to the distributed cache.
+ * @param jarFile
+ * The jar file to add to the distributed cache.
+ * @throws IOException
+ * If the jar file does not exist or there is a problem accessing
+ * the file.
+ */
+ public static void addJarToDistributedCache(Configuration conf, File jarFile) throws IOException {
+ if (!jarFile.exists()) {
+ throw new IOException("Jar file: " + jarFile.getCanonicalPath() + " does not exist.");
+ }
+ if (!jarFile.getName().endsWith(".jar")) {
+ throw new IllegalArgumentException("File: " + jarFile.getCanonicalPath() + " is not a .jar " + "file.");
+ }
+ // Get a qualified path for the jar.
+ FileSystem fileSystem = FileSystem.getLocal(conf);
+ Path jarPath = new Path(jarFile.getCanonicalPath());
+ String qualifiedPath = jarPath.makeQualified(fileSystem).toString();
+ // Add the jar to the configuration variable.
+ String jarConfiguration = conf.get(TMPJARS_KEY, "");
+ if (!jarConfiguration.isEmpty()) {
+ jarConfiguration += ",";
+ }
+ jarConfiguration += qualifiedPath;
+ conf.set(TMPJARS_KEY, jarConfiguration);
+ }
+
+ /**
+ * Adds the jar at the specified path to the distributed cache of jobs using
+ * the provided configuration. The jar will be placed on the classpath of
+ * tasks run by the job.
+ *
+ * @param conf
+ * The configuration used to add the jar to the distributed cache.
+ * @param jarFile
+ * The path to the jar file to add to the distributed cache.
+ * @throws IOException
+ * If the jar file does not exist or there is a problem accessing
+ * the file.
+ */
+ public static void addJarToDistributedCache(Configuration conf, String jarFile) throws IOException {
+ addJarToDistributedCache(conf, new File(jarFile));
+ }
+
+ /**
+ * Finds the path to a jar that contains the class provided, if any. There is
+ * no guarantee that the jar returned will be the first on the classpath to
+ * contain the file. This method is basically lifted out of Hadoop's
+ * {@link org.apache.hadoop.mapred.JobConf} class.
+ *
+ * @param jarClass
+ * The class the jar file should contain.
+ * @return The path to a jar file that contains the class, or
+ * <code>null</code> if no such jar exists.
+ * @throws IOException
+ * If there is a problem searching for the jar file.
+ */
+ public static String findContainingJar(Class<?> jarClass) throws IOException {
+ ClassLoader loader = jarClass.getClassLoader();
+ String classFile = jarClass.getName().replaceAll("\\.", "/") + ".class";
+ for (Enumeration<URL> itr = loader.getResources(classFile); itr.hasMoreElements();) {
+ URL url = itr.nextElement();
+ if ("jar".equals(url.getProtocol())) {
+ String toReturn = url.getPath();
+ if (toReturn.startsWith("file:")) {
+ toReturn = toReturn.substring("file:".length());
+ }
+ // URLDecoder is a misnamed class, since it actually decodes
+ // x-www-form-urlencoded MIME type rather than actual
+ // URL encoding (which the file path has). Therefore it would
+ // decode +s to ' 's which is incorrect (spaces are actually
+ // either unencoded or encoded as "%20"). Replace +s first, so
+ // that they are kept sacred during the decoding process.
+ toReturn = toReturn.replaceAll("\\+", "%2B");
+ toReturn = URLDecoder.decode(toReturn, "UTF-8");
+ return toReturn.replaceAll("!.*$", "");
+ }
+ }
+ return null;
+ }
+
+ /**
+ * Adds all jars under the specified directory to the distributed cache of
+ * jobs using the provided configuration. The jars will be placed on the
+ * classpath of tasks run by the job. This method does not descend into
+ * subdirectories when adding jars.
+ *
+ * @param conf
+ * The configuration used to add jars to the distributed cache.
+ * @param jarDirectory
+ * A directory containing jar files to add to the distributed cache.
+ * @throws IOException
+ * If the directory does not exist or there is a problem accessing
+ * the directory.
+ */
+ public static void addJarDirToDistributedCache(Configuration conf, File jarDirectory) throws IOException {
+ if (!jarDirectory.exists() || !jarDirectory.isDirectory()) {
+ throw new IOException("Jar directory: " + jarDirectory.getCanonicalPath() + " does not "
+ + "exist or is not a directory.");
+ }
+ for (File file : jarDirectory.listFiles()) {
+ if (!file.isDirectory() && file.getName().endsWith(".jar")) {
+ addJarToDistributedCache(conf, file);
+ }
+ }
+ }
+
+ /**
+ * Adds all jars under the directory at the specified path to the distributed
+ * cache of jobs using the provided configuration. The jars will be placed on
+ * the classpath of the tasks run by the job. This method does not descend
+ * into subdirectories when adding jars.
+ *
+ * @param conf
+ * The configuration used to add jars to the distributed cache.
+ * @param jarDirectory
+ * The path to a directory containing jar files to add to the
+ * distributed cache.
+ * @throws IOException
+ * If the directory does not exist or there is a problem accessing
+ * the directory.
+ */
+ public static void addJarDirToDistributedCache(Configuration conf, String jarDirectory) throws IOException {
+ addJarDirToDistributedCache(conf, new File(jarDirectory));
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/util/PartitionUtils.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/util/PartitionUtils.java b/crunch-core/src/main/java/org/apache/crunch/util/PartitionUtils.java
new file mode 100644
index 0000000..da8db6b
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/util/PartitionUtils.java
@@ -0,0 +1,34 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.util;
+
+import org.apache.crunch.PCollection;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ *
+ */
+public class PartitionUtils {
+ public static final String BYTES_PER_REDUCE_TASK = "crunch.bytes.per.reduce.task";
+ public static final long DEFAULT_BYTES_PER_REDUCE_TASK = 1000L * 1000L * 1000L;
+
+ public static <T> int getRecommendedPartitions(PCollection<T> pcollection, Configuration conf) {
+ long bytesPerTask = conf.getLong(BYTES_PER_REDUCE_TASK, DEFAULT_BYTES_PER_REDUCE_TASK);
+ return 1 + (int) (pcollection.getSize() / bytesPerTask);
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/util/Tuples.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/util/Tuples.java b/crunch-core/src/main/java/org/apache/crunch/util/Tuples.java
new file mode 100644
index 0000000..9c8d7bd
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/util/Tuples.java
@@ -0,0 +1,150 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.util;
+
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.crunch.Pair;
+import org.apache.crunch.Tuple3;
+import org.apache.crunch.Tuple4;
+import org.apache.crunch.TupleN;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.UnmodifiableIterator;
+
+/**
+ * Utilities for working with subclasses of the {@code Tuple} interface.
+ *
+ */
+public class Tuples {
+
+ private static abstract class TuplifyIterator<T> extends UnmodifiableIterator<T> {
+ protected List<Iterator<?>> iterators;
+
+ public TuplifyIterator(Iterator<?>... iterators) {
+ this.iterators = Lists.newArrayList(iterators);
+ }
+
+ @Override
+ public boolean hasNext() {
+ for (Iterator<?> iter : iterators) {
+ if (!iter.hasNext()) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ protected Object next(int index) {
+ return iterators.get(index).next();
+ }
+ }
+
+ public static class PairIterable<S, T> implements Iterable<Pair<S, T>> {
+ private final Iterable<S> first;
+ private final Iterable<T> second;
+
+ public PairIterable(Iterable<S> first, Iterable<T> second) {
+ this.first = first;
+ this.second = second;
+ }
+
+ @Override
+ public Iterator<Pair<S, T>> iterator() {
+ return new TuplifyIterator<Pair<S, T>>(first.iterator(), second.iterator()) {
+ @Override
+ public Pair<S, T> next() {
+ return Pair.of((S) next(0), (T) next(1));
+ }
+ };
+ }
+ }
+
+ public static class TripIterable<A, B, C> implements Iterable<Tuple3<A, B, C>> {
+ private final Iterable<A> first;
+ private final Iterable<B> second;
+ private final Iterable<C> third;
+
+ public TripIterable(Iterable<A> first, Iterable<B> second, Iterable<C> third) {
+ this.first = first;
+ this.second = second;
+ this.third = third;
+ }
+
+ @Override
+ public Iterator<Tuple3<A, B, C>> iterator() {
+ return new TuplifyIterator<Tuple3<A, B, C>>(first.iterator(), second.iterator(), third.iterator()) {
+ @Override
+ public Tuple3<A, B, C> next() {
+ return new Tuple3<A, B, C>((A) next(0), (B) next(1), (C) next(2));
+ }
+ };
+ }
+ }
+
+ public static class QuadIterable<A, B, C, D> implements Iterable<Tuple4<A, B, C, D>> {
+ private final Iterable<A> first;
+ private final Iterable<B> second;
+ private final Iterable<C> third;
+ private final Iterable<D> fourth;
+
+ public QuadIterable(Iterable<A> first, Iterable<B> second, Iterable<C> third, Iterable<D> fourth) {
+ this.first = first;
+ this.second = second;
+ this.third = third;
+ this.fourth = fourth;
+ }
+
+ @Override
+ public Iterator<Tuple4<A, B, C, D>> iterator() {
+ return new TuplifyIterator<Tuple4<A, B, C, D>>(first.iterator(), second.iterator(), third.iterator(),
+ fourth.iterator()) {
+ @Override
+ public Tuple4<A, B, C, D> next() {
+ return new Tuple4<A, B, C, D>((A) next(0), (B) next(1), (C) next(2), (D) next(3));
+ }
+ };
+ }
+ }
+
+ public static class TupleNIterable implements Iterable<TupleN> {
+ private final Iterator<?>[] iters;
+
+ public TupleNIterable(Iterable<?>... iterables) {
+ this.iters = new Iterator[iterables.length];
+ for (int i = 0; i < iters.length; i++) {
+ iters[i] = iterables[i].iterator();
+ }
+ }
+
+ @Override
+ public Iterator<TupleN> iterator() {
+ return new TuplifyIterator<TupleN>(iters) {
+ @Override
+ public TupleN next() {
+ Object[] values = new Object[iters.length];
+ for (int i = 0; i < values.length; i++) {
+ values[i] = next(i);
+ }
+ return new TupleN(values);
+ }
+ };
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/java/org/apache/crunch/util/package-info.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/java/org/apache/crunch/util/package-info.java b/crunch-core/src/main/java/org/apache/crunch/util/package-info.java
new file mode 100644
index 0000000..94d79a1
--- /dev/null
+++ b/crunch-core/src/main/java/org/apache/crunch/util/package-info.java
@@ -0,0 +1,22 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * An assorted set of utilities.
+ */
+package org.apache.crunch.util;
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/main/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/crunch-core/src/main/resources/log4j.properties b/crunch-core/src/main/resources/log4j.properties
new file mode 100644
index 0000000..506b527
--- /dev/null
+++ b/crunch-core/src/main/resources/log4j.properties
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ***** Set the org.apache.crunch logger level to INFO and its only appender to A.
+log4j.logger.org.apache.crunch=info, A
+
+# ***** A is set to be a ConsoleAppender.
+log4j.appender.A=org.apache.log4j.ConsoleAppender
+# ***** A uses PatternLayout.
+log4j.appender.A.layout=org.apache.log4j.PatternLayout
+log4j.appender.A.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/site/site.xml
----------------------------------------------------------------------
diff --git a/crunch-core/src/site/site.xml b/crunch-core/src/site/site.xml
new file mode 100644
index 0000000..73fbd17
--- /dev/null
+++ b/crunch-core/src/site/site.xml
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="${project.name}"
+ xmlns="http://maven.apache.org/DECORATION/1.3.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/DECORATION/1.3.0
+ http://maven.apache.org/xsd/decoration-1.3.0.xsd">
+
+ <body>
+ <!-- Note: Breadcrumbs for Doxia's Markdown parser are currently broken,
+ see https://jira.codehaus.org/browse/DOXIA-472 -->
+ <breadcrumbs>
+ <item name="Apache" href="http://www.apache.org/index.html" />
+ <item name="Crunch" href="../index.html"/>
+ </breadcrumbs>
+
+ </body>
+
+</project>
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/avro/employee.avsc
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/avro/employee.avsc b/crunch-core/src/test/avro/employee.avsc
new file mode 100644
index 0000000..35726e1
--- /dev/null
+++ b/crunch-core/src/test/avro/employee.avsc
@@ -0,0 +1,26 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+{
+"namespace": "org.apache.crunch.test",
+"name": "Employee",
+"type": "record",
+"fields": [
+ {"name": "name", "type": ["string", "null"] },
+ {"name": "salary", "type": "int"},
+ {"name": "department", "type": ["string", "null"] } ]
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/avro/person.avsc
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/avro/person.avsc b/crunch-core/src/test/avro/person.avsc
new file mode 100644
index 0000000..babd808
--- /dev/null
+++ b/crunch-core/src/test/avro/person.avsc
@@ -0,0 +1,26 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+{
+"namespace": "org.apache.crunch.test",
+"name": "Person",
+"type": "record",
+"fields": [
+ {"name": "name", "type": ["string", "null"] },
+ {"name": "age", "type": "int"},
+ {"name": "siblingnames", "type": {"type": "array", "items": "string"}} ]
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/AndFnTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/AndFnTest.java b/crunch-core/src/test/java/org/apache/crunch/AndFnTest.java
new file mode 100644
index 0000000..4b00874
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/AndFnTest.java
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+import org.apache.crunch.FilterFn.AndFn;
+import org.apache.hadoop.mapreduce.TaskInputOutputContext;
+import org.junit.Before;
+import org.junit.Test;
+
+ /**
+  * Tests {@code AndFn} against two mocked {@code FilterFn} delegates.
+  */
+ public class AndFnTest {
+
+   private FilterFn<Integer> fnA;
+   private FilterFn<Integer> fnB;
+   private AndFn<Integer> andFn;
+
+   // Fresh mocks for every test so stubbing and verification never leak.
+   @Before
+   public void setUp() {
+     fnA = mock(FilterFn.class);
+     fnB = mock(FilterFn.class);
+     andFn = new AndFn(fnA, fnB);
+   }
+
+   // setContext must reach both delegates with the same context object.
+   @Test
+   public void testSetContext() {
+     TaskInputOutputContext<?, ?, ?, ?> taskContext = mock(TaskInputOutputContext.class);
+
+     andFn.setContext(taskContext);
+
+     verify(fnA).setContext(taskContext);
+     verify(fnB).setContext(taskContext);
+   }
+
+   // One rejecting delegate is enough for the conjunction to reject.
+   @Test
+   public void testAccept_False() {
+     when(fnA.accept(1)).thenReturn(true);
+     when(fnB.accept(1)).thenReturn(false);
+
+     boolean accepted = andFn.accept(1);
+
+     assertFalse(accepted);
+   }
+
+   // The value is accepted when both delegates accept it.
+   @Test
+   public void testAccept_True() {
+     when(fnA.accept(1)).thenReturn(true);
+     when(fnB.accept(1)).thenReturn(true);
+
+     boolean accepted = andFn.accept(1);
+
+     assertTrue(accepted);
+   }
+
+   // cleanup(Emitter) on the AndFn must trigger cleanup() on both delegates.
+   @Test
+   public void testCleanup() {
+     Emitter emitter = mock(Emitter.class);
+
+     andFn.cleanup(emitter);
+
+     verify(fnA).cleanup();
+     verify(fnB).cleanup();
+   }
+
+ }
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/CombineFnTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/CombineFnTest.java b/crunch-core/src/test/java/org/apache/crunch/CombineFnTest.java
new file mode 100644
index 0000000..39548e2
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/CombineFnTest.java
@@ -0,0 +1,222 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.apache.crunch.CombineFn.MAX_BIGINTS;
+import static org.apache.crunch.CombineFn.MAX_DOUBLES;
+import static org.apache.crunch.CombineFn.MAX_FLOATS;
+import static org.apache.crunch.CombineFn.MAX_INTS;
+import static org.apache.crunch.CombineFn.MAX_LONGS;
+import static org.apache.crunch.CombineFn.MIN_BIGINTS;
+import static org.apache.crunch.CombineFn.MIN_DOUBLES;
+import static org.apache.crunch.CombineFn.MIN_FLOATS;
+import static org.apache.crunch.CombineFn.MIN_INTS;
+import static org.apache.crunch.CombineFn.MIN_LONGS;
+import static org.apache.crunch.CombineFn.SUM_BIGINTS;
+import static org.apache.crunch.CombineFn.SUM_DOUBLES;
+import static org.apache.crunch.CombineFn.SUM_FLOATS;
+import static org.apache.crunch.CombineFn.SUM_INTS;
+import static org.apache.crunch.CombineFn.SUM_LONGS;
+import static org.junit.Assert.assertEquals;
+
+import java.math.BigInteger;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.crunch.CombineFn.Aggregator;
+import org.apache.crunch.CombineFn.AggregatorFactory;
+import org.apache.crunch.CombineFn.FirstNAggregator;
+import org.apache.crunch.CombineFn.LastNAggregator;
+import org.apache.crunch.CombineFn.MaxNAggregator;
+import org.apache.crunch.CombineFn.MinNAggregator;
+import org.apache.crunch.CombineFn.PairAggregator;
+import org.apache.crunch.CombineFn.QuadAggregator;
+import org.apache.crunch.CombineFn.StringConcatAggregator;
+import org.apache.crunch.CombineFn.TripAggregator;
+import org.apache.crunch.CombineFn.TupleNAggregator;
+import org.junit.Test;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Iterables;
+
+public class CombineFnTest { // unit tests for the aggregators declared on CombineFn
+ // Convenience overload: creates an Aggregator from the factory and runs it through the helper below.
+ private <T> Iterable<T> applyAggregator(AggregatorFactory<T> a, Iterable<T> values) {
+ return applyAggregator(a.create(), values);
+ }
+ // Resets the aggregator, feeds it every value in iteration order, and returns what results() reports.
+ private <T> Iterable<T> applyAggregator(Aggregator<T> a, Iterable<T> values) {
+ a.reset();
+ for (T value : values) {
+ a.update(value);
+ }
+ return a.results();
+ }
+ // SUM_* aggregators are expected to collapse their inputs into a single total, for each numeric type.
+ @Test
+ public void testSums() {
+ assertEquals(ImmutableList.of(1775L), applyAggregator(SUM_LONGS, ImmutableList.of(29L, 17L, 1729L)));
+
+ assertEquals(ImmutableList.of(1765L), applyAggregator(SUM_LONGS, ImmutableList.of(29L, 7L, 1729L)));
+
+ assertEquals(ImmutableList.of(1775), applyAggregator(SUM_INTS, ImmutableList.of(29, 17, 1729)));
+
+ assertEquals(ImmutableList.of(1775.0f), applyAggregator(SUM_FLOATS, ImmutableList.of(29f, 17f, 1729f)));
+
+ assertEquals(ImmutableList.of(1775.0), applyAggregator(SUM_DOUBLES, ImmutableList.of(29.0, 17.0, 1729.0)));
+
+ assertEquals(
+ ImmutableList.of(new BigInteger("1775")),
+ applyAggregator(SUM_BIGINTS,
+ ImmutableList.of(new BigInteger("29"), new BigInteger("17"), new BigInteger("1729"))));
+ }
+ // MAX_* aggregators are expected to yield the single largest input, wherever it sits in the sequence.
+ @Test
+ public void testMax() {
+ assertEquals(ImmutableList.of(1729L), applyAggregator(MAX_LONGS, ImmutableList.of(29L, 17L, 1729L)));
+
+ assertEquals(ImmutableList.of(1729), applyAggregator(MAX_INTS, ImmutableList.of(29, 17, 1729)));
+
+ assertEquals(ImmutableList.of(1729.0f), applyAggregator(MAX_FLOATS, ImmutableList.of(29f, 17f, 1729f)));
+
+ assertEquals(ImmutableList.of(1729.0), applyAggregator(MAX_DOUBLES, ImmutableList.of(29.0, 17.0, 1729.0)));
+
+ assertEquals(ImmutableList.of(1745.0f), applyAggregator(MAX_FLOATS, ImmutableList.of(29f, 1745f, 17f, 1729f)));
+
+ assertEquals(
+ ImmutableList.of(new BigInteger("1729")),
+ applyAggregator(MAX_BIGINTS,
+ ImmutableList.of(new BigInteger("29"), new BigInteger("17"), new BigInteger("1729"))));
+ }
+ // MIN_* aggregators are expected to yield the single smallest input, including when it comes first.
+ @Test
+ public void testMin() {
+ assertEquals(ImmutableList.of(17L), applyAggregator(MIN_LONGS, ImmutableList.of(29L, 17L, 1729L)));
+
+ assertEquals(ImmutableList.of(17), applyAggregator(MIN_INTS, ImmutableList.of(29, 17, 1729)));
+
+ assertEquals(ImmutableList.of(17.0f), applyAggregator(MIN_FLOATS, ImmutableList.of(29f, 17f, 1729f)));
+
+ assertEquals(ImmutableList.of(17.0), applyAggregator(MIN_DOUBLES, ImmutableList.of(29.0, 17.0, 1729.0)));
+
+ assertEquals(ImmutableList.of(29), applyAggregator(MIN_INTS, ImmutableList.of(29, 170, 1729)));
+
+ assertEquals(
+ ImmutableList.of(new BigInteger("17")),
+ applyAggregator(MIN_BIGINTS,
+ ImmutableList.of(new BigInteger("29"), new BigInteger("17"), new BigInteger("1729"))));
+ }
+ // MaxNAggregator(2): the two largest inputs are expected, listed here in ascending order.
+ @Test
+ public void testMaxN() {
+ assertEquals(ImmutableList.of(98, 1009),
+ applyAggregator(new MaxNAggregator<Integer>(2), ImmutableList.of(17, 34, 98, 29, 1009)));
+ }
+ // MinNAggregator(2): the two smallest inputs are expected, listed here in ascending order.
+ @Test
+ public void testMinN() {
+ assertEquals(ImmutableList.of(17, 29),
+ applyAggregator(new MinNAggregator<Integer>(2), ImmutableList.of(17, 34, 98, 29, 1009)));
+ }
+ // FirstNAggregator(2): the first two inputs are expected, in arrival order.
+ @Test
+ public void testFirstN() {
+ assertEquals(ImmutableList.of(17, 34),
+ applyAggregator(new FirstNAggregator<Integer>(2), ImmutableList.of(17, 34, 98, 29, 1009)));
+ }
+ // LastNAggregator(2): the final two inputs are expected, in arrival order.
+ @Test
+ public void testLastN() {
+ assertEquals(ImmutableList.of(29, 1009),
+ applyAggregator(new LastNAggregator<Integer>(2), ImmutableList.of(17, 34, 98, 29, 1009)));
+ }
+ // PairAggregator with a different aggregator per side: sum of the longs, min of the doubles.
+ @Test
+ public void testPairs() {
+ List<Pair<Long, Double>> input = ImmutableList.of(Pair.of(1720L, 17.29), Pair.of(9L, -3.14));
+ Aggregator<Pair<Long, Double>> a = new PairAggregator<Long, Double>(SUM_LONGS.create(), MIN_DOUBLES.create());
+ assertEquals(Pair.of(1729L, -3.14), Iterables.getOnlyElement(applyAggregator(a, input)));
+ }
+ // Same aggregator kind on both sides (two SUM_LONGS.create() instances); each component is summed separately.
+ @Test
+ public void testPairsTwoLongs() {
+ List<Pair<Long, Long>> input = ImmutableList.of(Pair.of(1720L, 1L), Pair.of(9L, 19L));
+ Aggregator<Pair<Long, Long>> a = new PairAggregator<Long, Long>(SUM_LONGS.create(), SUM_LONGS.create());
+ assertEquals(Pair.of(1729L, 20L), Iterables.getOnlyElement(applyAggregator(a, input)));
+ }
+ // TripAggregator, one aggregator per component: max float, max double, min double.
+ @Test
+ public void testTrips() {
+ List<Tuple3<Float, Double, Double>> input = ImmutableList.of(Tuple3.of(17.29f, 12.2, 0.1),
+ Tuple3.of(3.0f, 1.2, 3.14), Tuple3.of(-1.0f, 14.5, -0.98));
+ Aggregator<Tuple3<Float, Double, Double>> a = new TripAggregator<Float, Double, Double>(MAX_FLOATS.create(),
+ MAX_DOUBLES.create(), MIN_DOUBLES.create());
+ assertEquals(Tuple3.of(17.29f, 14.5, -0.98), Iterables.getOnlyElement(applyAggregator(a, input)));
+ }
+ // QuadAggregator, one aggregator per component: max float, max double, min double, sum of ints.
+ @Test
+ public void testQuads() {
+ List<Tuple4<Float, Double, Double, Integer>> input = ImmutableList.of(Tuple4.of(17.29f, 12.2, 0.1, 1),
+ Tuple4.of(3.0f, 1.2, 3.14, 2), Tuple4.of(-1.0f, 14.5, -0.98, 3));
+ Aggregator<Tuple4<Float, Double, Double, Integer>> a = new QuadAggregator<Float, Double, Double, Integer>(
+ MAX_FLOATS.create(), MAX_DOUBLES.create(), MIN_DOUBLES.create(), SUM_INTS.create());
+ assertEquals(Tuple4.of(17.29f, 14.5, -0.98, 6), Iterables.getOnlyElement(applyAggregator(a, input)));
+ }
+ // TupleNAggregator, one aggregator per position: min int, sum double, max int, min double, max long.
+ @Test
+ public void testTupleN() {
+ List<TupleN> input = ImmutableList.of(new TupleN(1, 3.0, 1, 2.0, 4L), new TupleN(4, 17.0, 1, 9.7, 12L));
+ Aggregator<TupleN> a = new TupleNAggregator(MIN_INTS.create(), SUM_DOUBLES.create(), MAX_INTS.create(),
+ MIN_DOUBLES.create(), MAX_LONGS.create());
+ assertEquals(new TupleN(1, 20.0, 1, 2.0, 12L), Iterables.getOnlyElement(applyAggregator(a, input)));
+ }
+ // StringConcatAggregator cases: separators, the boolean flag, and the two numeric limits (meanings defined in CombineFn).
+ @Test
+ public void testConcatenation() {
+ String[] arrayNull = new String[] { null, "" };
+ assertEquals(ImmutableList.of("foofoobarbar"), applyAggregator(
+ new StringConcatAggregator("", true), ImmutableList.of("foo", "foobar", "bar")));
+ assertEquals(ImmutableList.of("foo/foobar/bar"), applyAggregator(
+ new StringConcatAggregator("/", false), ImmutableList.of("foo", "foobar", "bar")));
+ assertEquals(ImmutableList.of(" "), applyAggregator(
+ new StringConcatAggregator(" ", true), ImmutableList.of(" ", "")));
+ assertEquals(ImmutableList.of(""), applyAggregator(
+ new StringConcatAggregator(" ", true), Arrays.asList(arrayNull)));
+ assertEquals(ImmutableList.of("foo bar"), applyAggregator(
+ new StringConcatAggregator(" ", true, 20, 3), ImmutableList.of("foo", "foobar", "bar")));
+ assertEquals(ImmutableList.of("foo foobar"), applyAggregator(
+ new StringConcatAggregator(" ", true, 10, 6), ImmutableList.of("foo", "foobar", "bar")));
+ assertEquals(ImmutableList.of("foo bar"), applyAggregator(
+ new StringConcatAggregator(" ", true, 9, 6), ImmutableList.of("foo", "foobar", "bar")));
+ }
+ // Reusing one aggregator must give the same answer twice; applyAggregator calls reset() before each pass.
+ @Test
+ public void testConcatenationReset() {
+ StringConcatAggregator a = new StringConcatAggregator(" ", true, 10, 6);
+
+ assertEquals(ImmutableList.of("foo foobar"), applyAggregator(a, ImmutableList.of("foo", "foobar", "bar")));
+ assertEquals(ImmutableList.of("foo foobar"), applyAggregator(a, ImmutableList.of("foo", "foobar", "bar")));
+ }
+ // With the boolean flag set to false, a null input is expected to raise NullPointerException.
+ @Test(expected = NullPointerException.class)
+ public void testConcatenationNullException() {
+ String[] arrayNull = new String[] { null, "" };
+ assertEquals(ImmutableList.of(""), applyAggregator(
+ new StringConcatAggregator(" ", false), Arrays.asList(arrayNull)));
+ }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/NotFnTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/NotFnTest.java b/crunch-core/src/test/java/org/apache/crunch/NotFnTest.java
new file mode 100644
index 0000000..8af17a2
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/NotFnTest.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.junit.Assert.*;
+import static org.junit.Assert.fail;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+import org.apache.crunch.FilterFn.NotFn;
+import org.apache.hadoop.mapreduce.TaskInputOutputContext;
+import org.junit.Before;
+import org.junit.Test;
+
+/** Unit tests for {@link NotFn}: delegation to the wrapped filter and negation of its result. */
+public class NotFnTest {
+  private FilterFn<Integer> base;
+  private NotFn<Integer> notFn;
+  /** Wraps a mocked filter in the {@code NotFn} under test. */
+  @Before
+  @SuppressWarnings("unchecked") // the mock is created from a raw class literal
+  public void setUp() {
+    base = mock(FilterFn.class);
+    notFn = new NotFn<Integer>(base);
+  }
+  /** {@code setContext} must be forwarded to the wrapped filter. */
+  @Test
+  public void testSetContext() {
+    TaskInputOutputContext<?, ?, ?, ?> context = mock(TaskInputOutputContext.class);
+
+    notFn.setContext(context);
+
+    verify(base).setContext(context);
+  }
+  /** When the wrapped filter accepts, the negation rejects. */
+  @Test
+  public void testAccept_True() {
+    when(base.accept(1)).thenReturn(true);
+
+    assertFalse(notFn.accept(1));
+  }
+  /** When the wrapped filter rejects, the negation accepts. */
+  @Test
+  public void testAccept_False() {
+    when(base.accept(1)).thenReturn(false);
+
+    assertTrue(notFn.accept(1));
+  }
+  /** {@code cleanup(Emitter)} must trigger the no-arg {@code cleanup()} of the wrapped filter. */
+  @Test
+  @SuppressWarnings("unchecked") // the Emitter mock is created from a raw class literal
+  public void testCleanupEmitterOfT() {
+    notFn.cleanup(mock(Emitter.class));
+    verify(base).cleanup();
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/OrFnTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/OrFnTest.java b/crunch-core/src/test/java/org/apache/crunch/OrFnTest.java
new file mode 100644
index 0000000..fde2376
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/OrFnTest.java
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+import org.apache.crunch.FilterFn.OrFn;
+import org.apache.hadoop.mapreduce.TaskInputOutputContext;
+import org.junit.Before;
+import org.junit.Test;
+
+/** Unit tests for {@link OrFn}: delegation to its wrapped filters and OR-combination of their results. */
+public class OrFnTest {
+  private FilterFn<Integer> fnA;
+  private FilterFn<Integer> fnB;
+  private OrFn<Integer> orFn;
+  /** Wraps two mocked filters in the {@code OrFn} under test. */
+  @Before
+  @SuppressWarnings("unchecked") // the mocks are created from a raw class literal
+  public void setUp() {
+    fnA = mock(FilterFn.class);
+    fnB = mock(FilterFn.class);
+    orFn = new OrFn<Integer>(fnA, fnB);
+  }
+  /** {@code setContext} must be forwarded to both wrapped filters. */
+  @Test
+  public void testSetContext() {
+    TaskInputOutputContext<?, ?, ?, ?> context = mock(TaskInputOutputContext.class);
+
+    orFn.setContext(context);
+
+    verify(fnA).setContext(context);
+    verify(fnB).setContext(context);
+  }
+  /** One accepting filter is enough for the combination to accept. */
+  @Test
+  public void testAccept_True() {
+    when(fnA.accept(1)).thenReturn(false);
+    when(fnB.accept(1)).thenReturn(true);
+
+    assertTrue(orFn.accept(1));
+  }
+  /** The combination rejects when both filters reject. */
+  @Test
+  public void testAccept_False() {
+    when(fnA.accept(1)).thenReturn(false);
+    when(fnB.accept(1)).thenReturn(false);
+
+    assertFalse(orFn.accept(1));
+  }
+  /** {@code cleanup(Emitter)} must trigger the no-arg {@code cleanup()} of both wrapped filters. */
+  @Test
+  @SuppressWarnings("unchecked") // the Emitter mock is created from a raw class literal
+  public void testCleanupEmitterOfT() {
+    orFn.cleanup(mock(Emitter.class));
+    verify(fnA).cleanup();
+    verify(fnB).cleanup();
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/PairTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/PairTest.java b/crunch-core/src/test/java/org/apache/crunch/PairTest.java
new file mode 100644
index 0000000..106413c
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/PairTest.java
@@ -0,0 +1,66 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import org.junit.Test;
+
+/** Unit tests for {@link Pair}: construction, element access, equality and ordering. */
+public class PairTest {
+  /** A pair created through the constructor passes the shared checks. */
+  @Test
+  public void testPairConstructor() {
+    Pair<String, Integer> pair = new Pair<String, Integer>("brock", 45);
+    test(pair);
+  }
+  /** A pair created through {@code Pair.of} passes the same shared checks. */
+  @Test
+  public void testPairOf() {
+    Pair<String, Integer> pair = Pair.of("brock", 45);
+    test(pair);
+  }
+  /** Shared checks for a ("brock", 45) pair: size, accessors, equality and rejection of index -1. */
+  protected void test(Pair<String, Integer> pair) {
+    assertEquals(2, pair.size());
+    assertEquals("brock", pair.first());
+    assertEquals(Integer.valueOf(45), pair.second());
+    assertEquals(Pair.of("brock", 45), pair);
+
+    assertEquals("brock", pair.get(0));
+    assertEquals(Integer.valueOf(45), pair.get(1));
+
+    try {
+      pair.get(-1);
+      fail("get(-1) should throw IndexOutOfBoundsException");
+    } catch (IndexOutOfBoundsException e) {
+      // expected
+    }
+  }
+  /** Identical pairs (even all-null ones) compare as 0; otherwise the first differing element decides. */
+  @Test
+  public void testPairComparisons() {
+    assertEquals(0, Pair.of(null, null).compareTo(Pair.of(null, null)));
+    assertEquals(0, Pair.of(1, 2).compareTo(Pair.of(1, 2)));
+    assertTrue(Pair.of(2, "a").compareTo(Pair.of(1, "a")) > 0);
+    assertTrue(Pair.of("a", 2).compareTo(Pair.of("a", 1)) > 0);
+    assertTrue(Pair.of(null, 17).compareTo(Pair.of(null, 29)) < 0);
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/TupleTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/TupleTest.java b/crunch-core/src/test/java/org/apache/crunch/TupleTest.java
new file mode 100644
index 0000000..b07ec3f
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/TupleTest.java
@@ -0,0 +1,139 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import org.apache.crunch.types.TupleFactory;
+import org.junit.Test;
+
+/** Unit tests for {@link Tuple3}, {@link Tuple4}, {@link TupleN} and {@link TupleFactory}. */
+public class TupleTest {
+  private final String first = "foo";
+  private final Integer second = 1729;
+  private final Double third = 64.2;
+  private final Boolean fourth = false;
+  private final Float fifth = 17.29f;
+  /** A constructed Tuple3 reports size 3, exposes its elements by name and index, and rejects index -1. */
+  @Test
+  public void testTuple3() {
+    Tuple3<String, Integer, Double> t = new Tuple3<String, Integer, Double>(first, second, third);
+    assertEquals(3, t.size());
+    assertEquals(first, t.first());
+    assertEquals(second, t.second());
+    assertEquals(third, t.third());
+    assertEquals(first, t.get(0));
+    assertEquals(second, t.get(1));
+    assertEquals(third, t.get(2));
+    try {
+      t.get(-1);
+      fail("get(-1) should throw IndexOutOfBoundsException");
+    } catch (IndexOutOfBoundsException e) {
+      // expected
+    }
+  }
+  /** Tuple3 equality is element-wise and copes with null elements on either side. */
+  @Test
+  public void testTuple3Equality() {
+    Tuple3<String, Integer, Double> t = new Tuple3<String, Integer, Double>(first, second, third);
+    assertTrue(t.equals(Tuple3.of(first, second, third)));
+    assertFalse(t.equals(Tuple3.of(first, null, third)));
+    assertFalse(Tuple3.of(null, null, null).equals(t));
+    assertTrue(Tuple3.of(first, null, null).equals(Tuple3.of(first, null, null)));
+  }
+  /** A constructed Tuple4 reports size 4, exposes its elements by name and index, and rejects index -1. */
+  @Test
+  public void testTuple4() {
+    Tuple4<String, Integer, Double, Boolean> t = new Tuple4<String, Integer, Double, Boolean>(first, second, third,
+        fourth);
+    assertEquals(4, t.size());
+    assertEquals(first, t.first());
+    assertEquals(second, t.second());
+    assertEquals(third, t.third());
+    assertEquals(fourth, t.fourth());
+    assertEquals(first, t.get(0));
+    assertEquals(second, t.get(1));
+    assertEquals(third, t.get(2));
+    assertEquals(fourth, t.get(3));
+    try {
+      t.get(-1);
+      fail("get(-1) should throw IndexOutOfBoundsException");
+    } catch (IndexOutOfBoundsException e) {
+      // expected
+    }
+  }
+  /** Tuple4 equality is element-wise, copes with nulls, and a Tuple4 never equals a Tuple3. */
+  @Test
+  public void testTuple4Equality() {
+    Tuple4<String, Integer, Double, Boolean> t = Tuple4.of(first, second, third, fourth);
+    assertFalse(t.equals(Tuple3.of(first, second, third)));
+    assertFalse(t.equals(Tuple4.of(first, null, third, null)));
+    assertFalse(Tuple4.of(null, null, null, null).equals(t));
+    assertTrue(Tuple4.of(first, null, third, null).equals(Tuple4.of(first, null, third, null)));
+  }
+  /** A five-element TupleN reports size 5, exposes its elements by index, and rejects index -1. */
+  @Test
+  public void testTupleN() {
+    TupleN t = new TupleN(first, second, third, fourth, fifth);
+    assertEquals(5, t.size());
+    assertEquals(first, t.get(0));
+    assertEquals(second, t.get(1));
+    assertEquals(third, t.get(2));
+    assertEquals(fourth, t.get(3));
+    assertEquals(fifth, t.get(4));
+    try {
+      t.get(-1);
+      fail("get(-1) should throw IndexOutOfBoundsException");
+    } catch (IndexOutOfBoundsException e) {
+      // expected
+    }
+  }
+  /** TupleN equality is element-wise, copes with nulls, and distinguishes tuples of different length. */
+  @Test
+  public void testTupleNEquality() {
+    TupleN t = new TupleN(first, second, third, fourth, fifth);
+    assertTrue(t.equals(new TupleN(first, second, third, fourth, fifth)));
+    assertFalse(t.equals(new TupleN(first, null, third, null)));
+    assertFalse((new TupleN(null, null, null, null, null)).equals(t));
+    assertTrue((new TupleN(first, second, third, null, null)).equals(new TupleN(first, second, third, null, null)));
+  }
+  /** Each TupleFactory constant builds its own tuple class; TUPLEN builds a TupleN for any arity tried here. */
+  @Test
+  public void testTupleFactory() {
+    checkTuple(TupleFactory.PAIR.makeTuple("a", "b"), Pair.class, "a", "b");
+    checkTuple(TupleFactory.TUPLE3.makeTuple("a", "b", "c"), Tuple3.class, "a", "b", "c");
+    checkTuple(TupleFactory.TUPLE4.makeTuple("a", "b", "c", "d"), Tuple4.class, "a", "b", "c", "d");
+    checkTuple(TupleFactory.TUPLEN.makeTuple("a", "b", "c", "d", "e"), TupleN.class, "a", "b", "c", "d", "e");
+
+    checkTuple(TupleFactory.TUPLEN.makeTuple("a", "b"), TupleN.class, "a", "b");
+    checkTuple(TupleFactory.TUPLEN.makeTuple("a", "b", "c"), TupleN.class, "a", "b", "c");
+    checkTuple(TupleFactory.TUPLEN.makeTuple("a", "b", "c", "d"), TupleN.class, "a", "b", "c", "d");
+    checkTuple(TupleFactory.TUPLEN.makeTuple("a", "b", "c", "d", "e"), TupleN.class, "a", "b", "c", "d", "e");
+  }
+  /** Asserts the tuple's exact runtime class, its size, and every element in order. */
+  private void checkTuple(Tuple t, Class<? extends Tuple> type, Object... values) {
+    assertEquals(type, t.getClass());
+    assertEquals(values.length, t.size());
+    for (int i = 0; i < values.length; i++) {
+      assertEquals("element " + i, values[i], t.get(i));
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/crunch/blob/890e0086/crunch-core/src/test/java/org/apache/crunch/WriteModeTest.java
----------------------------------------------------------------------
diff --git a/crunch-core/src/test/java/org/apache/crunch/WriteModeTest.java b/crunch-core/src/test/java/org/apache/crunch/WriteModeTest.java
new file mode 100644
index 0000000..e99ac7b
--- /dev/null
+++ b/crunch-core/src/test/java/org/apache/crunch/WriteModeTest.java
@@ -0,0 +1,103 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.crunch.Target.WriteMode;
+import org.apache.crunch.impl.mem.MemPipeline;
+import org.apache.crunch.io.To;
+import org.apache.crunch.test.TemporaryPath;
+import org.apache.crunch.test.TemporaryPaths;
+import org.apache.crunch.types.avro.Avros;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.junit.Rule;
+import org.junit.Test;
+
+import com.google.common.collect.ImmutableList;
+
+/** Checks what a second write to the same text-file target does under each {@link WriteMode}. */
+public class WriteModeTest {
+  @Rule
+  public TemporaryPath tmpDir = TemporaryPaths.create();
+  /** No explicit mode, pipeline run between the two writes: a CrunchRuntimeException is expected. */
+  @Test(expected=CrunchRuntimeException.class)
+  public void testDefault() throws Exception {
+    run(null, true);
+  }
+  /** No explicit mode, no run between the two writes: a CrunchRuntimeException is expected. */
+  @Test(expected=CrunchRuntimeException.class)
+  public void testDefaultNoRun() throws Exception {
+    run(null, false);
+  }
+  /** OVERWRITE after an intermediate run: the target holds one copy of the values. */
+  @Test
+  public void testOverwrite() throws Exception {
+    String written = run(WriteMode.OVERWRITE, true).toString();
+    PCollection<String> readBack = MemPipeline.getInstance().readTextFile(written);
+    assertEquals(ImmutableList.of("some", "string", "values"), readBack.materialize());
+  }
+  /** OVERWRITE without an intermediate run: a CrunchRuntimeException is expected. */
+  @Test(expected=CrunchRuntimeException.class)
+  public void testOverwriteNoRun() throws Exception {
+    run(WriteMode.OVERWRITE, false);
+  }
+  /** APPEND after an intermediate run: the target holds the values twice. */
+  @Test
+  public void testAppend() throws Exception {
+    String written = run(WriteMode.APPEND, true).toString();
+    PCollection<String> readBack = MemPipeline.getInstance().readTextFile(written);
+    ImmutableList<String> twice = ImmutableList.of("some", "string", "values", "some", "string", "values");
+    assertEquals(twice, readBack.materialize());
+  }
+  /** APPEND without an intermediate run: the target still holds the values twice. */
+  @Test
+  public void testAppendNoRun() throws Exception {
+    String written = run(WriteMode.APPEND, false).toString();
+    PCollection<String> readBack = MemPipeline.getInstance().readTextFile(written);
+    ImmutableList<String> twice = ImmutableList.of("some", "string", "values", "some", "string", "values");
+    assertEquals(twice, readBack.materialize());
+  }
+  /** Writes the fixture twice to one path (null mode = plain write), optionally running in between; returns the path. */
+  Path run(WriteMode writeMode, boolean doRun) throws Exception {
+    Path target = tmpDir.getPath("existing");
+    FileSystem fileSystem = FileSystem.get(tmpDir.getDefaultConfiguration());
+    if (fileSystem.exists(target)) {
+      fileSystem.delete(target, true);
+    }
+    Pipeline pipeline = MemPipeline.getInstance();
+    ImmutableList<String> values = ImmutableList.of("some", "string", "values");
+    PCollection<String> source = MemPipeline.typedCollectionOf(Avros.strings(), values);
+
+    // First write to the target, optionally followed by a pipeline run.
+    source.write(To.textFile(target));
+    if (doRun) {
+      pipeline.run();
+    }
+
+    // Second write to the same target, using the mode under test when one is given.
+    if (writeMode != null) {
+      source.write(To.textFile(target), writeMode);
+    } else {
+      source.write(To.textFile(target));
+    }
+    pipeline.run();
+    return target;
+  }
+}