You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@crunch.apache.org by jw...@apache.org on 2012/08/08 02:47:05 UTC
[2/10] git commit: CRUNCH-32: Clean up namespaces.
CRUNCH-32: Clean up namespaces.
Rename Maven module "scrunch" to "crunch-scrunch".
Change artifactId of "scrunch" to "crunch-scrunch".
Move package "org.apache.scrunch" to "org.apache.crunch.scrunch".
Rename Maven module "examples" to "crunch-examples".
Signed-off-by: jwills <jw...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/incubator-crunch/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-crunch/commit/dfd28922
Tree: http://git-wip-us.apache.org/repos/asf/incubator-crunch/tree/dfd28922
Diff: http://git-wip-us.apache.org/repos/asf/incubator-crunch/diff/dfd28922
Branch: refs/heads/master
Commit: dfd28922d6c5740db3d5170d94db546d22077ba4
Parents: a92a523
Author: Matthias Friedrich <ma...@mafr.de>
Authored: Sun Aug 5 20:03:36 2012 +0200
Committer: jwills <jw...@apache.org>
Committed: Tue Aug 7 09:17:40 2012 -0700
----------------------------------------------------------------------
crunch-examples/pom.xml | 78 +
crunch-examples/src/main/assembly/hadoop-job.xml | 41 +
.../apache/crunch/examples/AverageBytesByIP.java | 134 +
.../org/apache/crunch/examples/TotalBytesByIP.java | 106 +
.../java/org/apache/crunch/examples/WordCount.java | 76 +
crunch-examples/src/site/markdown/index.md | 20 +
crunch-examples/src/site/site.xml | 34 +
crunch-scrunch/build.sbt | 51 +
crunch-scrunch/pom.xml | 199 +
crunch-scrunch/src/it/resources/log4j.properties | 28 +
crunch-scrunch/src/it/resources/maugham.txt |29112 +++++++++++++++
crunch-scrunch/src/it/resources/shakes.txt | 3667 ++
crunch-scrunch/src/it/resources/urls.txt | 11 +
.../org/apache/crunch/scrunch/CogroupTest.scala | 42 +
.../scala/org/apache/crunch/scrunch/JoinTest.scala | 45 +
.../apache/crunch/scrunch/PageRankClassTest.scala | 119 +
.../org/apache/crunch/scrunch/PageRankTest.scala | 105 +
.../apache/crunch/scrunch/PipelineAppTest.scala | 49 +
.../scala/org/apache/crunch/scrunch/TopTest.scala | 42 +
.../org/apache/crunch/scrunch/UnionTest.scala | 52 +
.../org/apache/crunch/scrunch/WordCountTest.scala | 42 +
.../scrunch/interpreter/InterpreterJarTest.scala | 69 +
crunch-scrunch/src/main/assembly/release.xml | 93 +
crunch-scrunch/src/main/conf/log4j.properties | 24 +
.../src/main/examples/ClassyPageRank.scala | 71 +
crunch-scrunch/src/main/examples/PageRank.scala | 61 +
crunch-scrunch/src/main/examples/WordCount.scala | 27 +
.../crunch/scrunch/ScalaReflectDataFactory.java | 41 +
.../crunch/scrunch/ScalaSafeReflectData.java | 292 +
.../scrunch/ScalaSafeReflectDatumReader.java | 122 +
.../scrunch/ScalaSafeReflectDatumWriter.java | 68 +
.../org/apache/crunch/scrunch/Conversions.scala | 147 +
.../apache/crunch/scrunch/EmbeddedPipeline.scala | 47 +
.../crunch/scrunch/EmbeddedPipelineLike.scala | 127 +
.../main/scala/org/apache/crunch/scrunch/IO.scala | 50 +
.../main/scala/org/apache/crunch/scrunch/Mem.scala | 88 +
.../org/apache/crunch/scrunch/PCollection.scala | 118 +
.../apache/crunch/scrunch/PCollectionLike.scala | 48 +
.../org/apache/crunch/scrunch/PGroupedTable.scala | 92 +
.../scala/org/apache/crunch/scrunch/PTable.scala | 191 +
.../org/apache/crunch/scrunch/PTypeFamily.scala | 127 +
.../scala/org/apache/crunch/scrunch/Pipeline.scala | 164 +
.../org/apache/crunch/scrunch/PipelineApp.scala | 64 +
.../org/apache/crunch/scrunch/PipelineHelper.scala | 74 +
.../org/apache/crunch/scrunch/PipelineLike.scala | 91 +
.../scrunch/interpreter/InterpreterRunner.scala | 208 +
crunch-scrunch/src/main/scripts/imports.scala | 19 +
crunch-scrunch/src/main/scripts/scrunch | 163 +
crunch-scrunch/src/main/scripts/scrunch-job.py | 133 +
crunch-scrunch/src/site/markdown/index.md | 20 +
crunch-scrunch/src/site/site.xml | 34 +
examples/pom.xml | 78 -
examples/src/main/assembly/hadoop-job.xml | 41 -
.../apache/crunch/examples/AverageBytesByIP.java | 134 -
.../org/apache/crunch/examples/TotalBytesByIP.java | 106 -
.../java/org/apache/crunch/examples/WordCount.java | 76 -
examples/src/site/markdown/index.md | 20 -
examples/src/site/site.xml | 34 -
pom.xml | 4 +-
scrunch/build.sbt | 51 -
scrunch/pom.xml | 199 -
scrunch/src/it/resources/log4j.properties | 29 -
scrunch/src/it/resources/maugham.txt |29112 ---------------
scrunch/src/it/resources/shakes.txt | 3667 --
scrunch/src/it/resources/urls.txt | 11 -
.../it/scala/org/apache/scrunch/CogroupTest.scala | 42 -
.../src/it/scala/org/apache/scrunch/JoinTest.scala | 45 -
.../org/apache/scrunch/PageRankClassTest.scala | 119 -
.../it/scala/org/apache/scrunch/PageRankTest.scala | 105 -
.../scala/org/apache/scrunch/PipelineAppTest.scala | 49 -
.../src/it/scala/org/apache/scrunch/TopTest.scala | 42 -
.../it/scala/org/apache/scrunch/UnionTest.scala | 52 -
.../scala/org/apache/scrunch/WordCountTest.scala | 42 -
.../scrunch/interpreter/InterpreterJarTest.scala | 69 -
scrunch/src/main/assembly/release.xml | 93 -
scrunch/src/main/conf/log4j.properties | 24 -
scrunch/src/main/examples/ClassyPageRank.scala | 71 -
scrunch/src/main/examples/PageRank.scala | 61 -
scrunch/src/main/examples/WordCount.scala | 27 -
.../apache/scrunch/ScalaReflectDataFactory.java | 41 -
.../org/apache/scrunch/ScalaSafeReflectData.java | 292 -
.../scrunch/ScalaSafeReflectDatumReader.java | 122 -
.../scrunch/ScalaSafeReflectDatumWriter.java | 68 -
.../scala/org/apache/scrunch/Conversions.scala | 147 -
.../org/apache/scrunch/EmbeddedPipeline.scala | 47 -
.../org/apache/scrunch/EmbeddedPipelineLike.scala | 127 -
scrunch/src/main/scala/org/apache/scrunch/IO.scala | 50 -
.../src/main/scala/org/apache/scrunch/Mem.scala | 88 -
.../scala/org/apache/scrunch/PCollection.scala | 118 -
.../scala/org/apache/scrunch/PCollectionLike.scala | 48 -
.../scala/org/apache/scrunch/PGroupedTable.scala | 92 -
.../src/main/scala/org/apache/scrunch/PTable.scala | 191 -
.../scala/org/apache/scrunch/PTypeFamily.scala | 127 -
.../main/scala/org/apache/scrunch/Pipeline.scala | 164 -
.../scala/org/apache/scrunch/PipelineApp.scala | 64 -
.../scala/org/apache/scrunch/PipelineHelper.scala | 74 -
.../scala/org/apache/scrunch/PipelineLike.scala | 91 -
.../scrunch/interpreter/InterpreterRunner.scala | 208 -
scrunch/src/main/scripts/imports.scala | 19 -
scrunch/src/main/scripts/scrunch | 163 -
scrunch/src/main/scripts/scrunch-job.py | 133 -
scrunch/src/site/markdown/index.md | 20 -
scrunch/src/site/site.xml | 34 -
src/site/markdown/scrunch.md | 7 +-
104 files changed, 36931 insertions(+), 36933 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-examples/pom.xml
----------------------------------------------------------------------
diff --git a/crunch-examples/pom.xml b/crunch-examples/pom.xml
new file mode 100644
index 0000000..4b3113f
--- /dev/null
+++ b/crunch-examples/pom.xml
@@ -0,0 +1,78 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>org.apache.crunch</groupId>
+ <artifactId>crunch-parent</artifactId>
+ <version>0.3.0-incubating-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>crunch-examples</artifactId>
+ <name>Crunch Examples</name>
+
+ <dependencies>
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-client</artifactId>
+ <scope>provided</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.crunch</groupId>
+ <artifactId>crunch</artifactId>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <configuration>
+ <descriptors>
+ <descriptor>src/main/assembly/hadoop-job.xml</descriptor>
+ </descriptors>
+ <archive>
+ <manifest>
+ <mainClass>org.apache.crunch.examples.WordCount</mainClass>
+ </manifest>
+ </archive>
+ </configuration>
+ <executions>
+ <execution>
+ <id>make-assembly</id>
+ <phase>package</phase>
+ <goals>
+ <goal>single</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+
+</project>
http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-examples/src/main/assembly/hadoop-job.xml
----------------------------------------------------------------------
diff --git a/crunch-examples/src/main/assembly/hadoop-job.xml b/crunch-examples/src/main/assembly/hadoop-job.xml
new file mode 100644
index 0000000..366bb33
--- /dev/null
+++ b/crunch-examples/src/main/assembly/hadoop-job.xml
@@ -0,0 +1,41 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<assembly>
+ <id>job</id>
+ <formats>
+ <format>jar</format>
+ </formats>
+ <includeBaseDirectory>false</includeBaseDirectory>
+ <dependencySets>
+ <dependencySet>
+ <unpack>false</unpack>
+ <scope>runtime</scope>
+ <outputDirectory>lib</outputDirectory>
+ <excludes>
+ <exclude>${groupId}:${artifactId}</exclude>
+ </excludes>
+ </dependencySet>
+ <dependencySet>
+ <unpack>true</unpack>
+ <includes>
+ <include>${groupId}:${artifactId}</include>
+ </includes>
+ </dependencySet>
+ </dependencySets>
+</assembly>
http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-examples/src/main/java/org/apache/crunch/examples/AverageBytesByIP.java
----------------------------------------------------------------------
diff --git a/crunch-examples/src/main/java/org/apache/crunch/examples/AverageBytesByIP.java b/crunch-examples/src/main/java/org/apache/crunch/examples/AverageBytesByIP.java
new file mode 100644
index 0000000..868e38a
--- /dev/null
+++ b/crunch-examples/src/main/java/org/apache/crunch/examples/AverageBytesByIP.java
@@ -0,0 +1,134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.examples;
+
+import java.io.Serializable;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.crunch.CombineFn;
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.types.writable.Writables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.util.GenericOptionsParser;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+@SuppressWarnings("serial")
+public class AverageBytesByIP extends Configured implements Tool, Serializable {
+  /** Job counters for input lines that did not yield a record. */
+  static enum COUNTERS {
+    NO_MATCH,
+    CORRUPT_SIZE
+  }
+
+  /** Pattern applied to every input line; group 1 is used as the ip and group 7 as the response size. */
+  static final String logRegex = "^([\\d.]+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\d+) \"([^\"]+)\" \"([^\"]+)\"";
+
+  /**
+   * Builds and runs the pipeline that writes the average response size per ip address.
+   *
+   * @param args the input path followed by the output path
+   * @return 1 if the number of arguments is not exactly two, otherwise 0 once the pipeline is done
+   */
+  public int run(String[] args) throws Exception {
+    if (args.length != 2) {
+      System.err.println();
+      System.err.println("Two and only two arguments are accepted.");
+      System.err.println("Usage: " + this.getClass().getName() + " [generic options] input output");
+      System.err.println();
+      GenericOptionsParser.printGenericCommandUsage(System.err);
+      return 1;
+    }
+
+    // The pipeline object coordinates the creation and execution of the job.
+    Pipeline pipeline = new MRPipeline(AverageBytesByIP.class, getConf());
+    // Every line of the input text file becomes one String element.
+    PCollection<String> lines = pipeline.readTextFile(args[0]);
+
+    // Sums the response sizes and the counts independently of each other.
+    CombineFn<String, Pair<Long, Long>> sumSizeAndCount = CombineFn.pairAggregator(CombineFn.SUM_LONGS,
+        CombineFn.SUM_LONGS);
+
+    // One (ip, (response size, 1)) record per parsable log line.
+    PTable<String, Pair<Long, Long>> sizeAndOnePerLine = lines.parallelDo(extractResponseSize,
+        Writables.tableOf(Writables.strings(), Writables.pairs(Writables.longs(), Writables.longs())));
+
+    // Table of (ip, (sum(response size), count)).
+    PTable<String, Pair<Long, Long>> remoteAddrResponseSize = sizeAndOnePerLine.groupByKey().combineValues(
+        sumSizeAndCount);
+
+    // Table of (ip, average response size).
+    PTable<String, Double> avgs = remoteAddrResponseSize.parallelDo(calulateAverage,
+        Writables.tableOf(Writables.strings(), Writables.doubles()));
+
+    // Register the text file output, then execute the pipeline as a MapReduce.
+    pipeline.writeTextFile(avgs, args[1]);
+    pipeline.done();
+    return 0;
+  }
+
+  // Turns the summed response size and the count of an ip address into the
+  // average response size. A count that is not positive yields an average of 0.
+  //
+  // Input: (ip, (sum(response size), count))
+  // Output: (ip, average response size)
+  MapFn<Pair<String, Pair<Long, Long>>, Pair<String, Double>> calulateAverage = new MapFn<Pair<String, Pair<Long, Long>>, Pair<String, Double>>() {
+    @Override
+    public Pair<String, Double> map(Pair<String, Pair<Long, Long>> arg) {
+      Pair<Long, Long> totals = arg.second();
+      if (totals.second() <= 0) {
+        return Pair.of(arg.first(), 0.0);
+      }
+      return Pair.of(arg.first(), (double) totals.first() / (double) totals.second());
+    }
+  };
+
+  // Parses apache log records.
+  // For every line matching logRegex, emits the ip address together with the
+  // response size and a count of 1, so that a combiner can sum both values.
+  // Lines that do not match, or whose size cannot be parsed, are only counted.
+  //
+  // Input: 55.1.3.2 ...... 200 512 ....
+  // Output: (55.1.3.2, (512, 1))
+  DoFn<String, Pair<String, Pair<Long, Long>>> extractResponseSize = new DoFn<String, Pair<String, Pair<Long, Long>>>() {
+    // Not serialized with the function; compiled again in initialize().
+    transient Pattern pattern;
+
+    public void initialize() {
+      pattern = Pattern.compile(logRegex);
+    }
+
+    public void process(String line, Emitter<Pair<String, Pair<Long, Long>>> emitter) {
+      Matcher fields = pattern.matcher(line);
+      if (!fields.matches()) {
+        this.getCounter(COUNTERS.NO_MATCH).increment(1);
+        return;
+      }
+      try {
+        Long bytes = Long.parseLong(fields.group(7));
+        String ip = fields.group(1);
+        emitter.emit(Pair.of(ip, Pair.of(bytes, 1L)));
+      } catch (NumberFormatException e) {
+        this.getCounter(COUNTERS.CORRUPT_SIZE).increment(1);
+      }
+    }
+  };
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new Configuration(), new AverageBytesByIP(), args);
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-examples/src/main/java/org/apache/crunch/examples/TotalBytesByIP.java
----------------------------------------------------------------------
diff --git a/crunch-examples/src/main/java/org/apache/crunch/examples/TotalBytesByIP.java b/crunch-examples/src/main/java/org/apache/crunch/examples/TotalBytesByIP.java
new file mode 100644
index 0000000..1953e3a
--- /dev/null
+++ b/crunch-examples/src/main/java/org/apache/crunch/examples/TotalBytesByIP.java
@@ -0,0 +1,106 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.examples;
+
+import java.io.Serializable;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.crunch.CombineFn;
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.types.writable.Writables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.util.GenericOptionsParser;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+/**
+ * Example Crunch job that reads apache log lines from a text file and writes
+ * the total response size per ip address to a text file.
+ */
+@SuppressWarnings("serial")
+public class TotalBytesByIP extends Configured implements Tool, Serializable {
+  /** Job counters for input lines that did not yield a record. */
+  static enum COUNTERS {
+    NO_MATCH,
+    CORRUPT_SIZE
+  }
+
+  /** Pattern applied to every input line; group 1 is used as the ip and group 7 as the response size. */
+  static final String logRegex = "^([\\d.]+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\d+) \"([^\"]+)\" \"([^\"]+)\"";
+
+  /**
+   * Builds and runs the pipeline that sums the response sizes per ip address.
+   *
+   * @param args the input path followed by the output path
+   * @return 1 if the number of arguments is not exactly two, otherwise 0 once the pipeline is done
+   */
+  public int run(String[] args) throws Exception {
+    if (args.length != 2) {
+      System.err.println();
+      System.err.println("Two and only two arguments are accepted.");
+      System.err.println("Usage: " + this.getClass().getName() + " [generic options] input output");
+      System.err.println();
+      GenericOptionsParser.printGenericCommandUsage(System.err);
+      return 1;
+    }
+    // Create an object to coordinate pipeline creation and execution.
+    Pipeline pipeline = new MRPipeline(TotalBytesByIP.class, getConf());
+    // Reference a given text file as a collection of Strings.
+    PCollection<String> lines = pipeline.readTextFile(args[0]);
+
+    // Combiner used for summing up response size
+    CombineFn<String, Long> longSumCombiner = CombineFn.SUM_LONGS();
+
+    // Table of (ip, sum(response size))
+    PTable<String, Long> ipAddrResponseSize = lines
+        .parallelDo(extractIPResponseSize, Writables.tableOf(Writables.strings(), Writables.longs())).groupByKey()
+        .combineValues(longSumCombiner);
+
+    pipeline.writeTextFile(ipAddrResponseSize, args[1]);
+    // Execute the pipeline as a MapReduce.
+    pipeline.done();
+    return 0;
+  }
+
+  // Function to parse apache log records
+  // Given a standard apache log line, extract the ip address and
+  // response size. Outputs the ip and response size. Lines that cannot be
+  // used are not emitted but recorded in the COUNTERS job counters.
+  //
+  // Input: 55.1.3.2 ...... 200 512 ....
+  // Output: (55.1.3.2, 512)
+  DoFn<String, Pair<String, Long>> extractIPResponseSize = new DoFn<String, Pair<String, Long>>() {
+    // Not serialized with the function; compiled again in initialize().
+    transient Pattern pattern;
+
+    public void initialize() {
+      pattern = Pattern.compile(logRegex);
+    }
+
+    public void process(String line, Emitter<Pair<String, Long>> emitter) {
+      Matcher matcher = pattern.matcher(line);
+      if (matcher.matches()) {
+        try {
+          Long responseSize = Long.parseLong(matcher.group(7));
+          String remoteAddr = matcher.group(1);
+          emitter.emit(Pair.of(remoteAddr, responseSize));
+        } catch (NumberFormatException e) {
+          // Group 7 only matches digits, so this is a size too large for a
+          // long. Count the line instead of dropping it without a trace.
+          this.getCounter(COUNTERS.CORRUPT_SIZE).increment(1);
+        }
+      } else {
+        // Not a log line in the expected format; count it as well.
+        this.getCounter(COUNTERS.NO_MATCH).increment(1);
+      }
+    }
+  };
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new Configuration(), new TotalBytesByIP(), args);
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-examples/src/main/java/org/apache/crunch/examples/WordCount.java
----------------------------------------------------------------------
diff --git a/crunch-examples/src/main/java/org/apache/crunch/examples/WordCount.java b/crunch-examples/src/main/java/org/apache/crunch/examples/WordCount.java
new file mode 100644
index 0000000..e4ce25b
--- /dev/null
+++ b/crunch-examples/src/main/java/org/apache/crunch/examples/WordCount.java
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.examples;
+
+import java.io.Serializable;
+
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.types.writable.Writables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.util.GenericOptionsParser;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+public class WordCount extends Configured implements Tool, Serializable {
+ public int run(String[] args) throws Exception {
+ if (args.length != 3) {
+ System.err.println();
+ System.err.println("Usage: " + this.getClass().getName() + " [generic options] input output");
+ System.err.println();
+ GenericOptionsParser.printGenericCommandUsage(System.err);
+ return 1;
+ }
+ // Create an object to coordinate pipeline creation and execution.
+ Pipeline pipeline = new MRPipeline(WordCount.class, getConf());
+ // Reference a given text file as a collection of Strings.
+ PCollection<String> lines = pipeline.readTextFile(args[1]);
+
+ // Define a function that splits each line in a PCollection of Strings into
+ // a
+ // PCollection made up of the individual words in the file.
+ PCollection<String> words = lines.parallelDo(new DoFn<String, String>() {
+ public void process(String line, Emitter<String> emitter) {
+ for (String word : line.split("\\s+")) {
+ emitter.emit(word);
+ }
+ }
+ }, Writables.strings()); // Indicates the serialization format
+
+ // The count method applies a series of Crunch primitives and returns
+ // a map of the unique words in the input PCollection to their counts.
+ // Best of all, the count() function doesn't need to know anything about
+ // the kind of data stored in the input PCollection.
+ PTable<String, Long> counts = words.count();
+
+ // Instruct the pipeline to write the resulting counts to a text file.
+ pipeline.writeTextFile(counts, args[2]);
+ // Execute the pipeline as a MapReduce.
+ pipeline.done();
+ return 0;
+ }
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new Configuration(), new WordCount(), args);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-examples/src/site/markdown/index.md
----------------------------------------------------------------------
diff --git a/crunch-examples/src/site/markdown/index.md b/crunch-examples/src/site/markdown/index.md
new file mode 100644
index 0000000..838e3ae
--- /dev/null
+++ b/crunch-examples/src/site/markdown/index.md
@@ -0,0 +1,20 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+# Apache Crunch - Examples
+---
http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-examples/src/site/site.xml
----------------------------------------------------------------------
diff --git a/crunch-examples/src/site/site.xml b/crunch-examples/src/site/site.xml
new file mode 100644
index 0000000..73fbd17
--- /dev/null
+++ b/crunch-examples/src/site/site.xml
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="${project.name}"
+ xmlns="http://maven.apache.org/DECORATION/1.3.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/DECORATION/1.3.0
+ http://maven.apache.org/xsd/decoration-1.3.0.xsd">
+
+ <body>
+ <!-- Note: Breadcrumbs for Doxia's Markdown parser are currently broken,
+ see https://jira.codehaus.org/browse/DOXIA-472 -->
+ <breadcrumbs>
+ <item name="Apache" href="http://www.apache.org/index.html" />
+ <item name="Crunch" href="../index.html"/>
+ </breadcrumbs>
+
+ </body>
+
+</project>
http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-scrunch/build.sbt
----------------------------------------------------------------------
diff --git a/crunch-scrunch/build.sbt b/crunch-scrunch/build.sbt
new file mode 100644
index 0000000..7d4a6f8
--- /dev/null
+++ b/crunch-scrunch/build.sbt
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+name := "scrunch"
+
+version := "0.2.0"
+
+scalaVersion := "2.9.2"
+
+resolvers ++= Seq(
+ "Local Maven Repository" at "file://"+Path.userHome.absolutePath+"/.m2/repository",
+ "Hadoop Releases" at "https://repository.cloudera.com/content/repositories/releases/"
+)
+
+libraryDependencies ++= Seq(
+ "org.apache.crunch" % "crunch" % "0.3.0" excludeAll(
+ ExclusionRule(organization = "com.sun.jdmk"),
+ ExclusionRule(organization = "com.sun.jmx"),
+ ExclusionRule(organization = "javax.jms")
+ ),
+ "org.apache.hadoop" % "hadoop-client" % "0.20.2-cdh3u4" % "provided" excludeAll(
+ ExclusionRule(organization = "com.sun.jdmk"),
+ ExclusionRule(organization = "com.sun.jmx"),
+ ExclusionRule(organization = "javax.jms")
+ ),
+ "org.apache.hbase" % "hbase" % "0.90.6-cdh3u4" % "provided" excludeAll(
+ ExclusionRule(organization = "org.apache.hadoop"),
+ ExclusionRule(organization = "commons-logging"),
+ ExclusionRule(organization = "com.google.guava"),
+ ExclusionRule(organization = "log4j"),
+ ExclusionRule(organization = "org.slf4j")
+ ),
+ "junit" % "junit" % "4.8.1" % "test",
+ "org.scalatest" % "scalatest_2.9.2" % "1.7.2" % "test"
+)
+
+parallelExecution in Test := false
http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-scrunch/pom.xml
----------------------------------------------------------------------
diff --git a/crunch-scrunch/pom.xml b/crunch-scrunch/pom.xml
new file mode 100644
index 0000000..0e3eac8
--- /dev/null
+++ b/crunch-scrunch/pom.xml
@@ -0,0 +1,199 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>org.apache.crunch</groupId>
+ <artifactId>crunch-parent</artifactId>
+ <version>0.3.0-incubating-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>crunch-scrunch</artifactId>
+ <name>Crunch for Scala</name>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.scala-lang</groupId>
+ <artifactId>scala-library</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.scala-lang</groupId>
+ <artifactId>scala-compiler</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.scala-lang</groupId>
+ <artifactId>jline</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.crunch</groupId>
+ <artifactId>crunch</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-client</artifactId>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hbase</groupId>
+ <artifactId>hbase</artifactId>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.scalatest</groupId>
+ <artifactId>scalatest_${scala.version}</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.crunch</groupId>
+ <artifactId>crunch-test</artifactId>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>jar-with-dependencies</id>
+ <phase>package</phase>
+ <goals>
+ <goal>single</goal>
+ </goals>
+ <configuration>
+ <descriptorRefs>
+ <descriptorRef>jar-with-dependencies</descriptorRef>
+ </descriptorRefs>
+ </configuration>
+ </execution>
+ <execution>
+ <id>make-assembly</id>
+ <phase>package</phase>
+ <goals>
+ <goal>single</goal>
+ </goals>
+ <configuration>
+ <descriptors>
+ <descriptor>${basedir}/src/main/assembly/release.xml</descriptor>
+ </descriptors>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.scala-tools</groupId>
+ <artifactId>maven-scala-plugin</artifactId>
+ <executions>
+ <execution>
+ <goals>
+ <goal>compile</goal>
+ <goal>testCompile</goal>
+ </goals>
+ <configuration>
+ <args>
+ <arg>-deprecation</arg>
+ <arg>-dependencyfile</arg>
+ <arg>${project.build.directory}/.scala_dependencies</arg>
+ </args>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <configuration>
+ <useFile>false</useFile>
+ <disableXmlReport>true</disableXmlReport>
+ <includes>
+ <include>${project.build.testSourceDirectory}/**/*Test.*</include>
+ <include>${project.build.testSourceDirectory}/**/*Suite.*</include>
+ </includes>
+ </configuration>
+ </plugin>
+ <!-- We put slow-running tests into src/it and run them during the
+ integration-test phase using the failsafe plugin. This way
+ developers can run unit tests conveniently from the IDE or via
+ "mvn package" from the command line without triggering time
+ consuming integration tests. -->
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>build-helper-maven-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>add-test-source</id>
+ <phase>validate</phase>
+ <goals>
+ <goal>add-test-source</goal>
+ </goals>
+ <configuration>
+ <sources>
+ <source>${basedir}/src/it/java</source>
+ <source>${basedir}/src/it/scala</source>
+ </sources>
+ </configuration>
+ </execution>
+ <execution>
+ <id>add-test-resource</id>
+ <phase>validate</phase>
+ <goals>
+ <goal>add-test-resource</goal>
+ </goals>
+ <configuration>
+ <resources>
+ <resource>
+ <directory>${basedir}/src/it/resources</directory>
+ </resource>
+ </resources>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ <configuration>
+ <testSourceDirectory>${basedir}/src/it/scala</testSourceDirectory>
+ <useFile>false</useFile>
+ <disableXmlReport>true</disableXmlReport>
+ <includes>
+ <include>**/*Test.*</include>
+ <include>**/*Suite.*</include>
+ </includes>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>integration-test</goal>
+ <goal>verify</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+</project>
http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-scrunch/src/it/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/it/resources/log4j.properties b/crunch-scrunch/src/it/resources/log4j.properties
new file mode 100644
index 0000000..c7847f8
--- /dev/null
+++ b/crunch-scrunch/src/it/resources/log4j.properties
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ***** Set the org.apache.crunch logger level to INFO and attach appender A.
+log4j.logger.org.apache.crunch=info, A
+
+# Log warnings on Hadoop for the local runner when testing
+log4j.logger.org.apache.hadoop=warn, A
+
+# ***** A is set to be a ConsoleAppender.
+log4j.appender.A=org.apache.log4j.ConsoleAppender
+# ***** A uses PatternLayout.
+log4j.appender.A.layout=org.apache.log4j.PatternLayout
+log4j.appender.A.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
+