You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@crunch.apache.org by jw...@apache.org on 2012/08/08 02:47:05 UTC

[2/10] git commit: CRUNCH-32: Clean up namespaces.

CRUNCH-32: Clean up namespaces.

Rename Maven module "scrunch" to "crunch-scrunch".
Change artifactId of "scrunch" to "crunch-scrunch".
Move package "org.apache.scrunch" to "org.apache.crunch.scrunch".
Rename Maven module "examples" to "crunch-examples".

Signed-off-by: jwills <jw...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/incubator-crunch/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-crunch/commit/dfd28922
Tree: http://git-wip-us.apache.org/repos/asf/incubator-crunch/tree/dfd28922
Diff: http://git-wip-us.apache.org/repos/asf/incubator-crunch/diff/dfd28922

Branch: refs/heads/master
Commit: dfd28922d6c5740db3d5170d94db546d22077ba4
Parents: a92a523
Author: Matthias Friedrich <ma...@mafr.de>
Authored: Sun Aug 5 20:03:36 2012 +0200
Committer: jwills <jw...@apache.org>
Committed: Tue Aug 7 09:17:40 2012 -0700

----------------------------------------------------------------------
 crunch-examples/pom.xml                            |   78 +
 crunch-examples/src/main/assembly/hadoop-job.xml   |   41 +
 .../apache/crunch/examples/AverageBytesByIP.java   |  134 +
 .../org/apache/crunch/examples/TotalBytesByIP.java |  106 +
 .../java/org/apache/crunch/examples/WordCount.java |   76 +
 crunch-examples/src/site/markdown/index.md         |   20 +
 crunch-examples/src/site/site.xml                  |   34 +
 crunch-scrunch/build.sbt                           |   51 +
 crunch-scrunch/pom.xml                             |  199 +
 crunch-scrunch/src/it/resources/log4j.properties   |   28 +
 crunch-scrunch/src/it/resources/maugham.txt        |29112 +++++++++++++++
 crunch-scrunch/src/it/resources/shakes.txt         | 3667 ++
 crunch-scrunch/src/it/resources/urls.txt           |   11 +
 .../org/apache/crunch/scrunch/CogroupTest.scala    |   42 +
 .../scala/org/apache/crunch/scrunch/JoinTest.scala |   45 +
 .../apache/crunch/scrunch/PageRankClassTest.scala  |  119 +
 .../org/apache/crunch/scrunch/PageRankTest.scala   |  105 +
 .../apache/crunch/scrunch/PipelineAppTest.scala    |   49 +
 .../scala/org/apache/crunch/scrunch/TopTest.scala  |   42 +
 .../org/apache/crunch/scrunch/UnionTest.scala      |   52 +
 .../org/apache/crunch/scrunch/WordCountTest.scala  |   42 +
 .../scrunch/interpreter/InterpreterJarTest.scala   |   69 +
 crunch-scrunch/src/main/assembly/release.xml       |   93 +
 crunch-scrunch/src/main/conf/log4j.properties      |   24 +
 .../src/main/examples/ClassyPageRank.scala         |   71 +
 crunch-scrunch/src/main/examples/PageRank.scala    |   61 +
 crunch-scrunch/src/main/examples/WordCount.scala   |   27 +
 .../crunch/scrunch/ScalaReflectDataFactory.java    |   41 +
 .../crunch/scrunch/ScalaSafeReflectData.java       |  292 +
 .../scrunch/ScalaSafeReflectDatumReader.java       |  122 +
 .../scrunch/ScalaSafeReflectDatumWriter.java       |   68 +
 .../org/apache/crunch/scrunch/Conversions.scala    |  147 +
 .../apache/crunch/scrunch/EmbeddedPipeline.scala   |   47 +
 .../crunch/scrunch/EmbeddedPipelineLike.scala      |  127 +
 .../main/scala/org/apache/crunch/scrunch/IO.scala  |   50 +
 .../main/scala/org/apache/crunch/scrunch/Mem.scala |   88 +
 .../org/apache/crunch/scrunch/PCollection.scala    |  118 +
 .../apache/crunch/scrunch/PCollectionLike.scala    |   48 +
 .../org/apache/crunch/scrunch/PGroupedTable.scala  |   92 +
 .../scala/org/apache/crunch/scrunch/PTable.scala   |  191 +
 .../org/apache/crunch/scrunch/PTypeFamily.scala    |  127 +
 .../scala/org/apache/crunch/scrunch/Pipeline.scala |  164 +
 .../org/apache/crunch/scrunch/PipelineApp.scala    |   64 +
 .../org/apache/crunch/scrunch/PipelineHelper.scala |   74 +
 .../org/apache/crunch/scrunch/PipelineLike.scala   |   91 +
 .../scrunch/interpreter/InterpreterRunner.scala    |  208 +
 crunch-scrunch/src/main/scripts/imports.scala      |   19 +
 crunch-scrunch/src/main/scripts/scrunch            |  163 +
 crunch-scrunch/src/main/scripts/scrunch-job.py     |  133 +
 crunch-scrunch/src/site/markdown/index.md          |   20 +
 crunch-scrunch/src/site/site.xml                   |   34 +
 examples/pom.xml                                   |   78 -
 examples/src/main/assembly/hadoop-job.xml          |   41 -
 .../apache/crunch/examples/AverageBytesByIP.java   |  134 -
 .../org/apache/crunch/examples/TotalBytesByIP.java |  106 -
 .../java/org/apache/crunch/examples/WordCount.java |   76 -
 examples/src/site/markdown/index.md                |   20 -
 examples/src/site/site.xml                         |   34 -
 pom.xml                                            |    4 +-
 scrunch/build.sbt                                  |   51 -
 scrunch/pom.xml                                    |  199 -
 scrunch/src/it/resources/log4j.properties          |   29 -
 scrunch/src/it/resources/maugham.txt               |29112 ---------------
 scrunch/src/it/resources/shakes.txt                | 3667 --
 scrunch/src/it/resources/urls.txt                  |   11 -
 .../it/scala/org/apache/scrunch/CogroupTest.scala  |   42 -
 .../src/it/scala/org/apache/scrunch/JoinTest.scala |   45 -
 .../org/apache/scrunch/PageRankClassTest.scala     |  119 -
 .../it/scala/org/apache/scrunch/PageRankTest.scala |  105 -
 .../scala/org/apache/scrunch/PipelineAppTest.scala |   49 -
 .../src/it/scala/org/apache/scrunch/TopTest.scala  |   42 -
 .../it/scala/org/apache/scrunch/UnionTest.scala    |   52 -
 .../scala/org/apache/scrunch/WordCountTest.scala   |   42 -
 .../scrunch/interpreter/InterpreterJarTest.scala   |   69 -
 scrunch/src/main/assembly/release.xml              |   93 -
 scrunch/src/main/conf/log4j.properties             |   24 -
 scrunch/src/main/examples/ClassyPageRank.scala     |   71 -
 scrunch/src/main/examples/PageRank.scala           |   61 -
 scrunch/src/main/examples/WordCount.scala          |   27 -
 .../apache/scrunch/ScalaReflectDataFactory.java    |   41 -
 .../org/apache/scrunch/ScalaSafeReflectData.java   |  292 -
 .../scrunch/ScalaSafeReflectDatumReader.java       |  122 -
 .../scrunch/ScalaSafeReflectDatumWriter.java       |   68 -
 .../scala/org/apache/scrunch/Conversions.scala     |  147 -
 .../org/apache/scrunch/EmbeddedPipeline.scala      |   47 -
 .../org/apache/scrunch/EmbeddedPipelineLike.scala  |  127 -
 scrunch/src/main/scala/org/apache/scrunch/IO.scala |   50 -
 .../src/main/scala/org/apache/scrunch/Mem.scala    |   88 -
 .../scala/org/apache/scrunch/PCollection.scala     |  118 -
 .../scala/org/apache/scrunch/PCollectionLike.scala |   48 -
 .../scala/org/apache/scrunch/PGroupedTable.scala   |   92 -
 .../src/main/scala/org/apache/scrunch/PTable.scala |  191 -
 .../scala/org/apache/scrunch/PTypeFamily.scala     |  127 -
 .../main/scala/org/apache/scrunch/Pipeline.scala   |  164 -
 .../scala/org/apache/scrunch/PipelineApp.scala     |   64 -
 .../scala/org/apache/scrunch/PipelineHelper.scala  |   74 -
 .../scala/org/apache/scrunch/PipelineLike.scala    |   91 -
 .../scrunch/interpreter/InterpreterRunner.scala    |  208 -
 scrunch/src/main/scripts/imports.scala             |   19 -
 scrunch/src/main/scripts/scrunch                   |  163 -
 scrunch/src/main/scripts/scrunch-job.py            |  133 -
 scrunch/src/site/markdown/index.md                 |   20 -
 scrunch/src/site/site.xml                          |   34 -
 src/site/markdown/scrunch.md                       |    7 +-
 104 files changed, 36931 insertions(+), 36933 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-examples/pom.xml
----------------------------------------------------------------------
diff --git a/crunch-examples/pom.xml b/crunch-examples/pom.xml
new file mode 100644
index 0000000..4b3113f
--- /dev/null
+++ b/crunch-examples/pom.xml
@@ -0,0 +1,78 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.crunch</groupId>
+    <artifactId>crunch-parent</artifactId>
+    <version>0.3.0-incubating-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>crunch-examples</artifactId>
+  <name>Crunch Examples</name>
+
+  <dependencies>
+    <dependency>
+      <groupId>com.google.guava</groupId>
+      <artifactId>guava</artifactId>
+    </dependency>
+
+   <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-client</artifactId>
+      <scope>provided</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.crunch</groupId>
+      <artifactId>crunch</artifactId>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-assembly-plugin</artifactId>
+        <configuration>
+          <descriptors>
+            <descriptor>src/main/assembly/hadoop-job.xml</descriptor>
+          </descriptors>
+          <archive>
+            <manifest>
+              <mainClass>org.apache.crunch.examples.WordCount</mainClass>
+            </manifest>
+          </archive>
+        </configuration>
+        <executions>
+          <execution>
+            <id>make-assembly</id>
+            <phase>package</phase>
+            <goals>
+              <goal>single</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+
+</project>

http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-examples/src/main/assembly/hadoop-job.xml
----------------------------------------------------------------------
diff --git a/crunch-examples/src/main/assembly/hadoop-job.xml b/crunch-examples/src/main/assembly/hadoop-job.xml
new file mode 100644
index 0000000..366bb33
--- /dev/null
+++ b/crunch-examples/src/main/assembly/hadoop-job.xml
@@ -0,0 +1,41 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<assembly>
+  <id>job</id>
+  <formats>
+    <format>jar</format>
+  </formats>
+  <includeBaseDirectory>false</includeBaseDirectory>
+  <dependencySets>
+    <dependencySet>
+      <unpack>false</unpack>
+      <scope>runtime</scope>
+      <outputDirectory>lib</outputDirectory>
+      <excludes>
+        <exclude>${groupId}:${artifactId}</exclude>
+      </excludes>
+    </dependencySet>
+    <dependencySet>
+      <unpack>true</unpack>
+      <includes>
+        <include>${groupId}:${artifactId}</include>
+      </includes>
+    </dependencySet>
+  </dependencySets>
+</assembly>

http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-examples/src/main/java/org/apache/crunch/examples/AverageBytesByIP.java
----------------------------------------------------------------------
diff --git a/crunch-examples/src/main/java/org/apache/crunch/examples/AverageBytesByIP.java b/crunch-examples/src/main/java/org/apache/crunch/examples/AverageBytesByIP.java
new file mode 100644
index 0000000..868e38a
--- /dev/null
+++ b/crunch-examples/src/main/java/org/apache/crunch/examples/AverageBytesByIP.java
@@ -0,0 +1,134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.examples;
+
+import java.io.Serializable;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.crunch.CombineFn;
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.MapFn;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.types.writable.Writables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.util.GenericOptionsParser;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+@SuppressWarnings("serial")
+public class AverageBytesByIP extends Configured implements Tool, Serializable {
+  static enum COUNTERS {
+    NO_MATCH,
+    CORRUPT_SIZE
+  }
+
+  static final String logRegex = "^([\\d.]+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\d+) \"([^\"]+)\" \"([^\"]+)\"";
+
+  public int run(String[] args) throws Exception {
+    if (args.length != 2) {
+      System.err.println();
+      System.err.println("Two and only two arguments are accepted.");
+      System.err.println("Usage: " + this.getClass().getName() + " [generic options] input output");
+      System.err.println();
+      GenericOptionsParser.printGenericCommandUsage(System.err);
+      return 1;
+    }
+    // Create an object to coordinate pipeline creation and execution.
+    Pipeline pipeline = new MRPipeline(AverageBytesByIP.class, getConf());
+    // Reference a given text file as a collection of Strings.
+    PCollection<String> lines = pipeline.readTextFile(args[0]);
+
+    // Combiner used for summing up response size and count
+    CombineFn<String, Pair<Long, Long>> stringPairOfLongsSumCombiner = CombineFn.pairAggregator(CombineFn.SUM_LONGS,
+        CombineFn.SUM_LONGS);
+
+    // Table of (ip, (sum(response size), count))
+    PTable<String, Pair<Long, Long>> remoteAddrResponseSize = lines
+        .parallelDo(extractResponseSize,
+            Writables.tableOf(Writables.strings(), Writables.pairs(Writables.longs(), Writables.longs()))).groupByKey()
+        .combineValues(stringPairOfLongsSumCombiner);
+
+    // Calculate average response size by ip address
+    PTable<String, Double> avgs = remoteAddrResponseSize.parallelDo(calulateAverage,
+        Writables.tableOf(Writables.strings(), Writables.doubles()));
+
+    // write the result to a text file
+    pipeline.writeTextFile(avgs, args[1]);
+    // Execute the pipeline as a MapReduce.
+    pipeline.done();
+    return 0;
+  }
+
+  // Function to calculate the average response size for a given ip address
+  //
+  // Input: (ip, (sum(response size), count))
+  // Output: (ip, average response size)
+  MapFn<Pair<String, Pair<Long, Long>>, Pair<String, Double>> calulateAverage = new MapFn<Pair<String, Pair<Long, Long>>, Pair<String, Double>>() {
+    @Override
+    public Pair<String, Double> map(Pair<String, Pair<Long, Long>> arg) {
+      Pair<Long, Long> sumCount = arg.second();
+      double avg = 0;
+      if (sumCount.second() > 0) {
+        avg = (double) sumCount.first() / (double) sumCount.second();
+      }
+      return Pair.of(arg.first(), avg);
+    }
+  };
+
+  // Function to parse apache log records
+  // Given a standard apache log line, extract the ip address and
+  // response size. Outputs ip and the response size and a count (1) so that
+  // a combiner can be used.
+  //
+  // Input: 55.1.3.2 ...... 200 512 ....
+  // Output: (55.1.3.2, (512, 1))
+  DoFn<String, Pair<String, Pair<Long, Long>>> extractResponseSize = new DoFn<String, Pair<String, Pair<Long, Long>>>() {
+    transient Pattern pattern;
+
+    public void initialize() {
+      pattern = Pattern.compile(logRegex);
+    }
+
+    public void process(String line, Emitter<Pair<String, Pair<Long, Long>>> emitter) {
+      Matcher matcher = pattern.matcher(line);
+      if (matcher.matches()) {
+        try {
+          Long responseSize = Long.parseLong(matcher.group(7));
+          Pair<Long, Long> sumCount = Pair.of(responseSize, 1L);
+          String remoteAddr = matcher.group(1);
+          emitter.emit(Pair.of(remoteAddr, sumCount));
+        } catch (NumberFormatException e) {
+          this.getCounter(COUNTERS.CORRUPT_SIZE).increment(1);
+        }
+      } else {
+        this.getCounter(COUNTERS.NO_MATCH).increment(1);
+      }
+    }
+  };
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new Configuration(), new AverageBytesByIP(), args);
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-examples/src/main/java/org/apache/crunch/examples/TotalBytesByIP.java
----------------------------------------------------------------------
diff --git a/crunch-examples/src/main/java/org/apache/crunch/examples/TotalBytesByIP.java b/crunch-examples/src/main/java/org/apache/crunch/examples/TotalBytesByIP.java
new file mode 100644
index 0000000..1953e3a
--- /dev/null
+++ b/crunch-examples/src/main/java/org/apache/crunch/examples/TotalBytesByIP.java
@@ -0,0 +1,106 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.examples;
+
+import java.io.Serializable;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.crunch.CombineFn;
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pair;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.types.writable.Writables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.util.GenericOptionsParser;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+@SuppressWarnings("serial")
+public class TotalBytesByIP extends Configured implements Tool, Serializable {
+  static enum COUNTERS {
+    NO_MATCH,
+    CORRUPT_SIZE
+  }
+
+  static final String logRegex = "^([\\d.]+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\d+) \"([^\"]+)\" \"([^\"]+)\"";
+
+  public int run(String[] args) throws Exception {
+    if (args.length != 2) {
+      System.err.println();
+      System.err.println("Two and only two arguments are accepted.");
+      System.err.println("Usage: " + this.getClass().getName() + " [generic options] input output");
+      System.err.println();
+      GenericOptionsParser.printGenericCommandUsage(System.err);
+      return 1;
+    }
+    // Create an object to coordinate pipeline creation and execution.
+    Pipeline pipeline = new MRPipeline(TotalBytesByIP.class, getConf());
+    // Reference a given text file as a collection of Strings.
+    PCollection<String> lines = pipeline.readTextFile(args[0]);
+
+    // Combiner used for summing up response size
+    CombineFn<String, Long> longSumCombiner = CombineFn.SUM_LONGS();
+
+    // Table of (ip, sum(response size))
+    PTable<String, Long> ipAddrResponseSize = lines
+        .parallelDo(extractIPResponseSize, Writables.tableOf(Writables.strings(), Writables.longs())).groupByKey()
+        .combineValues(longSumCombiner);
+
+    pipeline.writeTextFile(ipAddrResponseSize, args[1]);
+    // Execute the pipeline as a MapReduce.
+    pipeline.done();
+    return 0;
+  }
+
+  // Function to parse apache log records
+  // Given a standard apache log line, extract the ip address and
+  // response size. Outputs the ip and response size.
+  //
+  // Input: 55.1.3.2 ...... 200 512 ....
+  // Output: (55.1.3.2, 512)
+  DoFn<String, Pair<String, Long>> extractIPResponseSize = new DoFn<String, Pair<String, Long>>() {
+    transient Pattern pattern;
+
+    public void initialize() {
+      pattern = Pattern.compile(logRegex);
+    }
+
+    public void process(String line, Emitter<Pair<String, Long>> emitter) {
+      Matcher matcher = pattern.matcher(line);
+      if (matcher.matches()) {
+        try {
+          Long requestSize = Long.parseLong(matcher.group(7));
+          String remoteAddr = matcher.group(1);
+          emitter.emit(Pair.of(remoteAddr, requestSize));
+        } catch (NumberFormatException e) {
+          // TODO: corrupt size field -- increment COUNTERS.CORRUPT_SIZE instead of silently dropping the line
+        }
+      }
+    }
+  };
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new Configuration(), new TotalBytesByIP(), args);
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-examples/src/main/java/org/apache/crunch/examples/WordCount.java
----------------------------------------------------------------------
diff --git a/crunch-examples/src/main/java/org/apache/crunch/examples/WordCount.java b/crunch-examples/src/main/java/org/apache/crunch/examples/WordCount.java
new file mode 100644
index 0000000..e4ce25b
--- /dev/null
+++ b/crunch-examples/src/main/java/org/apache/crunch/examples/WordCount.java
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.crunch.examples;
+
+import java.io.Serializable;
+
+import org.apache.crunch.DoFn;
+import org.apache.crunch.Emitter;
+import org.apache.crunch.PCollection;
+import org.apache.crunch.PTable;
+import org.apache.crunch.Pipeline;
+import org.apache.crunch.impl.mr.MRPipeline;
+import org.apache.crunch.types.writable.Writables;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.util.GenericOptionsParser;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+public class WordCount extends Configured implements Tool, Serializable {
+  public int run(String[] args) throws Exception {
+    if (args.length != 3) {
+      System.err.println();
+      System.err.println("Usage: " + this.getClass().getName() + " [generic options] input output");
+      System.err.println();
+      GenericOptionsParser.printGenericCommandUsage(System.err);
+      return 1;
+    }
+    // Create an object to coordinate pipeline creation and execution.
+    Pipeline pipeline = new MRPipeline(WordCount.class, getConf());
+    // Reference a given text file as a collection of Strings.
+    PCollection<String> lines = pipeline.readTextFile(args[1]);
+
+    // Define a function that splits each line in a PCollection of
+    // Strings into a PCollection made up of the individual words
+    // in the file.
+    PCollection<String> words = lines.parallelDo(new DoFn<String, String>() {
+      public void process(String line, Emitter<String> emitter) {
+        for (String word : line.split("\\s+")) {
+          emitter.emit(word);
+        }
+      }
+    }, Writables.strings()); // Indicates the serialization format
+
+    // The count method applies a series of Crunch primitives and returns
+    // a map of the unique words in the input PCollection to their counts.
+    // Best of all, the count() function doesn't need to know anything about
+    // the kind of data stored in the input PCollection.
+    PTable<String, Long> counts = words.count();
+
+    // Instruct the pipeline to write the resulting counts to a text file.
+    pipeline.writeTextFile(counts, args[2]);
+    // Execute the pipeline as a MapReduce.
+    pipeline.done();
+    return 0;
+  }
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new Configuration(), new WordCount(), args);
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-examples/src/site/markdown/index.md
----------------------------------------------------------------------
diff --git a/crunch-examples/src/site/markdown/index.md b/crunch-examples/src/site/markdown/index.md
new file mode 100644
index 0000000..838e3ae
--- /dev/null
+++ b/crunch-examples/src/site/markdown/index.md
@@ -0,0 +1,20 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+# Apache Crunch - Examples
+---

http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-examples/src/site/site.xml
----------------------------------------------------------------------
diff --git a/crunch-examples/src/site/site.xml b/crunch-examples/src/site/site.xml
new file mode 100644
index 0000000..73fbd17
--- /dev/null
+++ b/crunch-examples/src/site/site.xml
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<project name="${project.name}"
+  xmlns="http://maven.apache.org/DECORATION/1.3.0"
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/DECORATION/1.3.0
+                      http://maven.apache.org/xsd/decoration-1.3.0.xsd">
+
+  <body>
+    <!-- Note: Breadcrumbs for Doxia's Markdown parser are currently broken,
+               see https://jira.codehaus.org/browse/DOXIA-472 -->
+    <breadcrumbs>
+      <item name="Apache" href="http://www.apache.org/index.html" />
+      <item name="Crunch" href="../index.html"/>
+    </breadcrumbs>
+
+  </body>
+
+</project>

http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-scrunch/build.sbt
----------------------------------------------------------------------
diff --git a/crunch-scrunch/build.sbt b/crunch-scrunch/build.sbt
new file mode 100644
index 0000000..7d4a6f8
--- /dev/null
+++ b/crunch-scrunch/build.sbt
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+name := "scrunch"
+
+version := "0.2.0"
+
+scalaVersion := "2.9.2"
+
+resolvers ++= Seq(
+  "Local Maven Repository" at "file://"+Path.userHome.absolutePath+"/.m2/repository",
+  "Hadoop Releases" at "https://repository.cloudera.com/content/repositories/releases/"
+)
+
+libraryDependencies ++= Seq(
+  "org.apache.crunch" % "crunch" % "0.3.0" excludeAll(
+    ExclusionRule(organization = "com.sun.jdmk"),
+    ExclusionRule(organization = "com.sun.jmx"),
+    ExclusionRule(organization = "javax.jms")
+  ),
+  "org.apache.hadoop" % "hadoop-client" % "0.20.2-cdh3u4" % "provided" excludeAll(
+    ExclusionRule(organization = "com.sun.jdmk"),
+    ExclusionRule(organization = "com.sun.jmx"),
+    ExclusionRule(organization = "javax.jms")
+  ),
+  "org.apache.hbase" % "hbase" % "0.90.6-cdh3u4" % "provided" excludeAll(
+    ExclusionRule(organization = "org.apache.hadoop"),
+    ExclusionRule(organization = "commons-logging"),
+    ExclusionRule(organization = "com.google.guava"),
+    ExclusionRule(organization = "log4j"),
+    ExclusionRule(organization = "org.slf4j")
+  ),
+  "junit" % "junit" % "4.8.1" % "test",
+  "org.scalatest" % "scalatest_2.9.2" % "1.7.2" % "test"
+)
+
+parallelExecution in Test := false

http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-scrunch/pom.xml
----------------------------------------------------------------------
diff --git a/crunch-scrunch/pom.xml b/crunch-scrunch/pom.xml
new file mode 100644
index 0000000..0e3eac8
--- /dev/null
+++ b/crunch-scrunch/pom.xml
@@ -0,0 +1,199 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.crunch</groupId>
+    <artifactId>crunch-parent</artifactId>
+    <version>0.3.0-incubating-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>crunch-scrunch</artifactId>
+  <name>Crunch for Scala</name>
+
+  <dependencies>
+    <dependency>
+      <groupId>org.scala-lang</groupId>
+      <artifactId>scala-library</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.scala-lang</groupId>
+      <artifactId>scala-compiler</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.scala-lang</groupId>
+      <artifactId>jline</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.crunch</groupId>
+      <artifactId>crunch</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-client</artifactId>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hbase</groupId>
+      <artifactId>hbase</artifactId>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-log4j12</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.scalatest</groupId>
+      <artifactId>scalatest_${scala.version}</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.crunch</groupId>
+      <artifactId>crunch-test</artifactId>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-assembly-plugin</artifactId>
+        <executions>
+          <execution>
+            <id>jar-with-dependencies</id>
+            <phase>package</phase>
+            <goals>
+              <goal>single</goal>
+            </goals>
+            <configuration>
+              <descriptorRefs>
+                <descriptorRef>jar-with-dependencies</descriptorRef>
+              </descriptorRefs>
+            </configuration>
+          </execution>
+          <execution>
+            <id>make-assembly</id>
+            <phase>package</phase>
+            <goals>
+              <goal>single</goal>
+            </goals>
+            <configuration>
+              <descriptors>
+                <descriptor>${basedir}/src/main/assembly/release.xml</descriptor>
+              </descriptors>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <groupId>org.scala-tools</groupId>
+        <artifactId>maven-scala-plugin</artifactId>
+        <executions>
+          <execution>
+            <goals>
+              <goal>compile</goal>
+              <goal>testCompile</goal>
+            </goals>
+            <configuration>
+              <args>
+                <arg>-deprecation</arg>
+                <arg>-dependencyfile</arg>
+                <arg>${project.build.directory}/.scala_dependencies</arg>
+              </args>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-surefire-plugin</artifactId>
+        <configuration>
+          <useFile>false</useFile>
+          <disableXmlReport>true</disableXmlReport>
+          <includes>
+            <include>${project.build.testSourceDirectory}/**/*Test.*</include>
+            <include>${project.build.testSourceDirectory}/**/*Suite.*</include>
+          </includes>
+        </configuration>
+      </plugin>
+      <!-- We put slow-running tests into src/it and run them during the
+           integration-test phase using the failsafe plugin. This way
+           developers can run unit tests conveniently from the IDE or via
+           "mvn package" from the command line without triggering
+           time-consuming integration tests. -->
+      <plugin>
+        <groupId>org.codehaus.mojo</groupId>
+        <artifactId>build-helper-maven-plugin</artifactId>
+        <executions>
+          <execution>
+            <id>add-test-source</id>
+            <phase>validate</phase>
+            <goals>
+              <goal>add-test-source</goal>
+            </goals>
+            <configuration>
+              <sources>
+                <source>${basedir}/src/it/java</source>
+                <source>${basedir}/src/it/scala</source>
+              </sources>
+            </configuration>
+          </execution>
+          <execution>
+            <id>add-test-resource</id>
+            <phase>validate</phase>
+            <goals>
+              <goal>add-test-resource</goal>
+            </goals>
+            <configuration>
+              <resources>
+                  <resource>
+                    <directory>${basedir}/src/it/resources</directory>
+                  </resource>
+              </resources>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-failsafe-plugin</artifactId>
+        <configuration>
+          <testSourceDirectory>${basedir}/src/it/scala</testSourceDirectory>
+          <useFile>false</useFile>
+          <disableXmlReport>true</disableXmlReport>
+          <includes>
+            <include>**/*Test.*</include>
+            <include>**/*Suite.*</include>
+          </includes>
+        </configuration>
+        <executions>
+          <execution>
+            <goals>
+              <goal>integration-test</goal>
+              <goal>verify</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+</project>

http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/dfd28922/crunch-scrunch/src/it/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/crunch-scrunch/src/it/resources/log4j.properties b/crunch-scrunch/src/it/resources/log4j.properties
new file mode 100644
index 0000000..c7847f8
--- /dev/null
+++ b/crunch-scrunch/src/it/resources/log4j.properties
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ***** Set the org.apache.crunch logger level to INFO and attach appender A.
+log4j.logger.org.apache.crunch=info, A
+
+# Only log warnings and above from Hadoop for the local runner when testing
+log4j.logger.org.apache.hadoop=warn, A
+
+# ***** A is set to be a ConsoleAppender.
+log4j.appender.A=org.apache.log4j.ConsoleAppender
+# ***** A uses PatternLayout.
+log4j.appender.A.layout=org.apache.log4j.PatternLayout
+log4j.appender.A.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
+