You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@avro.apache.org by cu...@apache.org on 2013/12/19 21:58:47 UTC
svn commit: r1552425 - in /avro/trunk: ./ doc/examples/java-example/
doc/examples/mr-example/ doc/examples/mr-example/src/main/java/example/
doc/src/content/xdocs/
Author: cutting
Date: Thu Dec 19 20:58:46 2013
New Revision: 1552425
URL: http://svn.apache.org/r1552425
Log:
AVRO-1225. Java: Add guide for MapReduce API. Contributed by Brock Noland.
Added:
avro/trunk/doc/examples/mr-example/src/main/java/example/MapReduceColorCount.java (with props)
avro/trunk/doc/examples/mr-example/src/main/java/example/MapredColorCount.java
- copied, changed from r1552397, avro/trunk/doc/examples/mr-example/src/main/java/example/ColorCount.java
Removed:
avro/trunk/doc/examples/mr-example/src/main/java/example/ColorCount.java
Modified:
avro/trunk/CHANGES.txt
avro/trunk/doc/examples/java-example/pom.xml
avro/trunk/doc/examples/mr-example/ (props changed)
avro/trunk/doc/examples/mr-example/pom.xml
avro/trunk/doc/src/content/xdocs/mr.xml
Modified: avro/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/avro/trunk/CHANGES.txt?rev=1552425&r1=1552424&r2=1552425&view=diff
==============================================================================
--- avro/trunk/CHANGES.txt (original)
+++ avro/trunk/CHANGES.txt Thu Dec 19 20:58:46 2013
@@ -65,6 +65,8 @@ Trunk (not yet released)
AVRO-1063. Ruby: Use multi_json instead of requiring yajl.
(Duke via cutting)
+ AVRO-1225. Java: Add guide for MapReduce API. (Brock Noland via cutting)
+
BUG FIXES
AVRO-1368. Fix SpecificDatumWriter to, when writing a string
Modified: avro/trunk/doc/examples/java-example/pom.xml
URL: http://svn.apache.org/viewvc/avro/trunk/doc/examples/java-example/pom.xml?rev=1552425&r1=1552424&r2=1552425&view=diff
==============================================================================
--- avro/trunk/doc/examples/java-example/pom.xml (original)
+++ avro/trunk/doc/examples/java-example/pom.xml Thu Dec 19 20:58:46 2013
@@ -17,7 +17,7 @@
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
- <version>1.7.2</version>
+ <version>1.7.5</version>
</dependency>
</dependencies>
<build>
@@ -25,7 +25,7 @@
<plugin>
<groupId>org.apache.avro</groupId>
<artifactId>avro-maven-plugin</artifactId>
- <version>1.7.2</version>
+ <version>1.7.5</version>
<executions>
<execution>
<phase>generate-sources</phase>
Propchange: avro/trunk/doc/examples/mr-example/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Thu Dec 19 20:58:46 2013
@@ -0,0 +1,3 @@
+target
+input
+output
Modified: avro/trunk/doc/examples/mr-example/pom.xml
URL: http://svn.apache.org/viewvc/avro/trunk/doc/examples/mr-example/pom.xml?rev=1552425&r1=1552424&r2=1552425&view=diff
==============================================================================
--- avro/trunk/doc/examples/mr-example/pom.xml (original)
+++ avro/trunk/doc/examples/mr-example/pom.xml Thu Dec 19 20:58:46 2013
@@ -22,7 +22,7 @@
<plugin>
<groupId>org.apache.avro</groupId>
<artifactId>avro-maven-plugin</artifactId>
- <version>1.7.2</version>
+ <version>1.7.5</version>
<executions>
<execution>
<phase>generate-sources</phase>
@@ -31,7 +31,7 @@
</goals>
<configuration>
<sourceDirectory>${project.basedir}/../</sourceDirectory>
- <outputDirectory>${project.basedir}/src/main/java/</outputDirectory>
+ <outputDirectory>${project.build.directory}/generated-sources/java</outputDirectory>
</configuration>
</execution>
</executions>
@@ -43,12 +43,12 @@
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
- <version>1.7.2</version>
+ <version>1.7.5</version>
</dependency>
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro-mapred</artifactId>
- <version>1.7.2</version>
+ <version>1.7.5</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
Added: avro/trunk/doc/examples/mr-example/src/main/java/example/MapReduceColorCount.java
URL: http://svn.apache.org/viewvc/avro/trunk/doc/examples/mr-example/src/main/java/example/MapReduceColorCount.java?rev=1552425&view=auto
==============================================================================
--- avro/trunk/doc/examples/mr-example/src/main/java/example/MapReduceColorCount.java (added)
+++ avro/trunk/doc/examples/mr-example/src/main/java/example/MapReduceColorCount.java Thu Dec 19 20:58:46 2013
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package example;
+
+import java.io.IOException;
+
+import org.apache.avro.Schema;
+import org.apache.avro.mapred.AvroKey;
+import org.apache.avro.mapred.AvroValue;
+import org.apache.avro.mapreduce.AvroJob;
+import org.apache.avro.mapreduce.AvroKeyInputFormat;
+import org.apache.avro.mapreduce.AvroKeyValueOutputFormat;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+import example.avro.User;
+
+public class MapReduceColorCount extends Configured implements Tool {
+
+ public static class ColorCountMapper extends
+ Mapper<AvroKey<User>, NullWritable, Text, IntWritable> {
+
+ @Override
+ public void map(AvroKey<User> key, NullWritable value, Context context)
+ throws IOException, InterruptedException {
+
+ CharSequence color = key.datum().getFavoriteColor();
+ if (color == null) {
+ color = "none";
+ }
+ context.write(new Text(color.toString()), new IntWritable(1));
+ }
+ }
+
+ public static class ColorCountReducer extends
+ Reducer<Text, IntWritable, AvroKey<CharSequence>, AvroValue<Integer>> {
+
+ @Override
+ public void reduce(Text key, Iterable<IntWritable> values,
+ Context context) throws IOException, InterruptedException {
+
+ int sum = 0;
+ for (IntWritable value : values) {
+ sum += value.get();
+ }
+ context.write(new AvroKey<CharSequence>(key.toString()), new AvroValue<Integer>(sum));
+ }
+ }
+
+ public int run(String[] args) throws Exception {
+ if (args.length != 2) {
+ System.err.println("Usage: MapReduceColorCount <input path> <output path>");
+ return -1;
+ }
+
+ Job job = new Job(getConf());
+ job.setJarByClass(MapReduceColorCount.class);
+ job.setJobName("Color Count");
+
+ FileInputFormat.setInputPaths(job, new Path(args[0]));
+ FileOutputFormat.setOutputPath(job, new Path(args[1]));
+
+ job.setInputFormatClass(AvroKeyInputFormat.class);
+ job.setMapperClass(ColorCountMapper.class);
+ AvroJob.setInputKeySchema(job, User.getClassSchema());
+ job.setMapOutputKeyClass(Text.class);
+ job.setMapOutputValueClass(IntWritable.class);
+
+ job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
+ job.setReducerClass(ColorCountReducer.class);
+ AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING));
+ AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT));
+
+ return (job.waitForCompletion(true) ? 0 : 1);
+ }
+
+ public static void main(String[] args) throws Exception {
+ int res = ToolRunner.run(new MapReduceColorCount(), args);
+ System.exit(res);
+ }
+}
Propchange: avro/trunk/doc/examples/mr-example/src/main/java/example/MapReduceColorCount.java
------------------------------------------------------------------------------
svn:eol-style = native
Copied: avro/trunk/doc/examples/mr-example/src/main/java/example/MapredColorCount.java (from r1552397, avro/trunk/doc/examples/mr-example/src/main/java/example/ColorCount.java)
URL: http://svn.apache.org/viewvc/avro/trunk/doc/examples/mr-example/src/main/java/example/MapredColorCount.java?p2=avro/trunk/doc/examples/mr-example/src/main/java/example/MapredColorCount.java&p1=avro/trunk/doc/examples/mr-example/src/main/java/example/ColorCount.java&r1=1552397&r2=1552425&rev=1552425&view=diff
==============================================================================
--- avro/trunk/doc/examples/mr-example/src/main/java/example/ColorCount.java (original)
+++ avro/trunk/doc/examples/mr-example/src/main/java/example/MapredColorCount.java Thu Dec 19 20:58:46 2013
@@ -30,7 +30,7 @@ import org.apache.hadoop.util.*;
import example.avro.User;
-public class ColorCount extends Configured implements Tool {
+public class MapredColorCount extends Configured implements Tool {
public static class ColorCountMapper extends AvroMapper<User, Pair<CharSequence, Integer>> {
@Override
@@ -62,11 +62,11 @@ public class ColorCount extends Configur
public int run(String[] args) throws Exception {
if (args.length != 2) {
- System.err.println("Usage: ColorCount <input path> <output path>");
+ System.err.println("Usage: MapredColorCount <input path> <output path>");
return -1;
}
- JobConf conf = new JobConf(getConf(), ColorCount.class);
+ JobConf conf = new JobConf(getConf(), MapredColorCount.class);
conf.setJobName("colorcount");
FileInputFormat.setInputPaths(conf, new Path(args[0]));
@@ -78,7 +78,7 @@ public class ColorCount extends Configur
// Note that AvroJob.setInputSchema and AvroJob.setOutputSchema set
// relevant config options such as input/output format, map output
// classes, and output key class.
- AvroJob.setInputSchema(conf, User.SCHEMA$);
+ AvroJob.setInputSchema(conf, User.getClassSchema());
AvroJob.setOutputSchema(conf, Pair.getPairSchema(Schema.create(Type.STRING),
Schema.create(Type.INT)));
@@ -87,7 +87,7 @@ public class ColorCount extends Configur
}
public static void main(String[] args) throws Exception {
- int res = ToolRunner.run(new Configuration(), new ColorCount(), args);
+ int res = ToolRunner.run(new Configuration(), new MapredColorCount(), args);
System.exit(res);
}
}
Modified: avro/trunk/doc/src/content/xdocs/mr.xml
URL: http://svn.apache.org/viewvc/avro/trunk/doc/src/content/xdocs/mr.xml?rev=1552425&r1=1552424&r2=1552425&view=diff
==============================================================================
--- avro/trunk/doc/src/content/xdocs/mr.xml (original)
+++ avro/trunk/doc/src/content/xdocs/mr.xml Thu Dec 19 20:58:46 2013
@@ -104,11 +104,16 @@
<section>
<title>Example: ColorCount</title>
<p>
- Below is a simple example of a MapReduce that uses Avro. This example
- can be found in the Avro docs under
- <em>examples/mr-example/src/main/java/example/ColorCount.java</em>.
- We'll go over the specifics of what's going on in subsequent sections.
+ Below is a simple example of a MapReduce that uses Avro. There is an example
+ for both the old (<em>org.apache.hadoop.mapred</em>) and new
+ (<em>org.apache.hadoop.mapreduce</em>) APIs under
+ <em>examples/mr-example/src/main/java/example/</em>. <em>MapredColorCount</em>
+ is the example for the older mapred API while <em>MapReduceColorCount</em> is
+ the example for the newer mapreduce API. Both examples are below, but
+ we will detail the mapred API in our subsequent examples.
</p>
+
+ <p>MapredColorCount:</p>
<source>
package example;
@@ -124,7 +129,7 @@ import org.apache.hadoop.util.*;
import example.avro.User;
-public class ColorCount extends Configured implements Tool {
+public class MapredColorCount extends Configured implements Tool {
public static class ColorCountMapper extends AvroMapper<User, Pair<CharSequence, Integer>> {
@Override
@@ -156,11 +161,11 @@ public class ColorCount extends Configur
public int run(String[] args) throws Exception {
if (args.length != 2) {
- System.err.println("Usage: ColorCount <input path> <output path>");
+ System.err.println("Usage: MapredColorCount <input path> <output path>");
return -1;
}
- JobConf conf = new JobConf(getConf(), ColorCount.class);
+ JobConf conf = new JobConf(getConf(), MapredColorCount.class);
conf.setJobName("colorcount");
FileInputFormat.setInputPaths(conf, new Path(args[0]));
@@ -172,7 +177,7 @@ public class ColorCount extends Configur
// Note that AvroJob.setInputSchema and AvroJob.setOutputSchema set
// relevant config options such as input/output format, map output
// classes, and output key class.
- AvroJob.setInputSchema(conf, User.SCHEMA$);
+ AvroJob.setInputSchema(conf, User.getClassSchema());
AvroJob.setOutputSchema(conf, Pair.getPairSchema(Schema.create(Type.STRING),
Schema.create(Type.INT)));
@@ -181,11 +186,105 @@ public class ColorCount extends Configur
}
public static void main(String[] args) throws Exception {
- int res = ToolRunner.run(new Configuration(), new ColorCount(), args);
+ int res = ToolRunner.run(new Configuration(), new MapredColorCount(), args);
System.exit(res);
}
}
</source>
+
+ <p>MapReduceColorCount:</p>
+ <source>
+package example;
+
+import java.io.IOException;
+
+import org.apache.avro.Schema;
+import org.apache.avro.mapred.AvroKey;
+import org.apache.avro.mapred.AvroValue;
+import org.apache.avro.mapreduce.AvroJob;
+import org.apache.avro.mapreduce.AvroKeyInputFormat;
+import org.apache.avro.mapreduce.AvroKeyValueOutputFormat;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+import example.avro.User;
+
+public class MapReduceColorCount extends Configured implements Tool {
+
+ public static class ColorCountMapper extends
+ Mapper<AvroKey<User>, NullWritable, Text, IntWritable> {
+
+ @Override
+ public void map(AvroKey<User> key, NullWritable value, Context context)
+ throws IOException, InterruptedException {
+
+ CharSequence color = key.datum().getFavoriteColor();
+ if (color == null) {
+ color = "none";
+ }
+ context.write(new Text(color.toString()), new IntWritable(1));
+ }
+ }
+
+ public static class ColorCountReducer extends
+ Reducer<Text, IntWritable, AvroKey<CharSequence>, AvroValue<Integer>> {
+
+ @Override
+ public void reduce(Text key, Iterable<IntWritable> values,
+ Context context) throws IOException, InterruptedException {
+
+ int sum = 0;
+ for (IntWritable value : values) {
+ sum += value.get();
+ }
+ context.write(new AvroKey<CharSequence>(key.toString()), new AvroValue<Integer>(sum));
+ }
+ }
+
+ public int run(String[] args) throws Exception {
+ if (args.length != 2) {
+ System.err.println("Usage: MapReduceColorCount <input path> <output path>");
+ return -1;
+ }
+
+ Job job = new Job(getConf());
+ job.setJarByClass(MapReduceColorCount.class);
+ job.setJobName("Color Count");
+
+ FileInputFormat.setInputPaths(job, new Path(args[0]));
+ FileOutputFormat.setOutputPath(job, new Path(args[1]));
+
+ job.setInputFormatClass(AvroKeyInputFormat.class);
+ job.setMapperClass(ColorCountMapper.class);
+ AvroJob.setInputKeySchema(job, User.getClassSchema());
+ job.setMapOutputKeyClass(Text.class);
+ job.setMapOutputValueClass(IntWritable.class);
+
+ job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
+ job.setReducerClass(ColorCountReducer.class);
+ AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING));
+ AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT));
+
+ return (job.waitForCompletion(true) ? 0 : 1);
+ }
+
+ public static void main(String[] args) throws Exception {
+ int res = ToolRunner.run(new MapReduceColorCount(), args);
+ System.exit(res);
+ }
+}
+ </source>
+
<p>
ColorCount reads in data files containing <code>User</code> records,
defined in <em>examples/user.avsc</em>, and counts the number of
@@ -229,8 +328,8 @@ public class ColorCount extends Configur
mvn compile
</source>
<p>
- Next, run GenerateData to create an Avro data file,
- <em>input/users.avro</em>, containing 20 <code>User</code>s with
+ Next, run GenerateData from examples/mr-example to create an Avro data
+ file, <em>input/users.avro</em>, containing 20 <code>User</code>s with
favorite colors chosen randomly from a list:
</p>
<source>
@@ -254,7 +353,7 @@ mvn exec:java -q -Dexec.mainClass=exampl
folder already exists):
</p>
<source>
-mvn exec:java -q -Dexec.mainClass=example.ColorCount -Dexec.args="input output"
+mvn exec:java -q -Dexec.mainClass=example.MapredColorCount -Dexec.args="input output"
</source>
<p>
Once ColorCount completes, checking the contents of the new
@@ -281,7 +380,7 @@ $ java -jar /path/to/avro-tools-&AvroVer
</section>
<p>Now let's go over the ColorCount example in detail.</p>
<section>
- <title>AvroMapper</title>
+ <title>Mapper - org.apache.hadoop.mapred API</title>
<p>
The easiest way to use Avro data files as input to a MapReduce job is to
subclass <code>AvroMapper</code>. An <code>AvroMapper</code> defines a
@@ -314,7 +413,7 @@ public static class ColorCountMapper ext
</p>
<source>
AvroJob.setMapperClass(conf, ColorCountMapper.class);
-AvroJob.setInputSchema(conf, User.SCHEMA$);
+AvroJob.setInputSchema(conf, User.getClassSchema());
</source>
<p>
Note that <code>AvroMapper</code> does not implement the
@@ -332,7 +431,33 @@ AvroJob.setInputSchema(conf, User.SCHEMA
</p>
</section>
<section>
- <title>AvroReducer</title>
+ <title>Mapper - org.apache.hadoop.mapreduce API</title>
+ <p>
+ This document will not go into all the differences between the mapred and mapreduce APIs,
+ but it will describe the main differences. As you can see, ColorCountMapper is now a
+ subclass of the Hadoop Mapper class and is passed an AvroKey as its key.
+
+ Additionally, the AvroJob method calls were slightly changed.
+ </p>
+ <source>
+ public static class ColorCountMapper extends
+ Mapper<AvroKey<User>, NullWritable, Text, IntWritable> {
+
+ @Override
+ public void map(AvroKey<User> key, NullWritable value, Context context)
+ throws IOException, InterruptedException {
+
+ CharSequence color = key.datum().getFavoriteColor();
+ if (color == null) {
+ color = "none";
+ }
+ context.write(new Text(color.toString()), new IntWritable(1));
+ }
+ }
+ </source>
+ </section>
+ <section>
+ <title>Reducer - org.apache.hadoop.mapred API</title>
<p>
Analogously to <code>AvroMapper</code>, an <code>AvroReducer</code>
defines a reducer function that takes the key/value types output by an
@@ -387,18 +512,48 @@ AvroJob.setOutputSchema(conf, Pair.getPa
</p>
</section>
<section>
+ <title>Reducer - org.apache.hadoop.mapreduce API</title>
+ <p>
+ As before, we will not detail every difference between the APIs. As with the Mapper
+ change, ColorCountReducer is now a subclass of Reducer, and AvroKey and AvroValue
+ are emitted.
+
+ Additionally, the AvroJob method calls were slightly changed.
+ </p>
+ <source>
+ public static class ColorCountReducer extends
+ Reducer<Text, IntWritable, AvroKey<CharSequence>, AvroValue<Integer>> {
+
+ @Override
+ public void reduce(Text key, Iterable<IntWritable> values,
+ Context context) throws IOException, InterruptedException {
+
+ int sum = 0;
+ for (IntWritable value : values) {
+ sum += value.get();
+ }
+ context.write(new AvroKey<CharSequence>(key.toString()), new AvroValue<Integer>(sum));
+ }
+ }
+ </source>
+ </section>
+ <section>
<title>Learning more</title>
<p>
- It's possible to mix <code>AvroMapper</code>s and
+ The mapred API allows users to mix Avro <code>AvroMapper</code>s and
<code>AvroReducer</code>s with non-Avro <code>Mapper</code>s and
- <code>Reducer</code>s. See the <a
+ <code>Reducer</code>s, and the mapreduce API allows users to input Avro
+ and output non-Avro or vice versa.
+ </p>
+
+ <p>
+ The mapred package has api <a
href="http://avro.apache.org/docs/current/api/java/org/apache/avro/mapred/package-summary.html">
- <code>org.apache.avro.mapred</code> documentation</a> for more details.
- There is also a <a
+ <code>org.apache.avro.mapred</code> documentation</a> as does the <a
href="http://avro.apache.org/docs/current/api/java/org/apache/avro/mapreduce/package-summary.html">
- <code>org.apache.avro.mapreduce</code> package</a> for use with the new
- MapReduce API (<code>org.apache.hadoop.mapreduce</code>). It's also
- possible to implement your own <code>Mapper</code>s and
+ <code>org.apache.avro.mapreduce</code> package</a> for use with the new
+ MapReduce API (<code>org.apache.hadoop.mapreduce</code>). Similarly to the mapreduce package,
+ it's possible with the mapred API to implement your own <code>Mapper</code>s and
<code>Reducer</code>s directly using the public classes provided in
these libraries. See the AvroWordCount application, found under
<em>examples/mr-example/src/main/java/example/AvroWordCount.java</em> in