You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@hbase.apache.org by Ryan Rawson <ry...@gmail.com> on 2009/04/22 06:53:11 UTC
Randomize your input during huge imports
Here is the MapReduce I use to randomize the lines of a file. I've omitted
the imports for brevity - your IDE can fix that.
Enjoy!
-ryan
public class Randomize {
// technically text/text could be 'object'.
public static class Map extends MapReduceBase implements
Mapper<LongWritable, Text, IntWritable, Text> {
Random rnd = new Random();
public void map(LongWritable key, Text value,
OutputCollector<IntWritable, Text> output, Reporter reporter)
throws IOException {
IntWritable redKey = new IntWritable(rnd.nextInt(100000));
output.collect(redKey, value);
reporter.setStatus("Map emitting cell for: " + redKey);
}
}
// This combiner reduces the time of a map-reduce from 1h18m -> 48m.
// That is a 38% improvement (!!).
public static class Combiner extends MapReduceBase
implements Reducer<IntWritable, Text, IntWritable, Text> {
public void reduce(IntWritable key, Iterator<Text> values,
OutputCollector<IntWritable, Text> output, Reporter reporter)
throws IOException {
Text out = new Text();
byte newline [] = {'\n'};
int siz = 0;
while (values.hasNext())
{
Text txt = values.next();
out.append(txt.getBytes(), 0, txt.getLength());
if (++siz > 500) {
output.collect(key, out);
siz = 0;
out = new Text();
} else {
if (values.hasNext())
out.append(newline, 0, newline.length);
}
}
output.collect(key, out);
}
}
public static class Reduce extends MapReduceBase implements
Reducer<IntWritable, Text, NullWritable, Text> {
public void reduce(IntWritable key, Iterator<Text> values,
OutputCollector<NullWritable, Text> output, Reporter reporter)
throws IOException {
while (values.hasNext())
{
output.collect(NullWritable.get(), values.next());
}
}
}
public static void main(String [] argv) throws IOException {
if (argv.length < 2) {
System.out.println("Usage: <input> <randomized output>");
return;
}
JobConf job = new JobConf(Randomize.class);
job.setJobName("Randomize: " + argv[0]);
FileInputFormat.setInputPaths(job, new Path(argv[0]));
job.setInputFormat(TextInputFormat.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setCombinerClass(Combiner.class);
FileOutputFormat.setOutputPath(job, new Path(argv[1]));
job.setOutputFormat(TextOutputFormat.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Text.class);
JobClient jc = new JobClient(job);
jc.submitJob(job);
}
}
Re: Randomize your input during huge imports
Posted by stack <st...@duboce.net>.
Should we add this to the examples, Ryan? Or do you think it is general enough that we should add it under src and offer it as an option when you do:
./bin/hadoop -jar hbase.jar
Currently the only option when you do the above is rowcounter.
Is below generic enough that we can add a 'randomizer' option on above?
St.Ack
On Tue, Apr 21, 2009 at 9:53 PM, Ryan Rawson <ry...@gmail.com> wrote:
> Here is the MapReduce I use to randomize the lines of a file. I've omitted
> the imports for brevity - your IDE can fix that.
>
> Enjoy!
> -ryan
>
> public class Randomize {
>
> // technically text/text could be 'object'.
> public static class Map extends MapReduceBase implements
> Mapper<LongWritable, Text, IntWritable, Text> {
> Random rnd = new Random();
>
> public void map(LongWritable key, Text value,
> OutputCollector<IntWritable, Text> output, Reporter reporter)
> throws IOException {
> IntWritable redKey = new IntWritable(rnd.nextInt(100000));
> output.collect(redKey, value);
> reporter.setStatus("Map emitting cell for: " + redKey);
>
> }
>
> }
>
> // This combiner reduces the time of a map-reduce from 1h18m -> 48m.
> // That is a 38% improvement (!!).
> public static class Combiner extends MapReduceBase
> implements Reducer<IntWritable, Text, IntWritable, Text> {
>
> public void reduce(IntWritable key, Iterator<Text> values,
> OutputCollector<IntWritable, Text> output, Reporter reporter)
> throws IOException {
> Text out = new Text();
> byte newline [] = {'\n'};
> int siz = 0;
> while (values.hasNext())
> {
> Text txt = values.next();
> out.append(txt.getBytes(), 0, txt.getLength());
>
> if (++siz > 500) {
> output.collect(key, out);
> siz = 0;
> out = new Text();
> } else {
> if (values.hasNext())
> out.append(newline, 0, newline.length);
> }
> }
> output.collect(key, out);
> }
> }
>
> public static class Reduce extends MapReduceBase implements
> Reducer<IntWritable, Text, NullWritable, Text> {
> public void reduce(IntWritable key, Iterator<Text> values,
> OutputCollector<NullWritable, Text> output, Reporter reporter)
> throws IOException {
> while (values.hasNext())
> {
> output.collect(NullWritable.get(), values.next());
> }
> }
>
> }
> public static void main(String [] argv) throws IOException {
> if (argv.length < 2) {
> System.out.println("Usage: <input> <randomized output>");
> return;
> }
> JobConf job = new JobConf(Randomize.class);
> job.setJobName("Randomize: " + argv[0]);
> FileInputFormat.setInputPaths(job, new Path(argv[0]));
> job.setInputFormat(TextInputFormat.class);
> job.setMapperClass(Map.class);
> job.setReducerClass(Reduce.class);
> job.setCombinerClass(Combiner.class);
> FileOutputFormat.setOutputPath(job, new Path(argv[1]));
> job.setOutputFormat(TextOutputFormat.class);
>
> job.setOutputKeyClass(IntWritable.class);
> job.setOutputValueClass(Text.class);
>
> JobClient jc = new JobClient(job);
> jc.submitJob(job);
> }
> }
>