Posted to user@hbase.apache.org by Ryan Rawson <ry...@gmail.com> on 2009/04/22 06:53:11 UTC

Randomize your input during huge imports

Here is the MapReduce I use to randomize the lines of a file.  I've omitted
the imports for brevity - your IDE can fix that.

Enjoy!
-ryan

public class Randomize {

  // technically text/text could be 'object'.
  public static class Map extends MapReduceBase
      implements Mapper<LongWritable, Text, IntWritable, Text> {
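    // Each input line is emitted under a random bucket key in [0, 100000);
    // the shuffle's sort on those keys is what actually scrambles line order.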
    Random rnd = new Random();

    public void map(LongWritable key, Text value,
        OutputCollector<IntWritable, Text> output, Reporter reporter)
    throws IOException {
      IntWritable redKey = new IntWritable(rnd.nextInt(100000));
      output.collect(redKey, value);
      reporter.setStatus("Map emitting cell for: " + redKey);

    }

  }

  // This combiner reduces the time of a map-reduce from 1h18m -> 48m.
  // That is a 38% improvement (!!).
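  // It concatenates up to ~500 lines per key into one newline-joined Text
  // value, which cuts the number of records the shuffle has to move around.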
  public static class Combiner extends MapReduceBase
      implements Reducer<IntWritable, Text, IntWritable, Text> {

    public void reduce(IntWritable key, Iterator<Text> values,
        OutputCollector<IntWritable, Text> output, Reporter reporter)
        throws IOException {
      Text out = new Text();
      byte newline [] = {'\n'};
      int siz = 0;
      while (values.hasNext())
      {
        Text txt = values.next();
        out.append(txt.getBytes(), 0, txt.getLength());

        if (++siz > 500) {
          output.collect(key, out);
          siz = 0;
          out = new Text();
        } else {
          if (values.hasNext())
            out.append(newline, 0, newline.length);
        }
      }
      // Only emit the trailing chunk if it holds data; an empty Text here
      // would otherwise come out as a blank line in the final output.
      if (out.getLength() > 0)
        output.collect(key, out);
    }
  }

  public static class Reduce extends MapReduceBase
      implements Reducer<IntWritable, Text, NullWritable, Text> {
    public void reduce(IntWritable key, Iterator<Text> values,
        OutputCollector<NullWritable, Text> output, Reporter reporter)
    throws IOException {
      while (values.hasNext())
      {
        output.collect(NullWritable.get(), values.next());
      }
    }

  }
  public static void main(String [] argv) throws IOException {
    if (argv.length < 2) {
      System.out.println("Usage: <input> <randomized output>");
      return;
    }
    JobConf job = new JobConf(Randomize.class);
    job.setJobName("Randomize: " + argv[0]);
    FileInputFormat.setInputPaths(job, new Path(argv[0]));
    job.setInputFormat(TextInputFormat.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setCombinerClass(Combiner.class);
    FileOutputFormat.setOutputPath(job, new Path(argv[1]));
    job.setOutputFormat(TextOutputFormat.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    JobClient jc = new JobClient(job);
    jc.submitJob(job);
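    // Note: submitJob() returns as soon as the job is queued; use the
    // blocking JobClient.runJob(job) instead to wait for completion and
    // watch progress on the console.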
  }
}
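
For reference, the imports omitted above are essentially these; everything
comes from the old org.apache.hadoop.mapred API plus java.io and java.util:

import java.io.IOException;
import java.util.Iterator;
import java.util.Random;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

Packaged into a jar, the job is then typically launched with something like
bin/hadoop jar <your-jar> Randomize <input dir> <output dir>.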

Re: Randomize your input during huge imports

Posted by stack <st...@duboce.net>.
Should we add this to the examples, Ryan?  Or do you think it's general enough
that we should add it under src and expose it as an option when you do:

./bin/hadoop jar hbase.jar

Currently the only option when you do the above is rowcounter.

Is Ryan's code generic enough that we can add a 'randomizer' option to the above?

St.Ack
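
Hooking a 'randomizer' entry into the driver behind that command would likely
look much like the existing rowcounter hookup.  The sketch below is only
illustrative: it assumes the jar's driver class uses Hadoop's ProgramDriver to
map option names to main classes (the usual pattern for this kind of
multi-program jar), and the option name and description string are made up.

import org.apache.hadoop.util.ProgramDriver;

public class Driver {
  public static void main(String[] args) throws Throwable {
    ProgramDriver pgd = new ProgramDriver();
    // ... the existing rowcounter registration stays as it is ...
    // Hypothetical new entry pointing at Ryan's Randomize class:
    pgd.addClass("randomizer", Randomize.class,
        "Randomize the line order of a text input before a big import");
    pgd.driver(args);
  }
}

With an entry like that, ./bin/hadoop jar hbase.jar randomizer <input> <output>
should dispatch into Randomize.main() with the remaining arguments.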
