Posted to user@hbase.apache.org by Alex Newman <po...@gmail.com> on 2008/07/03 05:19:38 UTC

SampleUploader for the trunk hbase

// This is very close to the example already in the javadoc, updated to use
// (Bytes, BatchUpdate) instead of (Text, MapWritable); I find it to be the
// easiest way to get people started/motivated with HBase. The job reads
// space-delimited lines of the form <row> <value> and writes each value to
// the hardcoded 'count:' column of the named table.


package org.apache.hadoop.hbase.mapred;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.io.BatchUpdate;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class SampleUploader extends MapReduceBase
    implements Mapper<LongWritable, Text, ImmutableBytesWritable, BatchUpdate>, Tool {
  private static final String NAME = "SampleUploader";
  private Configuration conf;

  public JobConf createSubmittableJob(String[] args) {
    JobConf c = new JobConf(getConf(), SampleUploader.class);
    c.setJobName(NAME);
    c.setInputPath(new Path(args[0]));
    c.setMapperClass(this.getClass());
    c.setMapOutputKeyClass(ImmutableBytesWritable.class);
    c.setMapOutputValueClass(BatchUpdate.class);
    c.setReducerClass(TableUploader.class);
    TableReduce.initJob(args[1], TableUploader.class, c);
    return c;
  }

  public void map(LongWritable k, Text v,
    OutputCollector<ImmutableBytesWritable, BatchUpdate> output, Reporter r)
  throws IOException {
    // Lines are space-delimited; the first item is the row key and the
    // second is the value to store under the hardcoded 'count:' column.
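    // For example, the line "row1 42" stores the bytes of "42" under
    // 'count:' for row 'row1'.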
    String tmp = v.toString();
    if (tmp.length() == 0) {
      return;
    }
    // Reuse tmp rather than calling v.toString() a second time.
    String [] splits = tmp.split(" ");
    if (splits.length < 2) {
      // Skip malformed lines instead of throwing ArrayIndexOutOfBoundsException.
      return;
    }
    String row = splits[0];
    BatchUpdate bu = new BatchUpdate(row);
    bu.put("count:", Bytes.toBytes(splits[1]));
    r.setStatus("Map emitting " + row + " for record " + k.toString());
    // Use Bytes.toBytes rather than String.getBytes() so the row key does not
    // depend on the platform default charset.
    output.collect(new ImmutableBytesWritable(Bytes.toBytes(row)), bu);
  }

  public static class TableUploader
  extends TableReduce<ImmutableBytesWritable, BatchUpdate> {

    @Override
    public void reduce(ImmutableBytesWritable key, Iterator<BatchUpdate> values,
        OutputCollector<ImmutableBytesWritable, BatchUpdate> output,
        @SuppressWarnings("unused") Reporter reporter)
    throws IOException {
      // Pass each BatchUpdate through to the table unchanged.
      while (values.hasNext()) {
        output.collect(key, values.next());
      }
    }
  }


  static int printUsage() {
    System.out.println(NAME + " <input> <table_name>");
    return -1;
  }

  public int run(String[] args) throws Exception {
    // Make sure there are exactly 2 parameters left.
    if (args.length != 2) {
      System.out.println("ERROR: Wrong number of parameters: " +
        args.length + " instead of 2.");
      return printUsage();
    }
    JobClient.runJob(createSubmittableJob(args));
    return 0;
  }
  System.out.println("ERROR: Wrong number of parameters: " +
        args.length + " instead of 2.");
      return printUsage();
    }
    JobClient.runJob(createSubmittableJob(args));
    return 0;
  }

  public Configuration getConf() {
    return this.conf;
  }

  public void setConf(final Configuration c) {
    this.conf = c;
  }

  public static void main(String[] args) throws Exception {
    int errCode = ToolRunner.run(new Configuration(), new SampleUploader(),
      args);
    System.exit(errCode);
  }
}
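
For reference, the target table has to exist (with a 'count' family) before the
job runs. Below is a minimal, untested sketch of the setup against the 0.2-era
client API; the class name CreateSampleTable and the table name "mytable" are
illustrative assumptions, not part of the original post.

import java.io.IOException;

import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;

public class CreateSampleTable {
  public static void main(String[] args) throws IOException {
    // Create the target table with the 'count:' family the uploader writes to.
    // Table name "mytable" is an assumption for illustration.
    HBaseAdmin admin = new HBaseAdmin(new HBaseConfiguration());
    HTableDescriptor desc = new HTableDescriptor("mytable");
    desc.addFamily(new HColumnDescriptor("count:"));
    admin.createTable(desc);
  }
}

The uploader itself is then launched in the usual Tool fashion, something like
(jar name assumed): hadoop jar hbase-<version>.jar
org.apache.hadoop.hbase.mapred.SampleUploader <input> mytable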

Re: SampleUploader for the trunk hbase

Posted by stack <st...@duboce.net>.
We'd welcome the contrib.  If sufficiently 'generic', we should just 
check it in under our examples directory.
St.Ack



Re: SampleUploader for the trunk hbase

Posted by Dan Zinngrabe <da...@mahalo.com>.
It would just need a little cleanup and it would be ready to go. We made it
generic enough that it should work for just about anyone, with a few (minor)
caveats. We used it to transition a fairly flat, large MySQL table into
HBase, as well as for production backups and restores.



-- 
Dan Zinngrabe
Alchemist -- Mahalo.com
http://www.mahalo.com/member/quellish
dan@mahalo.com

Re: SampleUploader for the trunk hbase

Posted by stack <st...@duboce.net>.
Sounds good.  Would it make sense for you to post your code, or is it too
particular to your setup?
St.Ack



Re: SampleUploader for the trunk hbase

Posted by Dan Zinngrabe <da...@mahalo.com>.
We have something very similar, but a little more flexible, in use in
production. Along with it is a simple exporter that outputs HBase data in
the same format.
It's not quite HBASE-50
<https://issues.apache.org/jira/browse/HBASE-50?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12580996#action_12580996>
but it does work pretty well so far for backing up even large tables.



-- 
Dan Zinngrabe
Alchemist -- Mahalo.com
http://www.mahalo.com/member/quellish
dan@mahalo.com

Re: SampleUploader for the trunk hbase

Posted by stack <st...@duboce.net>.
Thanks Alex.  Looks great.  You want me to add it to the wiki?  (Or you 
could do it yourself).

Yours does something slightly different it seems; you hardcode the 
column name and do a count of splits[1].  You might add to the class 
comment a description of what your MR job does.

St.Ack
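
For anyone who wants the more generic behaviour stack describes (the column
name taken from the input rather than hardcoded), here is a minimal sketch of
an alternative map method. It assumes the same class and imports as Alex's
post; the three-field input format (row, column, value per line) is itself an
assumption.

  public void map(LongWritable k, Text v,
      OutputCollector<ImmutableBytesWritable, BatchUpdate> output, Reporter r)
  throws IOException {
    // Assumed input format: <row> <family:qualifier> <value>, space-delimited.
    String [] splits = v.toString().split(" ");
    if (splits.length != 3) {
      return; // skip blank or malformed lines
    }
    BatchUpdate bu = new BatchUpdate(splits[0]);
    // The column name comes from the input instead of being hardcoded.
    bu.put(splits[1], Bytes.toBytes(splits[2]));
    output.collect(new ImmutableBytesWritable(Bytes.toBytes(splits[0])), bu);
  }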

