Posted to mapreduce-user@hadoop.apache.org by Aaron Kimball <aa...@cloudera.com> on 2010/02/15 23:07:55 UTC

Re: a question on WordCount program failure

In your reducer method signature:

 public void reduce(Text key, Iterable<IntWritable> values,
                    org.apache.hadoop.mapreduce.Mapper.Context context)


... why is this receiving a Mapper.Context? This means that it doesn't
actually override the default reduce() method, and the default reduce()
method is an identity reducer. Try adding an '@Override' annotation, and
you'll see that this suddenly won't compile (or will at least give you a
warning).
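
For example, a quick sketch of just the signatures (bodies elided, but the
mechanics are the same as in your class):

 // Compiles, but only as an unrelated overload: the framework dispatches to
 // reduce(Text, Iterable<IntWritable>, Reducer.Context), so it never calls this.
 public void reduce(Text key, Iterable<IntWritable> values,
                    org.apache.hadoop.mapreduce.Mapper.Context context) { /* ... */ }

 // With the annotation, javac rejects the mismatched signature outright:
 @Override
 public void reduce(Text key, Iterable<IntWritable> values,
                    org.apache.hadoop.mapreduce.Mapper.Context context) { /* ... */ }
 // error: method does not override or implement a method from a supertype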

That parameter should be a Reducer.Context. (Note that since
WordCountMapper extends Mapper, and WordCountReducer extends Reducer, you
should just be able to write "Context" in both of those places and that'll
be that.)

e.g.
public void map(LongWritable key, Text value, Context context)
    throws IOException, java.lang.InterruptedException {

and
public void reduce(Text key, Iterable<IntWritable> values, Context context)
    throws IOException, java.lang.InterruptedException {

I also noticed that the job configuration step doesn't actually set the
number of reduce tasks. It might be that you haven't configured it to run
the reducer in the first place. What happens if you add
'job.setNumReduceTasks(10)' to your main() method?
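
i.e., something like this in main(), next to the other job.set* calls (the
10 here is just an example value, not a recommendation):

   job.setReducerClass(WordCountReducer.class);
   job.setNumReduceTasks(10);   // explicitly request 10 reduce tasks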

- Aaron

On Mon, Feb 15, 2010 at 12:27 AM, Taylor, Ronald C <ro...@pnl.gov> wrote:

>
> Hello,
>
> I just joined the list and have a newbie question. Operating on a 10-node
> Linux cluster running Hadoop 0.20.1, I've been trying out the WordCount
> program.
>
> I have three files: WordCount.java, WordCountMapper.java, and
> WordCountReducer.java. The contents of those three files are listed in full
> at bottom.
>
> Compilation, jarring, and invocation appear to work fine when done as
> follows:
>
> javac WordCountMapper.java
> javac WordCountReducer.java
> javac WordCount.java
>
> jar cf jarredWordCount.jar WordCountMapper.class WordCountReducer.class
> WordCount.class
>
> Invocation:
> hadoop jar jarredWordCount.jar WordCount
> "/user/rtaylor/WordCountInputDirectory" "/user/rtaylor/OutputDirectory"
>
> %%%
>
> However, the results are not what I expect. Here is a partial listing from
> one of the output files:
>
> artillery       1
> barged  1
> call    1
> coalition       1
> coalition       1
> demonstrated    1
> get     1
> has     1
> has     1
>
> I was expecting, for example, to get one line for "coalition", like so:
>
> coalition 2
>
> Instead I get the two (non-summed) lines that you see above.
>
> I've tried several changes, with no effect. I still get the same (wrong)
> output with no word summation. This is driving me nuts, especially since I
> presume that I am making a simple mistake that somebody should be able to
> spot easily. So - please help!
>
>   - Ron Taylor
> ___________________________________________
> Ronald Taylor, Ph.D.
> Computational Biology & Bioinformatics Group Pacific Northwest National
> Laboratory
> 902 Battelle Boulevard
> P.O. Box 999, Mail Stop J4-33
> Richland, WA  99352 USA
> Office:  509-372-6568
> Email: ronald.taylor@pnl.gov
>
> %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
>
> contents of WordCount.java:
>
> import java.io.*;
> import java.util.*;
> import org.apache.hadoop.fs.Path;
> import org.apache.hadoop.conf.*;
> import org.apache.hadoop.io.*;
> import org.apache.hadoop.mapreduce.*;
> import org.apache.hadoop.util.*;
> import org.apache.hadoop.mapreduce.lib.input.*;
> import org.apache.hadoop.mapreduce.lib.output.*;
>
> public class WordCount {
>
>    public static void main(String[] args)
>        throws java.io.IOException,
>               java.lang.InterruptedException,
>               java.lang.ClassNotFoundException {
>
>    org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
>
>    String[] otherArgs = new org.apache.hadoop.util.GenericOptionsParser(conf, args).getRemainingArgs();
>       if (otherArgs.length != 2) {
>             System.err.println("Error in parameter inputs - Usage: WordCount <in> <out>");
>             System.exit(2);
>       }
>    String inputDirectory   = otherArgs[0];
>    String outputDirectory  = otherArgs[1];
>
>    Job job = new Job(conf, "WordCount");
>    job.setJarByClass(WordCount.class);
>    job.setMapperClass(WordCountMapper.class);
>    job.setCombinerClass(WordCountReducer.class);
>    job.setReducerClass(WordCountReducer.class);
>    job.setOutputKeyClass(Text.class);
>    job.setOutputValueClass(IntWritable.class);
>    FileInputFormat.addInputPath(job, new Path(inputDirectory));
>    FileOutputFormat.setOutputPath(job, new Path(outputDirectory));
>    System.exit(job.waitForCompletion(true) ? 0 : 1);
>  }
>
> }
>
> %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
>
> contents of WordCountMapper.java:
>
> import java.io.*;
> import java.util.*;
> import org.apache.hadoop.fs.Path;
> import org.apache.hadoop.conf.*;
> import org.apache.hadoop.io.*;
> import org.apache.hadoop.mapreduce.*;
> import org.apache.hadoop.util.*;
> import org.apache.hadoop.mapreduce.lib.input.*;
> import org.apache.hadoop.mapreduce.lib.output.*;
>
> public class WordCountMapper extends org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, Text, IntWritable> {
>  private final IntWritable one = new IntWritable(1);
>  private Text word = new Text();
>
>    public void map(LongWritable key, Text value, org.apache.hadoop.mapreduce.Mapper.Context context)
>                        throws IOException, java.lang.InterruptedException {
>    String line = value.toString();
>    StringTokenizer itr = new StringTokenizer(line.toLowerCase());
>    while(itr.hasMoreTokens()) {
>      word.set(itr.nextToken());
>      context.write(word, one);
>    }
>  }
> }
>
> %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
>
> contents of WordCountReducer.java:
>
> import java.io.*;
> import java.util.*;
> import org.apache.hadoop.fs.Path;
> import org.apache.hadoop.conf.*;
> import org.apache.hadoop.io.*;
> import org.apache.hadoop.mapreduce.*;
> import org.apache.hadoop.util.*;
> import org.apache.hadoop.mapreduce.lib.input.*;
> import org.apache.hadoop.mapreduce.lib.output.*;
>
> public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
>    private IntWritable result = new IntWritable();
>
>  public void reduce(Text key, Iterable<IntWritable> values,
>                     org.apache.hadoop.mapreduce.Mapper.Context context)
>                        throws IOException, java.lang.InterruptedException {
>    int sum = 0;
>    for (IntWritable val : values) {
>      int value = val.get();
>      sum += value;
>    }
>    result.set(sum);
>    context.write(key, result);
>  }
> }
>
> %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
>

RE: a question on WordCount program failure

Posted by "Taylor, Ronald C" <ro...@pnl.gov>.
As a reference for other new users of the Context-based API that comes with Hadoop 0.20, I should clarify one thing that may not have been clear from my earlier thank-you to Aaron:

I found that using
   public void map(LongWritable key, Text value, org.apache.hadoop.mapreduce.Mapper.Context context)
in the Mapper class

and using
    public void reduce(Text key, Iterable<IntWritable> values, org.apache.hadoop.mapreduce.Reducer.Context context)
in the Reducer class

does NOT work. The WordCount program that I tried only works if the methods are declared like so:

  public void map(LongWritable key, Text value, Context context)
in the Mapper class

and
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
in the Reducer class

That surprised me a bit, given how I interpreted Aaron's email, but there you go.
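
My best guess at why (treat this as my own reading, not gospel): Context is an inner class of the generic Reducer class, so inside WordCountReducer the bare name "Context" resolves to Reducer<Text, IntWritable, Text, IntWritable>.Context, which is exactly the parameter type of the inherited reduce() method. The fully qualified org.apache.hadoop.mapreduce.Reducer.Context is the raw form of that inner class, so a reduce() declared with it does not override the inherited method - depending on the compiler it is either rejected as a name clash or silently treated as an unused overload. A minimal sketch of the form that worked for me:

  public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override   // the compiler now verifies this really overrides reduce()
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      // sum the counts for this word, as in the full listing in my original post
    }
  }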

 - Ron



RE: a question on WordCount program failure

Posted by "Taylor, Ronald C" <ro...@pnl.gov>.
Aaron,

I made the changes that you suggested and the program now runs fine. So - I'm off and running. Thanks very much!
 Ron
