Posted to common-user@hadoop.apache.org by Tri Doan <tr...@k-state.edu> on 2010/11/25 22:31:08 UTC

ask problem

Thursday, 25 Nov 2010
Hi

I would like to write a program that counts the frequency of each word in a collection of text files. First, I output every word in a document, and I also count the number of words in that document; that count is emitted at the end of the map under a blank key (""). I expect the combiner function to sum all of the pairs with the blank key, giving the total number of words, so the last step in the reduce function will have the total word count, which can be used to calculate each word's frequency in the collection. Here is my program:

import java.io.IOException;
import java.util.*;
        
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
        
public class thu {
        
 public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text> {
    private Text word = new Text();
    private Text id = new Text();
    public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
        String line = value.toString();
        int word_count = 0;
        StringTokenizer tokenizer = new StringTokenizer(line);
        while (tokenizer.hasMoreTokens()) {
            word.set(tokenizer.nextToken());
            value.set(Integer.toString(1));
            word_count++;
            output.collect(word, value);
        }
        // emit the per-document word count under a single-space key
        id.set(" ");
        value.set(Integer.toString(word_count));
        System.out.println(key.toString() + " has number of words " + word_count);
        output.collect(id, value);
    }
 } 
        
 public static class Reduce extends MapReduceBase implements Reducer<Text, Text, Text, Text> {

    public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
        double total_word_count = 0;
        Text value = new Text();

        if (key.toString() == " ") {
            total_word_count = Double.valueOf(values.next().toString());
            System.out.println(key.toString() + " has total " + total_word_count);
        }
        int word_count = 0;
        while (values.hasNext()) {
            word_count += Double.valueOf(values.next().toString());
        }
        value.set(Double.toString(word_count));
        System.out.println(key.toString() + " has " + Double.toString(word_count));
        output.collect(key, value);
    }
 }
 
 public static class Combiner extends MapReduceBase implements Reducer<Text, Text, Text, Text> {

    public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
        int partial_sum = 0;
        Text value = new Text();

        while (values.hasNext()) {
            partial_sum += Double.valueOf(values.next().toString());
        }
        value.set(Double.toString(partial_sum));
        System.out.println(key.toString() + " has " + Double.toString(partial_sum));
        output.collect(key, value);
    }
 }
 
 public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(thu.class);
    conf.setJobName("thu");
        
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
        
    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);
        
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
        
    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));
    FileSystem.get(conf).delete(new Path(args[1]), true);
    
    JobClient.runJob(conf);
 }
        
}

However, I observe that the combiner function does not seem to work.
Could you tell me what is wrong?

Tri Doan
1429 Laramie Apt 3, Manhattan
KS 66502
USA


Re: ask problem

Posted by Senthil <se...@gmail.com>.
Should the line
   conf.setCombinerClass(Reduce.class);
be like
   conf.setCombinerClass(Combiner.class);
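
With that change, the driver section would read as follows (a minimal sketch,
assuming the Combiner class from the original post; untested):

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Combiner.class);   // was: conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

Also note that Hadoop treats the combiner as an optional optimization: it may
run zero, one, or several times per key, so a job cannot rely on the combiner
having merged all of the blank-key counts into a single value before the
reducer runs.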

-Senthil

On Fri, Nov 26, 2010 at 3:01 AM, Tri Doan <tr...@k-state.edu> wrote:
> [...]
>    conf.setMapperClass(Map.class);
>    conf.setCombinerClass(Reduce.class);
>    conf.setReducerClass(Reduce.class);
> [...]
> However, I observe that the combiner function does not seem to work.
> Could you tell me what is wrong?



-- 
Shanmugam Senthil

Re: ask problem

Posted by maha <ma...@umail.ucsb.edu>.
A much easier way is to use the open-source wordcount.java example and give it an input directory containing all the text files. It will output one text file containing all the words and their frequencies from all the files.

Maha
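
For reference, here is a minimal sketch of that word-count pattern, written
against the same old mapred API and imports as the original post; the class
names are illustrative, not the exact shipped example:

 public static class WordCountMap extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
        // emit (word, 1) for every token in the line
        StringTokenizer tokenizer = new StringTokenizer(value.toString());
        while (tokenizer.hasMoreTokens()) {
            word.set(tokenizer.nextToken());
            output.collect(word, one);
        }
    }
 }

 public static class WordCountReduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
    public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
        // sum the 1s emitted for this word
        int sum = 0;
        while (values.hasNext()) {
            sum += values.next().get();
        }
        output.collect(key, new IntWritable(sum));
    }
 }

Because this reduce is just a sum, the same class can safely double as the
combiner (conf.setCombinerClass(WordCountReduce.class)), which is the usual
WordCount configuration.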

On Nov 25, 2010, at 1:31 PM, Tri Doan wrote:

> I would like to write a program that counts the frequency of each word in a
> collection of text files. [...]
> However, I observe that the combiner function does not seem to work.
> Could you tell me what is wrong?