Posted to mapreduce-user@hadoop.apache.org by Ranjini Rathinam <ra...@gmail.com> on 2014/03/19 13:50:07 UTC

Need FileName with Content

Hi,

I have folder named INPUT.

Inside INPUT there are 5 resumes.

hduser@localhost:~/Ranjini$ hadoop fs -ls /user/hduser/INPUT
Found 5 items
-rw-r--r--   1 hduser supergroup       5438 2014-03-18 15:20
/user/hduser/INPUT/Rakesh Chowdary_Microstrategy.txt
-rw-r--r--   1 hduser supergroup       6022 2014-03-18 15:22
/user/hduser/INPUT/Ramarao Devineni_Microstrategy.txt
-rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
/user/hduser/INPUT/vinitha.txt
-rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
/user/hduser/INPUT/sony.txt
-rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
/user/hduser/INPUT/ravi.txt
hduser@localhost:~/Ranjini$

I have to process the folder and its contents.

I need output as:

filename   word   occurrence
vinitha       java       4
sony          oracle      3



But I am not getting the filename. Because the input file contents are
merged, the file name does not come out correctly.


Please help me fix this issue. I have given my code below.


import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;

 public class WordCount {
    public static class Map extends MapReduceBase implements
Mapper<LongWritable, Text, Text, IntWritable> {
     private final static IntWritable one = new IntWritable(1);
      private Text word = new Text();
      public void map(LongWritable key, Text value, OutputCollector<Text,
IntWritable> output, Reporter reporter) throws IOException {
   FSDataInputStream fs=null;
   FileSystem hdfs = null;
   String line = value.toString();
         int i=0,k=0;
  try{
   Configuration configuration = new Configuration();
      configuration.set("fs.default.name", "hdfs://localhost:4440/");

   Path srcPath = new Path("/user/hduser/INPUT/");

   hdfs = FileSystem.get(configuration);
   FileStatus[] status = hdfs.listStatus(srcPath);
   fs=hdfs.open(srcPath);
   BufferedReader br=new BufferedReader(new
InputStreamReader(hdfs.open(srcPath)));
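   // Note: the streams opened above are never read from below; this mapper
   // only processes the current input line passed in through `value`.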

String[] splited = line.split("\\s+");
    for( i=0;i<splited.length;i++)
 {
     String sp[]=splited[i].split(",");
     for( k=0;k<sp.length;k++)
 {

   if(!sp[k].isEmpty()){
StringTokenizer tokenizer = new StringTokenizer(sp[k]);
if((sp[k].equalsIgnoreCase("C"))){
        while (tokenizer.hasMoreTokens()) {
          word.set(tokenizer.nextToken());
          output.collect(word, one);
        }
}
if((sp[k].equalsIgnoreCase("JAVA"))){
        while (tokenizer.hasMoreTokens()) {
          word.set(tokenizer.nextToken());
          output.collect(word, one);
        }
}
      }
    }
}
 } catch (IOException e) {
    e.printStackTrace();
 }
}
}
    public static class Reduce extends MapReduceBase implements
Reducer<Text, IntWritable, Text, IntWritable> {
      public void reduce(Text key, Iterator<IntWritable> values,
OutputCollector<Text, IntWritable> output, Reporter reporter) throws
IOException {
        int sum = 0;
        while (values.hasNext()) {
          sum += values.next().get();
        }
        output.collect(key, new IntWritable(sum));
      }
    }
    public static void main(String[] args) throws Exception {


      JobConf conf = new JobConf(WordCount.class);
      conf.setJobName("wordcount");
      conf.setOutputKeyClass(Text.class);
      conf.setOutputValueClass(IntWritable.class);
      conf.setMapperClass(Map.class);
      conf.setCombinerClass(Reduce.class);
      conf.setReducerClass(Reduce.class);
      conf.setInputFormat(TextInputFormat.class);
      conf.setOutputFormat(TextOutputFormat.class);
      FileInputFormat.setInputPaths(conf, new Path(args[0]));
      FileOutputFormat.setOutputPath(conf, new Path(args[1]));
      JobClient.runJob(conf);
    }
 }
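
For reference, the job is launched with the input and output paths as
arguments (the jar name here is illustrative):

hadoop jar wordcount.jar WordCount /user/hduser/INPUT /user/hduser/OUTPUT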



Please help

Thanks in advance.

Ranjini

Re: Need FileName with Content

Posted by Felix Chern <id...@gmail.com>.
I've written two blog posts on how to get directory context in a Hadoop mapper.

http://www.idryman.org/blog/2014/01/26/capture-directory-context-in-hadoop-mapper/
http://www.idryman.org/blog/2014/01/27/capture-path-info-in-hadoop-inputformat-class/
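
In short, the mapper-side approach is to cast the InputSplit to a FileSplit
and read its path. A minimal sketch of that idea (the class name is made up
here; this assumes TextInputFormat, whose splits are FileSplits):

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class PathAwareMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
  @Override
  public void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    // Each split of a text file knows which file it came from.
    FileSplit split = (FileSplit) context.getInputSplit();
    Path file = split.getPath();
    String fileName = file.getName();             // e.g. "vinitha.txt"
    String dirName = file.getParent().getName();  // directory context, e.g. "INPUT"
    // ... build keys from fileName/dirName and the tokens of `value` ...
  }
}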

Cheers,
Felix

On Mar 19, 2014, at 10:50 PM, Ranjini Rathinam <ra...@gmail.com> wrote:

> Hi,
>  
> If we give the below code,
> =======================
> word.set("filename"+"    "+tokenizer.nextToken());
> output.collect(word,one);
> ======================
>  
> The output is wrong, because it shows:
>  
> filename   word   occurrence
> vinitha       java       4
> vinitha         oracle      3
> sony           java       4
> sony          oracle      3
>  
>  
> Here vinitha does not have the word oracle. Similarly, sony does not have the word java. The file name is being merged across all words.
>  
> I need the output as given below:
>  
> filename   word   occurrence
> 
> vinitha       java       4
> vinitha         C++    3
> sony           ETL     4
> sony          oracle      3
>  
>  
>  I need the fileName along with the words in that particular file only. No merging should happen.
>  
> Please help me out for this issue.
>  
> Please help.
>  
> Thanks in advance.
>  
> Ranjini
>  
>  
> 
>  
> On Thu, Mar 20, 2014 at 10:56 AM, Ranjini Rathinam <ra...@gmail.com> wrote:
> 
> 
> ---------- Forwarded message ----------
> From: Stanley Shi <ss...@gopivotal.com>
> Date: Thu, Mar 20, 2014 at 7:39 AM
> Subject: Re: Need FileName with Content
> To: user@hadoop.apache.org
> 
> 
> You want to do a word count for each file, but the code gives you a word count across all the files, right?
> 
> =====
> word.set(tokenizer.nextToken());
>           output.collect(word, one);
> ======
> change it to:
> word.set("filename"+"    "+tokenizer.nextToken());
> output.collect(word,one);
> 
> 
> 
> 
> Regards,
> Stanley Shi,
> 
> 
> 
> On Wed, Mar 19, 2014 at 8:50 PM, Ranjini Rathinam <ra...@gmail.com> wrote:
> Hi,
> 
> I have folder named INPUT.
> 
> Inside INPUT there are 5 resumes.
> 
> hduser@localhost:~/Ranjini$ hadoop fs -ls /user/hduser/INPUT
> Found 5 items
> -rw-r--r--   1 hduser supergroup       5438 2014-03-18 15:20 /user/hduser/INPUT/Rakesh Chowdary_Microstrategy.txt
> -rw-r--r--   1 hduser supergroup       6022 2014-03-18 15:22 /user/hduser/INPUT/Ramarao Devineni_Microstrategy.txt
> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21 /user/hduser/INPUT/vinitha.txt
> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21 /user/hduser/INPUT/sony.txt
> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21 /user/hduser/INPUT/ravi.txt
> hduser@localhost:~/Ranjini$ 
> 
> I have to process the folder and its contents.
> 
> I need output as:
> 
> filename   word   occurrence
> vinitha       java       4
> sony          oracle      3
> 
> 
> 
> But I am not getting the filename. Because the input file contents are merged, the file name does not come out correctly.
> 
> 
> Please help me fix this issue. I have given my code below.
>  
>  
>  import java.io.IOException;
>  import java.util.*;
>  import org.apache.hadoop.fs.Path;
>  import org.apache.hadoop.conf.*;
>  import org.apache.hadoop.io.*;
>  import org.apache.hadoop.mapred.*;
>  import org.apache.hadoop.util.*;
> import java.io.File;
> import java.io.FileReader;
> import java.io.FileWriter;
> import java.io.IOException;
> import org.apache.hadoop.fs.Path;
> import org.apache.hadoop.conf.Configuration;
> import org.apache.hadoop.fs.FileSystem;
> import org.apache.hadoop.fs.FileStatus;
> import org.apache.hadoop.conf.*;
> import org.apache.hadoop.io.*;
> import org.apache.hadoop.mapred.*;
> import org.apache.hadoop.util.*;
> import org.apache.hadoop.mapred.lib.*;
> 
>  public class WordCount {
>     public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
>      private final static IntWritable one = new IntWritable(1);
>       private Text word = new Text();
>       public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
>    FSDataInputStream fs=null;
>    FileSystem hdfs = null;
>    String line = value.toString();
>          int i=0,k=0;
>   try{
>    Configuration configuration = new Configuration();
>       configuration.set("fs.default.name", "hdfs://localhost:4440/");
>    
>    Path srcPath = new Path("/user/hduser/INPUT/");
>    
>    hdfs = FileSystem.get(configuration);
>    FileStatus[] status = hdfs.listStatus(srcPath);
>    fs=hdfs.open(srcPath);
>    BufferedReader br=new BufferedReader(new InputStreamReader(hdfs.open(srcPath)));
>    
> String[] splited = line.split("\\s+");
>     for( i=0;i<splited.length;i++)
>  {
>      String sp[]=splited[i].split(",");
>      for( k=0;k<sp.length;k++)
>  {
>      
>    if(!sp[k].isEmpty()){
> StringTokenizer tokenizer = new StringTokenizer(sp[k]);
> if((sp[k].equalsIgnoreCase("C"))){
>         while (tokenizer.hasMoreTokens()) {
>           word.set(tokenizer.nextToken());
>           output.collect(word, one);
>         }
> }
> if((sp[k].equalsIgnoreCase("JAVA"))){
>         while (tokenizer.hasMoreTokens()) {
>           word.set(tokenizer.nextToken());
>           output.collect(word, one);
>         }
> }
>       }
>     }
> }
>  } catch (IOException e) {
>     e.printStackTrace();
>  } 
> }
> }
>     public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
>       public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
>         int sum = 0;
>         while (values.hasNext()) {
>           sum += values.next().get();
>         }
>         output.collect(key, new IntWritable(sum));
>       }
>     }
>     public static void main(String[] args) throws Exception {
>  
>  
>       JobConf conf = new JobConf(WordCount.class);
>       conf.setJobName("wordcount");
>       conf.setOutputKeyClass(Text.class);
>       conf.setOutputValueClass(IntWritable.class);
>       conf.setMapperClass(Map.class);
>       conf.setCombinerClass(Reduce.class);
>       conf.setReducerClass(Reduce.class);
>       conf.setInputFormat(TextInputFormat.class);
>       conf.setOutputFormat(TextOutputFormat.class);
>       FileInputFormat.setInputPaths(conf, new Path(args[0]));
>       FileOutputFormat.setOutputPath(conf, new Path(args[1]));
>       JobClient.runJob(conf);
>     }
>  }
>  
> Please help
>  
> Thanks in advance.
>  
> Ranjini
> 
> 
> 
> 
> 


Re: Need FileName with Content

Posted by Shahab Yunus <sh...@gmail.com>.
If this parameter is at the job level (i.e. for the whole run), then you
can set this value in the Configuration object to pass it on to the
mappers.
http://www.thecloudavenue.com/2011/11/passing-parameters-to-mappers-and.html
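
For example, a minimal sketch of that pattern (the property name
"wordcount.keywords" is made up for illustration):

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;

public class KeywordCount {
  public static class KeywordMapper
      extends Mapper<LongWritable, Text, Text, IntWritable> {
    private Set<String> keywords;

    @Override
    protected void setup(Context context) {
      // Read the job-level parameter back; falls back to "C,JAVA" if unset.
      String csv = context.getConfiguration().get("wordcount.keywords", "C,JAVA");
      keywords = new HashSet<String>(Arrays.asList(csv.split(",")));
    }
    // ... in map(), test keywords.contains(sp[k].toUpperCase()) instead of
    // the hard-coded equalsIgnoreCase("C") / equalsIgnoreCase("JAVA") checks ...
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // e.g. take the keyword list from an extra command-line argument
    conf.set("wordcount.keywords", args.length > 2 ? args[2] : "C,JAVA");
    Job job = new Job(conf, "keywordcount");
    // ... set mapper/reducer/input/output as in the code above, then submit ...
  }
}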

Regards,
Shahab


On Fri, Mar 21, 2014 at 7:08 AM, Ranjini Rathinam <ra...@gmail.com>wrote:

> Hi,
>
> Thanks for the great support; I have fixed the issue. I have now got
> the output.
>
> But I have one query: is it possible to give a runtime argument to the mapper class,
>
> like,
>
> giving the values C,JAVA at runtime.
>
>
>
> * if((sp[k].equalsIgnoreCase("C"))){*
>                                     while (itr.hasMoreTokens()) {
>                                            word.set(pp.getName() + " " +
> itr.nextToken());
>
>                                         context.write(word, one);
>                                         }
>                                     }
>
> *    if((sp[k].equalsIgnoreCase("JAVA"))){*
>                                          while (itr.hasMoreTokens()) {
>                                            word.set(pp.getName() + " " +
> itr.nextToken());
>
>                                         context.write(word, one);
>
>  Thanks a lot .
>
> Ranjini
>
>
>
> On Fri, Mar 21, 2014 at 11:45 AM, Ranjini Rathinam <ranjinibecse@gmail.com
> > wrote:
>
>> Hi,
>>
>>
>> Thanks a lot for the great support. I am just learning hadoop and
>> mapreduce.
>>
>> I have used the way you have guided me.
>>
>> But the output is coming out without aggregating:
>>
>> vinitha.txt C    1
>> vinitha.txt Java    1
>> vinitha.txt Java    1
>> vinitha.txt Java    1
>> vinitha.txt Java    1
>>
>>
>> *I need the output as:*
>>
>>  *vinitha       C    1*
>>
>> *vinitha      Java  4*
>>
>>
>> I have a reduce class but am still not able to fix it; I am still trying.
>>
>> I have given my code below. Please let me know where I have gone wrong.
>>
>>
>> my code
>>
>>
>> import org.apache.hadoop.conf.Configuration;
>> import org.apache.hadoop.fs.Path;
>> import org.apache.hadoop.io.LongWritable;
>> import org.apache.hadoop.io.*;
>> import org.apache.hadoop.io.Text;
>> import org.apache.hadoop.mapreduce.InputSplit;
>> import org.apache.hadoop.mapreduce.lib.input.FileSplit;
>> import org.apache.hadoop.mapreduce.*;
>> import org.apache.hadoop.mapreduce.Job;
>> import org.apache.hadoop.mapreduce.Mapper;
>> import org.apache.hadoop.mapreduce.Reducer;
>> import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
>> import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
>> import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
>> import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
>>
>> import java.io.IOException;
>> import org.apache.hadoop.fs.Path;
>> import org.apache.hadoop.conf.Configuration;
>> import org.apache.hadoop.fs.FileSystem;
>> import org.apache.hadoop.fs.FileStatus;
>> import java.util.*;
>> import java.util.logging.Level;
>> import java.util.logging.Logger;
>>
>>  public class FileCount {
>>     public static class TokenizerMapper extends Mapper<LongWritable,
>> Text, Text, IntWritable> {
>>
>>
>>     private final static IntWritable one = new IntWritable(1);
>>
>>     private Text word = new Text();
>>
>>
>>     public void map(LongWritable key, Text value, Context context) throws
>> IOException, InterruptedException {
>>
>>             FileSplit fileSplit;
>>               InputSplit is = context.getInputSplit();
>>               FileSystem fs = FileSystem.get(context.getConfiguration());
>>               fileSplit = (FileSplit) is;
>>               Path pp = fileSplit.getPath();
>>                     String line=value.toString();
>>                     int i=0;int k=0;
>>                     //Path pp = ((FileSplit)
>> context.getInputSplit()).getPath();
>>
>>                     String[] splited = line.split("\\s+");
>>                         for( i=0;i<splited.length;i++)
>>                             {
>>                                  String sp[]=splited[i].split(",");
>>                          for( k=0;k<sp.length;k++)
>>                             {
>>
>>                                if(!sp[k].isEmpty())
>>                             {
>>
>>                                   StringTokenizer itr = new
>> StringTokenizer(sp[k]);
>>
>>                                   //log.info("map on string: " + new
>> String(value.getBytes()));
>>
>>                                 if((sp[k].equalsIgnoreCase("C"))){
>>                                     while (itr.hasMoreTokens()) {
>>                                            word.set(pp.getName() + " " +
>> itr.nextToken());
>>
>>                                         context.write(word, one);
>>                                         }
>>                                     }
>>                                 if((sp[k].equalsIgnoreCase("JAVA"))){
>>                                          while (itr.hasMoreTokens()) {
>>                                            word.set(pp.getName() + " " +
>> itr.nextToken());
>>
>>                                         context.write(word, one);
>>                                         }
>>                                 }
>>                              }
>>                             }
>>                         }
>>
>>           }
>>
>>   }
>>
>>   public static class Reduce extends Reducer<Text, IntWritable, Text,
>> IntWritable> {
>>
>>     public void reduce(Text key, Iterator<IntWritable> values, Context
>> context) throws IOException, InterruptedException {
>>
>>
>>         int sum = 0;
>>         while (values.hasNext()) {
>>           sum += values.next().get();
>>         }
>>        context.write(key, new IntWritable(sum));
>>
>>       }
>>     }
>>     public static void main(String[] args) throws Exception {
>>             Configuration conf = new Configuration();
>> Job job = new Job(conf, "jobName");
>>
>> String input="/user/hduser/INPUT/";
>> String output="/user/hduser/OUTPUT/";
>> FileInputFormat.setInputPaths(job, input);
>> job.setJarByClass(FileCount.class);
>> job.setMapperClass(TokenizerMapper.class);
>> job.setReducerClass(Reduce.class);
>> job.setCombinerClass(Reduce.class);
>> job.setInputFormatClass(TextInputFormat.class);
>> job.setOutputKeyClass(Text.class);
>> job.setOutputValueClass(IntWritable.class);
>> Path outPath = new Path(output);
>> FileOutputFormat.setOutputPath(job, outPath);
>> FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
>> if (dfs.exists(outPath)) {
>> dfs.delete(outPath, true);
>> }
>>
>>
>> try {
>>
>> job.waitForCompletion(true);
>>
>> } catch (InterruptedException ex) {
>> //Logger.getLogger(FileCOunt.class.getName()).log(Level.SEVERE, null, ex);
>> } catch (ClassNotFoundException ex) {
>> //Logger.getLogger(FileCount.class.getName()).log(Level.SEVERE, null, ex);
>> }
>>
>> }
>>
>> }
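>>
>> A likely cause of the missing aggregation, for comparison (assuming the
>> new org.apache.hadoop.mapreduce API used above): Reducer.reduce takes an
>> Iterable<IntWritable>, not an Iterator<IntWritable>. With the Iterator
>> signature the method does not override Reducer.reduce, so the default
>> identity reduce runs and every (file word, 1) pair is written out
>> unsummed. A minimal corrected sketch:
>>
>>   public static class Reduce
>>       extends Reducer<Text, IntWritable, Text, IntWritable> {
>>     @Override  // with Iterator instead of Iterable, this would not compile
>>     public void reduce(Text key, Iterable<IntWritable> values, Context context)
>>         throws IOException, InterruptedException {
>>       int sum = 0;
>>       // Sum the counts the framework groups under this key.
>>       for (IntWritable val : values) {
>>         sum += val.get();
>>       }
>>       context.write(key, new IntWritable(sum));
>>     }
>>   }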
>>
>>
>> Thanks in advance for the great help and support to fix the issue .
>>
>> Please help to fix it.
>>
>> Thanks a lot.
>>
>> Regards,
>> Ranjini
>>
>>
>>> Hi,
>>>
>>> I have folder named INPUT.
>>>
>>> Inside INPUT there are 5 resumes.
>>>
>>> hduser@localhost:~/Ranjini$ hadoop fs -ls /user/hduser/INPUT
>>> Found 5 items
>>> -rw-r--r--   1 hduser supergroup       5438 2014-03-18 15:20
>>> /user/hduser/INPUT/Rakesh Chowdary_Microstrategy.txt
>>> -rw-r--r--   1 hduser supergroup       6022 2014-03-18 15:22
>>> /user/hduser/INPUT/Ramarao Devineni_Microstrategy.txt
>>> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
>>> /user/hduser/INPUT/vinitha.txt
>>> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
>>> /user/hduser/INPUT/sony.txt
>>> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
>>> /user/hduser/INPUT/ravi.txt
>>> hduser@localhost:~/Ranjini$
>>>
>>> I have to process the folder and its contents.
>>>
>>> I need output as:
>>>
>>> filename   word   occurrence
>>> vinitha       java       4
>>> sony          oracle      3
>>>
>>>
>>>
>>> But I am not getting the filename. Because the input file contents are
>>> merged, the file name does not come out correctly.
>>>
>>>
>>> Please help me fix this issue. I have given my code below.
>>>
>>>
>>>  import java.io.IOException;
>>>  import java.util.*;
>>>  import org.apache.hadoop.fs.Path;
>>>  import org.apache.hadoop.conf.*;
>>>  import org.apache.hadoop.io.*;
>>>  import org.apache.hadoop.mapred.*;
>>>  import org.apache.hadoop.util.*;
>>> import java.io.File;
>>> import java.io.FileReader;
>>> import java.io.FileWriter;
>>> import java.io.IOException;
>>> import org.apache.hadoop.fs.Path;
>>> import org.apache.hadoop.conf.Configuration;
>>> import org.apache.hadoop.fs.FileSystem;
>>> import org.apache.hadoop.fs.FileStatus;
>>> import org.apache.hadoop.conf.*;
>>> import org.apache.hadoop.io.*;
>>> import org.apache.hadoop.mapred.*;
>>> import org.apache.hadoop.util.*;
>>> import org.apache.hadoop.mapred.lib.*;
>>>
>>>  public class WordCount {
>>>     public static class Map extends MapReduceBase implements
>>> Mapper<LongWritable, Text, Text, IntWritable> {
>>>      private final static IntWritable one = new IntWritable(1);
>>>       private Text word = new Text();
>>>       public void map(LongWritable key, Text value,
>>> OutputCollector<Text, IntWritable> output, Reporter reporter) throws
>>> IOException {
>>>    FSDataInputStream fs=null;
>>>    FileSystem hdfs = null;
>>>    String line = value.toString();
>>>          int i=0,k=0;
>>>   try{
>>>    Configuration configuration = new Configuration();
>>>       configuration.set("fs.default.name", "hdfs://localhost:4440/");
>>>
>>>    Path srcPath = new Path("/user/hduser/INPUT/");
>>>
>>>    hdfs = FileSystem.get(configuration);
>>>    FileStatus[] status = hdfs.listStatus(srcPath);
>>>    fs=hdfs.open(srcPath);
>>>    BufferedReader br=new BufferedReader(new
>>> InputStreamReader(hdfs.open(srcPath)));
>>>
>>> String[] splited = line.split("\\s+");
>>>     for( i=0;i<splited.length;i++)
>>>  {
>>>      String sp[]=splited[i].split(",");
>>>      for( k=0;k<sp.length;k++)
>>>  {
>>>
>>>    if(!sp[k].isEmpty()){
>>> StringTokenizer tokenizer = new StringTokenizer(sp[k]);
>>> if((sp[k].equalsIgnoreCase("C"))){
>>>         while (tokenizer.hasMoreTokens()) {
>>>           word.set(tokenizer.nextToken());
>>>           output.collect(word, one);
>>>         }
>>> }
>>> if((sp[k].equalsIgnoreCase("JAVA"))){
>>>         while (tokenizer.hasMoreTokens()) {
>>>           word.set(tokenizer.nextToken());
>>>           output.collect(word, one);
>>>         }
>>> }
>>>       }
>>>     }
>>> }
>>>  } catch (IOException e) {
>>>     e.printStackTrace();
>>>  }
>>> }
>>> }
>>>     public static class Reduce extends MapReduceBase implements
>>> Reducer<Text, IntWritable, Text, IntWritable> {
>>>       public void reduce(Text key, Iterator<IntWritable> values,
>>> OutputCollector<Text, IntWritable> output, Reporter reporter) throws
>>> IOException {
>>>         int sum = 0;
>>>         while (values.hasNext()) {
>>>           sum += values.next().get();
>>>         }
>>>         output.collect(key, new IntWritable(sum));
>>>       }
>>>     }
>>>     public static void main(String[] args) throws Exception {
>>>
>>>
>>>       JobConf conf = new JobConf(WordCount.class);
>>>       conf.setJobName("wordcount");
>>>       conf.setOutputKeyClass(Text.class);
>>>       conf.setOutputValueClass(IntWritable.class);
>>>       conf.setMapperClass(Map.class);
>>>       conf.setCombinerClass(Reduce.class);
>>>       conf.setReducerClass(Reduce.class);
>>>       conf.setInputFormat(TextInputFormat.class);
>>>       conf.setOutputFormat(TextOutputFormat.class);
>>>       FileInputFormat.setInputPaths(conf, new Path(args[0]));
>>>       FileOutputFormat.setOutputPath(conf, new Path(args[1]));
>>>       JobClient.runJob(conf);
>>>     }
>>>  }
>>>
>>>
>>>
>>> Please help
>>>
>>> Thanks in advance.
>>>
>>> Ranjini
>>>
>>>
>>>
>>> ----------
>>> From: *Stanley Shi* <ss...@gopivotal.com>
>>> Date: Thu, Mar 20, 2014 at 7:39 AM
>>> To: user@hadoop.apache.org
>>>
>>>
>>> You want to do a word count for each file, but the code gives you a word
>>> count across all the files, right?
>>>
>>> =====
>>>  word.set(tokenizer.nextToken());
>>>           output.collect(word, one);
>>> ======
>>> change it to:
>>> word.set("filename"+"    "+tokenizer.nextToken());
>>> output.collect(word,one);
>>>
>>>
>>>
>>>
>>>  Regards,
>>> *Stanley Shi,*
>>>
>>>
>>> ----------
>>> From: *Ranjini Rathinam* <ra...@gmail.com>
>>> Date: Thu, Mar 20, 2014 at 10:56 AM
>>> To: ranjini.r@polarisft.com
>>>
>>>
>>>
>>> ----------
>>> From: *Ranjini Rathinam* <ra...@gmail.com>
>>> Date: Thu, Mar 20, 2014 at 11:20 AM
>>> To: user@hadoop.apache.org, sshi@gopivotal.com
>>>
>>>
>>> Hi,
>>>
>>> If we give the below code,
>>> =======================
>>>  word.set("filename"+"    "+tokenizer.nextToken());
>>> output.collect(word,one);
>>> ======================
>>>
>>> The output is wrong, because it shows:
>>>
>>>  filename   word   occurrence
>>> vinitha       java       4
>>> vinitha         oracle      3
>>> sony           java       4
>>> sony          oracle      3
>>>
>>>
>>> Here vinitha does not have the word oracle. Similarly, sony does not have
>>> the word java. The file name is being merged across all words.
>>>
>>> I need the output as given below:
>>>
>>>  filename   word   occurrence
>>>
>>> vinitha       java       4
>>> vinitha         C++    3
>>> sony           ETL     4
>>> sony          oracle      3
>>>
>>>
>>>  I need the fileName along with the words in that particular file only. No
>>> merging should happen.
>>>
>>> Please help me out for this issue.
>>>
>>> Please help.
>>>
>>> Thanks in advance.
>>>
>>> Ranjini
>>>
>>> ----------
>>> From: *Felix Chern* <id...@gmail.com>
>>> Date: Thu, Mar 20, 2014 at 11:25 PM
>>> To: user@hadoop.apache.org
>>> Cc: sshi@gopivotal.com
>>>
>>>
>>>  I've written two blog posts on how to get directory context in a Hadoop
>>> mapper.
>>>
>>>
>>> http://www.idryman.org/blog/2014/01/26/capture-directory-context-in-hadoop-mapper/
>>>
>>> http://www.idryman.org/blog/2014/01/27/capture-path-info-in-hadoop-inputformat-class/
>>>
>>> Cheers,
>>> Felix
>>>
>>> ----------
>>> From: *Stanley Shi* <ss...@gopivotal.com>
>>> Date: Fri, Mar 21, 2014 at 7:02 AM
>>>
>>> To: Ranjini Rathinam <ra...@gmail.com>
>>> Cc: user@hadoop.apache.org
>>>
>>>
>>> Just reviewed the code again: you are not really using map-reduce. You
>>> are reading all the files in one map process; this is not how a normal
>>> map-reduce job works.
>>>
>>>
>>>  Regards,
>>> *Stanley Shi,*
>>>
>>>
>>> ----------
>>> From: *Stanley Shi* <ss...@gopivotal.com>
>>> Date: Fri, Mar 21, 2014 at 7:43 AM
>>> To: Ranjini Rathinam <ra...@gmail.com>
>>> Cc: user@hadoop.apache.org
>>>
>>>
>>> Change your mapper to be something like this:
>>>
>>>  public static class TokenizerMapper extends
>>>
>>>       Mapper<Object, Text, Text, IntWritable> {
>>>
>>>
>>>     private final static IntWritable one = new IntWritable(1);
>>>
>>>     private Text word = new Text();
>>>
>>>
>>>     public void map(Object key, Text value, Context context)
>>>
>>>         throws IOException, InterruptedException {
>>>
>>>       Path pp = ((FileSplit) context.getInputSplit()).getPath();
>>>
>>>       StringTokenizer itr = new StringTokenizer(value.toString());
>>>
>>>       log.info("map on string: " + new String(value.getBytes()));
>>>
>>>       while (itr.hasMoreTokens()) {
>>>
>>>         word.set(pp.getName() + " " + itr.nextToken());
>>>
>>>         context.write(word, one);
>>>
>>>       }
>>>
>>>     }
>>>
>>>   }
>>>
>>> Note: add your filtering code here;
>>>
>>> and then when running the command, use your input path as the param.
>>>
>>>  Regards,
>>> *Stanley Shi,*
>>>
>>>
>>> ----------
>>> From: *Ranjini Rathinam* <ra...@gmail.com>
>>> Date: Fri, Mar 21, 2014 at 9:57 AM
>>> To: ranjini.r@polarisft.com
>>>
>>>
>>>
>>>
>>>  ---------- Forwarded message ----------
>>> From: Stanley Shi <ss...@gopivotal.com>
>>> Date: Fri, Mar 21, 2014 at 7:43 AM
>>> Subject: Re: Need FileName with Content
>>>
>>>
>>> ----------
>>> From: *Ranjini Rathinam* <ra...@gmail.com>
>>> Date: Fri, Mar 21, 2014 at 9:58 AM
>>> To: ranjini.r@polarisft.com
>>>
>>>
>>>
>>>
>>>
>>
>

Re: Need FileName with Content

Posted by Shahab Yunus <sh...@gmail.com>.
If this parameter is at the job level (i.e. for the whole run level) then
you can set this value int the Configuration object to pass it on to the
mappers.
http://www.thecloudavenue.com/2011/11/passing-parameters-to-mappers-and.html

Regards,
Shahab


On Fri, Mar 21, 2014 at 7:08 AM, Ranjini Rathinam <ra...@gmail.com>wrote:

> Hi,
>
> Thanks for the great support i have fixed the issue. I have now got
> the output.
>
> But , i have one query ,Possible to give runtime argument for mapper class
>
> like,
>
> Giving the value C,JAVA in runtime.
>
>
>
> * if((sp[k].equalsIgnoreCase("C"))){*
>                                     while (itr.hasMoreTokens()) {
>                                            word.set(pp.getName() + " " +
> itr.nextToken());
>
>                                         context.write(word, one);
>                                         }
>                                     }
>
> *    if((sp[k].equalsIgnoreCase("JAVA"))){*
>                                          while (itr.hasMoreTokens()) {
>                                            word.set(pp.getName() + " " +
> itr.nextToken());
>
>                                         context.write(word, one);
>
>  Thanks a lot .
>
> Ranjini
>
>
>
> On Fri, Mar 21, 2014 at 11:45 AM, Ranjini Rathinam <ranjinibecse@gmail.com
> > wrote:
>
>> Hi,
>>
>>
>> Thanks a lot for the great support. I am just learning hadoop and
>> mapreduce.
>>
>> I have used the way you have guided me.
>>
>> But the output is coming without Aggreating
>>
>> vinitha.txt C    1
>> vinitha.txt Java    1
>> vinitha.txt Java    1
>> vinitha.txt Java    1
>> vinitha.txt Java    1
>>
>>
>> *I need the output has *
>>
>>  *vinitha       C    1*
>>
>> *vinitha      Java  4*
>>
>>
>> I have reduce class but still not able to fix it, I am still trying .
>>
>> I have given my code below, Please let me know where i have gone wrong.
>>
>>
>> my code
>>
>>
>> import org.apache.hadoop.conf.Configuration;
>> import org.apache.hadoop.fs.Path;
>> import org.apache.hadoop.io.LongWritable;
>> import org.apache.hadoop.io.*;
>> import org.apache.hadoop.io.Text;
>> import org.apache.hadoop.mapreduce.InputSplit;
>> import org.apache.hadoop.mapreduce.lib.input.FileSplit;
>> import org.apache.hadoop.mapreduce.*;
>> import org.apache.hadoop.mapreduce.Job;
>> import org.apache.hadoop.mapreduce.Mapper;
>> import org.apache.hadoop.mapreduce.Reducer;
>> import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
>> import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
>> import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
>> import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
>>
>> import java.io.IOException;
>> import org.apache.hadoop.fs.Path;
>> import org.apache.hadoop.conf.Configuration;
>> import org.apache.hadoop.fs.FileSystem;
>> import org.apache.hadoop.fs.FileStatus;
>> import java.util.*;
>> import java.util.logging.Level;
>> import java.util.logging.Logger;
>>
>>  public class FileCount {
>>     public static class TokenizerMapper extends Mapper<LongWritable,
>> Text, Text, IntWritable> {
>>
>>
>>     private final static IntWritable one = new IntWritable(1);
>>
>>     private Text word = new Text();
>>
>>
>>     public void map(LongWritable key, Text value, Context context) throws
>> IOException, InterruptedException {
>>
>>             FileSplit fileSplit;
>>               InputSplit is = context.getInputSplit();
>>               FileSystem fs = FileSystem.get(context.getConfiguration());
>>               fileSplit = (FileSplit) is;
>>               Path pp = fileSplit.getPath();
>>                     String line=value.toString();
>>                     int i=0;int k=0;
>>                     //Path pp = ((FileSplit)
>> context.getInputSplit()).getPath();
>>
>>                     String[] splited = line.split("\\s+");
>>                         for( i=0;i<splited.length;i++)
>>                             {
>>                                  String sp[]=splited[i].split(",");
>>                          for( k=0;k<sp.length;k++)
>>                             {
>>
>>                                if(!sp[k].isEmpty())
>>                             {
>>
>>                                   StringTokenizer itr = new
>> StringTokenizer(sp[k]);
>>
>>                                   //log.info("map on string: " + new
>> String(value.getBytes()));
>>
>>                                 if((sp[k].equalsIgnoreCase("C"))){
>>                                     while (itr.hasMoreTokens()) {
>>                                            word.set(pp.getName() + " " +
>> itr.nextToken());
>>
>>                                         context.write(word, one);
>>                                         }
>>                                     }
>>                                 if((sp[k].equalsIgnoreCase("JAVA"))){
>>                                          while (itr.hasMoreTokens()) {
>>                                            word.set(pp.getName() + " " +
>> itr.nextToken());
>>
>>                                         context.write(word, one);
>>                                         }
>>                                 }
>>                              }
>>                             }
>>                         }
>>
>>           }
>>
>>   }
>>
>>   public static class Reduce extends Reducer<Text, IntWritable, Text,
>> IntWritable> {
>>
>>     public void reduce(Text key, Iterator<IntWritable> values, Context
>> context) throws IOException, InterruptedException {
>>
>>
>>         int sum = 0;
>>         while (values.hasNext()) {
>>           sum += values.next().get();
>>         }
>>        context.write(key, new IntWritable(sum));
>>
>>       }
>>     }
>>     public static void main(String[] args) throws Exception {
>>             Configuration conf = new Configuration();
>> Job job = new Job(conf, "jobName");
>>
>> String input="/user/hduser/INPUT/";
>> String output="/user/hduser/OUTPUT/";
>> FileInputFormat.setInputPaths(job, input);
>> job.setJarByClass(FileCount.class);
>> job.setMapperClass(TokenizerMapper.class);
>> job.setReducerClass(Reduce.class);
>> job.setCombinerClass(Reduce.class);
>> job.setInputFormatClass(TextInputFormat.class);
>> job.setOutputKeyClass(Text.class);
>> job.setOutputValueClass(IntWritable.class);
>> Path outPath = new Path(output);
>> FileOutputFormat.setOutputPath(job, outPath);
>> FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
>> if (dfs.exists(outPath)) {
>> dfs.delete(outPath, true);
>> }
>>
>>
>> try {
>>
>> job.waitForCompletion(true);
>>
>> } catch (InterruptedException ex) {
>> //Logger.getLogger(FileCOunt.class.getName()).log(Level.SEVERE, null, ex);
>> } catch (ClassNotFoundException ex) {
>> //Logger.getLogger(FileCount.class.getName()).log(Level.SEVERE, null, ex);
>> }
>>
>> }
>>
>> }
>>
>>
>> Thanks in advance for the great help and support to fix the issue .
>>
>> Please help to fix it.
>>
>> Thanks a lot.
>>
>> Regards,
>> Ranjini
>>
>>
>>> Hi,
>>>
>>> I have folder named INPUT.
>>>
>>> Inside INPUT i have 5 resume are there.
>>>
>>> hduser@localhost:~/Ranjini$ hadoop fs -ls /user/hduser/INPUT
>>> Found 5 items
>>> -rw-r--r--   1 hduser supergroup       5438 2014-03-18 15:20
>>> /user/hduser/INPUT/Rakesh Chowdary_Microstrategy.txt
>>> -rw-r--r--   1 hduser supergroup       6022 2014-03-18 15:22
>>> /user/hduser/INPUT/Ramarao Devineni_Microstrategy.txt
>>> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
>>> /user/hduser/INPUT/vinitha.txt
>>> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
>>> /user/hduser/INPUT/sony.txt
>>> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
>>> /user/hduser/INPUT/ravi.txt
>>> hduser@localhost:~/Ranjini$
>>>
>>> I have to process the folder and the content .
>>>
>>> I need ouput has
>>>
>>> filename   word   occurance
>>> vinitha       java       4
>>> sony          oracle      3
>>>
>>>
>>>
>>> But iam not getting the filename.  Has the input file content are merged
>>> file name is not getting correct .
>>>
>>>
>>> please help in this issue to fix.  I have given by code below
>>>
>>>
>>>  import java.io.IOException;
>>>  import java.util.*;
>>>  import org.apache.hadoop.fs.Path;
>>>  import org.apache.hadoop.conf.*;
>>>  import org.apache.hadoop.io.*;
>>>  import org.apache.hadoop.mapred.*;
>>>  import org.apache.hadoop.util.*;
>>> import java.io.File;
>>> import java.io.FileReader;
>>> import java.io.FileWriter;
>>> import java.io.IOException;
>>> import org.apache.hadoop.fs.Path;
>>> import org.apache.hadoop.conf.Configuration;
>>> import org.apache.hadoop.fs.FileSystem;
>>> import org.apache.hadoop.fs.FileStatus;
>>> import org.apache.hadoop.conf.*;
>>> import org.apache.hadoop.io.*;
>>> import org.apache.hadoop.mapred.*;
>>> import org.apache.hadoop.util.*;
>>> import org.apache.hadoop.mapred.lib.*;
>>>
>>>  public class WordCount {
>>>     public static class Map extends MapReduceBase implements
>>> Mapper<LongWritable, Text, Text, IntWritable> {
>>>      private final static IntWritable one = new IntWritable(1);
>>>       private Text word = new Text();
>>>       public void map(LongWritable key, Text value,
>>> OutputCollector<Text, IntWritable> output, Reporter reporter) throws
>>> IOException {
>>>    FSDataInputStream fs=null;
>>>    FileSystem hdfs = null;
>>>    String line = value.toString();
>>>          int i=0,k=0;
>>>   try{
>>>    Configuration configuration = new Configuration();
>>>       configuration.set("fs.default.name", "hdfs://localhost:4440/");
>>>
>>>    Path srcPath = new Path("/user/hduser/INPUT/");
>>>
>>>    hdfs = FileSystem.get(configuration);
>>>    FileStatus[] status = hdfs.listStatus(srcPath);
>>>    fs=hdfs.open(srcPath);
>>>    BufferedReader br=new BufferedReader(new
>>> InputStreamReader(hdfs.open(srcPath)));
>>>
>>> String[] splited = line.split("\\s+");
>>>     for( i=0;i<splited.length;i++)
>>>  {
>>>      String sp[]=splited[i].split(",");
>>>      for( k=0;k<sp.length;k++)
>>>  {
>>>
>>>    if(!sp[k].isEmpty()){
>>> StringTokenizer tokenizer = new StringTokenizer(sp[k]);
>>> if((sp[k].equalsIgnoreCase("C"))){
>>>         while (tokenizer.hasMoreTokens()) {
>>>           word.set(tokenizer.nextToken());
>>>           output.collect(word, one);
>>>         }
>>> }
>>> if((sp[k].equalsIgnoreCase("JAVA"))){
>>>         while (tokenizer.hasMoreTokens()) {
>>>           word.set(tokenizer.nextToken());
>>>           output.collect(word, one);
>>>         }
>>> }
>>>       }
>>>     }
>>> }
>>>  } catch (IOException e) {
>>>     e.printStackTrace();
>>>  }
>>> }
>>> }
>>>     public static class Reduce extends MapReduceBase implements
>>> Reducer<Text, IntWritable, Text, IntWritable> {
>>>       public void reduce(Text key, Iterator<IntWritable> values,
>>> OutputCollector<Text, IntWritable> output, Reporter reporter) throws
>>> IOException {
>>>         int sum = 0;
>>>         while (values.hasNext()) {
>>>           sum += values.next().get();
>>>         }
>>>         output.collect(key, new IntWritable(sum));
>>>       }
>>>     }
>>>     public static void main(String[] args) throws Exception {
>>>
>>>
>>>       JobConf conf = new JobConf(WordCount.class);
>>>       conf.setJobName("wordcount");
>>>       conf.setOutputKeyClass(Text.class);
>>>       conf.setOutputValueClass(IntWritable.class);
>>>       conf.setMapperClass(Map.class);
>>>       conf.setCombinerClass(Reduce.class);
>>>       conf.setReducerClass(Reduce.class);
>>>       conf.setInputFormat(TextInputFormat.class);
>>>       conf.setOutputFormat(TextOutputFormat.class);
>>>       FileInputFormat.setInputPaths(conf, new Path(args[0]));
>>>       FileOutputFormat.setOutputPath(conf, new Path(args[1]));
>>>       JobClient.runJob(conf);
>>>     }
>>>  }
>>>
>>>
>>>
>>> Please help
>>>
>>> Thanks in advance.
>>>
>>> Ranjini
>>>
>>>
>>>
>>> ----------
>>> From: *Stanley Shi* <ss...@gopivotal.com>
>>> Date: Thu, Mar 20, 2014 at 7:39 AM
>>> To: user@hadoop.apache.org
>>>
>>>
>>> You want to do a word count for each file, but the code give you a word
>>> count for all the files, right?
>>>
>>> =====
>>>  word.set(tokenizer.nextToken());
>>>           output.collect(word, one);
>>> ======
>>> change it to:
>>> word.set("filename"+"    "+tokenizer.nextToken());
>>> output.collect(word,one);
>>>
>>>
>>>
>>>
>>>  Regards,
>>> *Stanley Shi,*
>>>
>>>
>>> ----------
>>> From: *Ranjini Rathinam* <ra...@gmail.com>
>>> Date: Thu, Mar 20, 2014 at 10:56 AM
>>> To: ranjini.r@polarisft.com
>>>
>>>
>>>
>>> ----------
>>> From: *Ranjini Rathinam* <ra...@gmail.com>
>>> Date: Thu, Mar 20, 2014 at 11:20 AM
>>> To: user@hadoop.apache.org, sshi@gopivotal.com
>>>
>>>
>>> Hi,
>>>
>>> If we give the below code,
>>> =======================
>>>  word.set("filename"+"    "+tokenizer.nextToken());
>>> output.collect(word,one);
>>> ======================
>>>
>>> The output is wrong. because it shows the
>>>
>>>  filename   word   occurance
>>> vinitha       java       4
>>> vinitha         oracle      3
>>> sony           java       4
>>> sony          oracle      3
>>>
>>>
>>> Here vinitha does not have oracle word . Similarlly sony does not have
>>> java has word. File name is merging for  all words.
>>>
>>> I need the output has given below
>>>
>>>  filename   word   occurance
>>>
>>> vinitha       java       4
>>> vinitha         C++    3
>>> sony           ETL     4
>>> sony          oracle      3
>>>
>>>
>>>  Need fileaName along with the word in that particular file only. No
>>> merge should happen.
>>>
>>> Please help me out for this issue.
>>>
>>> Please help.
>>>
>>> Thanks in advance.
>>>
>>> Ranjini
>>>
>>> ----------
>>> From: *Felix Chern* <id...@gmail.com>
>>> Date: Thu, Mar 20, 2014 at 11:25 PM
>>> To: user@hadoop.apache.org
>>> Cc: sshi@gopivotal.com
>>>
>>>
>>>  I've written two blog post of how to get directory context in hadoop
>>> mapper.
>>>
>>>
>>> http://www.idryman.org/blog/2014/01/26/capture-directory-context-in-hadoop-mapper/
>>>
>>> http://www.idryman.org/blog/2014/01/27/capture-path-info-in-hadoop-inputformat-class/
>>>
>>> Cheers,
>>> Felix
>>>
>>> ----------
>>> From: *Stanley Shi* <ss...@gopivotal.com>
>>> Date: Fri, Mar 21, 2014 at 7:02 AM
>>>
>>> To: Ranjini Rathinam <ra...@gmail.com>
>>> Cc: user@hadoop.apache.org
>>>
>>>
>>> Just reviewed the code again, you are not really using map-reduce. you
>>> are reading all files in one map process, this is not a normal map-reduce
>>> job works.
>>>
>>>
>>>  Regards,
>>> *Stanley Shi,*
>>>
>>>
>>> ----------
>>> From: *Stanley Shi* <ss...@gopivotal.com>
>>> Date: Fri, Mar 21, 2014 at 7:43 AM
>>> To: Ranjini Rathinam <ra...@gmail.com>
>>> Cc: user@hadoop.apache.org
>>>
>>>
>>> Change you mapper to be something like this:
>>>
>>>  public static class TokenizerMapper extends
>>>
>>>       Mapper<Object, Text, Text, IntWritable> {
>>>
>>>
>>>     private final static IntWritable one = new IntWritable(1);
>>>
>>>     private Text word = new Text();
>>>
>>>
>>>     public void map(Object key, Text value, Context context)
>>>
>>>         throws IOException, InterruptedException {
>>>
>>>       Path pp = ((FileSplit) context.getInputSplit()).getPath();
>>>
>>>       StringTokenizer itr = new StringTokenizer(value.toString());
>>>
>>>       log.info("map on string: " + new String(value.getBytes()));
>>>
>>>       while (itr.hasMoreTokens()) {
>>>
>>>         word.set(pp.getName() + " " + itr.nextToken());
>>>
>>>         context.write(word, one);
>>>
>>>       }
>>>
>>>     }
>>>
>>>   }
>>>
>>> Note: add your filtering code here;
>>>
>>> and then when running the command, use you input path as param;
>>>
>>>  Regards,
>>> *Stanley Shi,*
>>>
>>>
>>> ----------
>>> From: *Ranjini Rathinam* <ra...@gmail.com>
>>> Date: Fri, Mar 21, 2014 at 9:57 AM
>>> To: ranjini.r@polarisft.com
>>>
>>>
>>>
>>>
>>>  ---------- Forwarded message ----------
>>> From: Stanley Shi <ss...@gopivotal.com>
>>> Date: Fri, Mar 21, 2014 at 7:43 AM
>>> Subject: Re: Need FileName with Content
>>>
>>>
>>> ----------
>>> From: *Ranjini Rathinam* <ra...@gmail.com>
>>> Date: Fri, Mar 21, 2014 at 9:58 AM
>>> To: ranjini.r@polarisft.com
>>>
>>>
>>>
>>>
>>>
>>
>

Re: Need FileName with Content

Posted by Shahab Yunus <sh...@gmail.com>.
If this parameter is at the job level (i.e. for the whole run level) then
you can set this value int the Configuration object to pass it on to the
mappers.
http://www.thecloudavenue.com/2011/11/passing-parameters-to-mappers-and.html

Regards,
Shahab


On Fri, Mar 21, 2014 at 7:08 AM, Ranjini Rathinam <ra...@gmail.com>wrote:

> Hi,
>
> Thanks for the great support i have fixed the issue. I have now got
> the output.
>
> But , i have one query ,Possible to give runtime argument for mapper class
>
> like,
>
> Giving the value C,JAVA in runtime.
>
>
>
> * if((sp[k].equalsIgnoreCase("C"))){*
>                                     while (itr.hasMoreTokens()) {
>                                            word.set(pp.getName() + " " +
> itr.nextToken());
>
>                                         context.write(word, one);
>                                         }
>                                     }
>
> *    if((sp[k].equalsIgnoreCase("JAVA"))){*
>                                          while (itr.hasMoreTokens()) {
>                                            word.set(pp.getName() + " " +
> itr.nextToken());
>
>                                         context.write(word, one);
>
>  Thanks a lot .
>
> Ranjini
>
>
>
> On Fri, Mar 21, 2014 at 11:45 AM, Ranjini Rathinam <ranjinibecse@gmail.com
> > wrote:
>
>> Hi,
>>
>>
>> Thanks a lot for the great support. I am just learning hadoop and
>> mapreduce.
>>
>> I have used the way you have guided me.
>>
>> But the output is coming without Aggreating
>>
>> vinitha.txt C    1
>> vinitha.txt Java    1
>> vinitha.txt Java    1
>> vinitha.txt Java    1
>> vinitha.txt Java    1
>>
>>
>> *I need the output has *
>>
>>  *vinitha       C    1*
>>
>> *vinitha      Java  4*
>>
>>
>> I have reduce class but still not able to fix it, I am still trying .
>>
>> I have given my code below, Please let me know where i have gone wrong.
>>
>>
>> my code
>>
>>
>> import org.apache.hadoop.conf.Configuration;
>> import org.apache.hadoop.fs.Path;
>> import org.apache.hadoop.io.LongWritable;
>> import org.apache.hadoop.io.*;
>> import org.apache.hadoop.io.Text;
>> import org.apache.hadoop.mapreduce.InputSplit;
>> import org.apache.hadoop.mapreduce.lib.input.FileSplit;
>> import org.apache.hadoop.mapreduce.*;
>> import org.apache.hadoop.mapreduce.Job;
>> import org.apache.hadoop.mapreduce.Mapper;
>> import org.apache.hadoop.mapreduce.Reducer;
>> import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
>> import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
>> import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
>> import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
>>
>> import java.io.IOException;
>> import org.apache.hadoop.fs.Path;
>> import org.apache.hadoop.conf.Configuration;
>> import org.apache.hadoop.fs.FileSystem;
>> import org.apache.hadoop.fs.FileStatus;
>> import java.util.*;
>> import java.util.logging.Level;
>> import java.util.logging.Logger;
>>
>>  public class FileCount {
>>     public static class TokenizerMapper extends Mapper<LongWritable,
>> Text, Text, IntWritable> {
>>
>>
>>     private final static IntWritable one = new IntWritable(1);
>>
>>     private Text word = new Text();
>>
>>
>>     public void map(LongWritable key, Text value, Context context) throws
>> IOException, InterruptedException {
>>
>>             FileSplit fileSplit;
>>               InputSplit is = context.getInputSplit();
>>               FileSystem fs = FileSystem.get(context.getConfiguration());
>>               fileSplit = (FileSplit) is;
>>               Path pp = fileSplit.getPath();
>>                     String line=value.toString();
>>                     int i=0;int k=0;
>>                     //Path pp = ((FileSplit)
>> context.getInputSplit()).getPath();
>>
>>                     String[] splited = line.split("\\s+");
>>                         for( i=0;i<splited.length;i++)
>>                             {
>>                                  String sp[]=splited[i].split(",");
>>                          for( k=0;k<sp.length;k++)
>>                             {
>>
>>                                if(!sp[k].isEmpty())
>>                             {
>>
>>                                   StringTokenizer itr = new
>> StringTokenizer(sp[k]);
>>
>>                                   //log.info("map on string: " + new
>> String(value.getBytes()));
>>
>>                                 if((sp[k].equalsIgnoreCase("C"))){
>>                                     while (itr.hasMoreTokens()) {
>>                                            word.set(pp.getName() + " " +
>> itr.nextToken());
>>
>>                                         context.write(word, one);
>>                                         }
>>                                     }
>>                                 if((sp[k].equalsIgnoreCase("JAVA"))){
>>                                          while (itr.hasMoreTokens()) {
>>                                            word.set(pp.getName() + " " +
>> itr.nextToken());
>>
>>                                         context.write(word, one);
>>                                         }
>>                                 }
>>                              }
>>                             }
>>                         }
>>
>>           }
>>
>>   }
>>
>>   public static class Reduce extends Reducer<Text, IntWritable, Text,
>> IntWritable> {
>>
>>     public void reduce(Text key, Iterator<IntWritable> values, Context
>> context) throws IOException, InterruptedException {
>>
>>
>>         int sum = 0;
>>         while (values.hasNext()) {
>>           sum += values.next().get();
>>         }
>>        context.write(key, new IntWritable(sum));
>>
>>       }
>>     }
>>     public static void main(String[] args) throws Exception {
>>             Configuration conf = new Configuration();
>> Job job = new Job(conf, "jobName");
>>
>> String input="/user/hduser/INPUT/";
>> String output="/user/hduser/OUTPUT/";
>> FileInputFormat.setInputPaths(job, input);
>> job.setJarByClass(FileCount.class);
>> job.setMapperClass(TokenizerMapper.class);
>> job.setReducerClass(Reduce.class);
>> job.setCombinerClass(Reduce.class);
>> job.setInputFormatClass(TextInputFormat.class);
>> job.setOutputKeyClass(Text.class);
>> job.setOutputValueClass(IntWritable.class);
>> Path outPath = new Path(output);
>> FileOutputFormat.setOutputPath(job, outPath);
>> FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
>> if (dfs.exists(outPath)) {
>> dfs.delete(outPath, true);
>> }
>>
>>
>> try {
>>
>> job.waitForCompletion(true);
>>
>> } catch (InterruptedException ex) {
>> //Logger.getLogger(FileCOunt.class.getName()).log(Level.SEVERE, null, ex);
>> } catch (ClassNotFoundException ex) {
>> //Logger.getLogger(FileCount.class.getName()).log(Level.SEVERE, null, ex);
>> }
>>
>> }
>>
>> }
>>
>>
>> Thanks in advance for the great help and support to fix the issue .
>>
>> Please help to fix it.
>>
>> Thanks a lot.
>>
>> Regards,
>> Ranjini
>>
>>
>>> Hi,
>>>
>>> I have folder named INPUT.
>>>
>>> Inside INPUT i have 5 resume are there.
>>>
>>> hduser@localhost:~/Ranjini$ hadoop fs -ls /user/hduser/INPUT
>>> Found 5 items
>>> -rw-r--r--   1 hduser supergroup       5438 2014-03-18 15:20
>>> /user/hduser/INPUT/Rakesh Chowdary_Microstrategy.txt
>>> -rw-r--r--   1 hduser supergroup       6022 2014-03-18 15:22
>>> /user/hduser/INPUT/Ramarao Devineni_Microstrategy.txt
>>> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
>>> /user/hduser/INPUT/vinitha.txt
>>> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
>>> /user/hduser/INPUT/sony.txt
>>> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
>>> /user/hduser/INPUT/ravi.txt
>>> hduser@localhost:~/Ranjini$
>>>
>>> I have to process the folder and the content .
>>>
>>> I need ouput has
>>>
>>> filename   word   occurance
>>> vinitha       java       4
>>> sony          oracle      3
>>>
>>>
>>>
>>> But iam not getting the filename.  Has the input file content are merged
>>> file name is not getting correct .
>>>
>>>
>>> please help in this issue to fix.  I have given by code below
>>>
>>>
>>>  import java.io.IOException;
>>>  import java.util.*;
>>>  import org.apache.hadoop.fs.Path;
>>>  import org.apache.hadoop.conf.*;
>>>  import org.apache.hadoop.io.*;
>>>  import org.apache.hadoop.mapred.*;
>>>  import org.apache.hadoop.util.*;
>>> import java.io.File;
>>> import java.io.FileReader;
>>> import java.io.FileWriter;
>>> import java.io.IOException;
>>> import org.apache.hadoop.fs.Path;
>>> import org.apache.hadoop.conf.Configuration;
>>> import org.apache.hadoop.fs.FileSystem;
>>> import org.apache.hadoop.fs.FileStatus;
>>> import org.apache.hadoop.conf.*;
>>> import org.apache.hadoop.io.*;
>>> import org.apache.hadoop.mapred.*;
>>> import org.apache.hadoop.util.*;
>>> import org.apache.hadoop.mapred.lib.*;
>>>
>>>  public class WordCount {
>>>     public static class Map extends MapReduceBase implements
>>> Mapper<LongWritable, Text, Text, IntWritable> {
>>>      private final static IntWritable one = new IntWritable(1);
>>>       private Text word = new Text();
>>>       public void map(LongWritable key, Text value,
>>> OutputCollector<Text, IntWritable> output, Reporter reporter) throws
>>> IOException {
>>>    FSDataInputStream fs=null;
>>>    FileSystem hdfs = null;
>>>    String line = value.toString();
>>>          int i=0,k=0;
>>>   try{
>>>    Configuration configuration = new Configuration();
>>>       configuration.set("fs.default.name", "hdfs://localhost:4440/");
>>>
>>>    Path srcPath = new Path("/user/hduser/INPUT/");
>>>
>>>    hdfs = FileSystem.get(configuration);
>>>    FileStatus[] status = hdfs.listStatus(srcPath);
>>>    fs=hdfs.open(srcPath);
>>>    BufferedReader br=new BufferedReader(new
>>> InputStreamReader(hdfs.open(srcPath)));
>>>
>>> String[] splited = line.split("\\s+");
>>>     for( i=0;i<splited.length;i++)
>>>  {
>>>      String sp[]=splited[i].split(",");
>>>      for( k=0;k<sp.length;k++)
>>>  {
>>>
>>>    if(!sp[k].isEmpty()){
>>> StringTokenizer tokenizer = new StringTokenizer(sp[k]);
>>> if((sp[k].equalsIgnoreCase("C"))){
>>>         while (tokenizer.hasMoreTokens()) {
>>>           word.set(tokenizer.nextToken());
>>>           output.collect(word, one);
>>>         }
>>> }
>>> if((sp[k].equalsIgnoreCase("JAVA"))){
>>>         while (tokenizer.hasMoreTokens()) {
>>>           word.set(tokenizer.nextToken());
>>>           output.collect(word, one);
>>>         }
>>> }
>>>       }
>>>     }
>>> }
>>>  } catch (IOException e) {
>>>     e.printStackTrace();
>>>  }
>>> }
>>> }
>>>     public static class Reduce extends MapReduceBase implements
>>> Reducer<Text, IntWritable, Text, IntWritable> {
>>>       public void reduce(Text key, Iterator<IntWritable> values,
>>> OutputCollector<Text, IntWritable> output, Reporter reporter) throws
>>> IOException {
>>>         int sum = 0;
>>>         while (values.hasNext()) {
>>>           sum += values.next().get();
>>>         }
>>>         output.collect(key, new IntWritable(sum));
>>>       }
>>>     }
>>>     public static void main(String[] args) throws Exception {
>>>
>>>
>>>       JobConf conf = new JobConf(WordCount.class);
>>>       conf.setJobName("wordcount");
>>>       conf.setOutputKeyClass(Text.class);
>>>       conf.setOutputValueClass(IntWritable.class);
>>>       conf.setMapperClass(Map.class);
>>>       conf.setCombinerClass(Reduce.class);
>>>       conf.setReducerClass(Reduce.class);
>>>       conf.setInputFormat(TextInputFormat.class);
>>>       conf.setOutputFormat(TextOutputFormat.class);
>>>       FileInputFormat.setInputPaths(conf, new Path(args[0]));
>>>       FileOutputFormat.setOutputPath(conf, new Path(args[1]));
>>>       JobClient.runJob(conf);
>>>     }
>>>  }
>>>
>>>
>>>
>>> Please help
>>>
>>> Thanks in advance.
>>>
>>> Ranjini
>>>
>>>
>>>
>>> ----------
>>> From: *Stanley Shi* <ss...@gopivotal.com>
>>> Date: Thu, Mar 20, 2014 at 7:39 AM
>>> To: user@hadoop.apache.org
>>>
>>>
>>> You want to do a word count for each file, but the code give you a word
>>> count for all the files, right?
>>>
>>> =====
>>>  word.set(tokenizer.nextToken());
>>>           output.collect(word, one);
>>> ======
>>> change it to:
>>> word.set("filename"+"    "+tokenizer.nextToken());
>>> output.collect(word,one);
>>>
>>>
>>>
>>>
>>>  Regards,
>>> *Stanley Shi,*
>>>
>>>
>>> ----------
>>> From: *Ranjini Rathinam* <ra...@gmail.com>
>>> Date: Thu, Mar 20, 2014 at 10:56 AM
>>> To: ranjini.r@polarisft.com
>>>
>>>
>>>
>>> ----------
>>> From: *Ranjini Rathinam* <ra...@gmail.com>
>>> Date: Thu, Mar 20, 2014 at 11:20 AM
>>> To: user@hadoop.apache.org, sshi@gopivotal.com
>>>
>>>
>>> Hi,
>>>
>>> If we give the below code,
>>> =======================
>>>  word.set("filename"+"    "+tokenizer.nextToken());
>>> output.collect(word,one);
>>> ======================
>>>
>>> The output is wrong. because it shows the
>>>
>>>  filename   word   occurance
>>> vinitha       java       4
>>> vinitha         oracle      3
>>> sony           java       4
>>> sony          oracle      3
>>>
>>>
>>> Here vinitha does not have oracle word . Similarlly sony does not have
>>> java has word. File name is merging for  all words.
>>>
>>> I need the output as given below:
>>>
>>>  filename   word    occurrence
>>>
>>>  vinitha    java    4
>>>  vinitha    C++     3
>>>  sony       ETL     4
>>>  sony       oracle  3
>>>
>>>
>>>  I need the fileName along with the words in that particular file only. No
>>> merge should happen.
>>>
>>> Please help me out for this issue.
>>>
>>> Please help.
>>>
>>> Thanks in advance.
>>>
>>> Ranjini
>>>
>>> ----------
>>> From: *Felix Chern* <id...@gmail.com>
>>> Date: Thu, Mar 20, 2014 at 11:25 PM
>>> To: user@hadoop.apache.org
>>> Cc: sshi@gopivotal.com
>>>
>>>
>>>  I've written two blog post of how to get directory context in hadoop
>>> mapper.
>>>
>>>
>>> http://www.idryman.org/blog/2014/01/26/capture-directory-context-in-hadoop-mapper/
>>>
>>> http://www.idryman.org/blog/2014/01/27/capture-path-info-in-hadoop-inputformat-class/
>>>
>>> Cheers,
>>> Felix
>>>
>>> ----------
>>> From: *Stanley Shi* <ss...@gopivotal.com>
>>> Date: Fri, Mar 21, 2014 at 7:02 AM
>>>
>>> To: Ranjini Rathinam <ra...@gmail.com>
>>> Cc: user@hadoop.apache.org
>>>
>>>
>>> Just reviewed the code again: you are not really using map-reduce. You
>>> are reading all the files in one map process; this is not how a normal
>>> map-reduce job works.
>>>
>>>
>>>  Regards,
>>> *Stanley Shi,*
>>>
>>>
>>> ----------
>>> From: *Stanley Shi* <ss...@gopivotal.com>
>>> Date: Fri, Mar 21, 2014 at 7:43 AM
>>> To: Ranjini Rathinam <ra...@gmail.com>
>>> Cc: user@hadoop.apache.org
>>>
>>>
>>> Change your mapper to be something like this:
>>>
>>>  public static class TokenizerMapper extends
>>>       Mapper<Object, Text, Text, IntWritable> {
>>>
>>>     private final static IntWritable one = new IntWritable(1);
>>>     private Text word = new Text();
>>>
>>>     public void map(Object key, Text value, Context context)
>>>         throws IOException, InterruptedException {
>>>       Path pp = ((FileSplit) context.getInputSplit()).getPath();
>>>       StringTokenizer itr = new StringTokenizer(value.toString());
>>>       log.info("map on string: " + new String(value.getBytes()));
>>>       while (itr.hasMoreTokens()) {
>>>         word.set(pp.getName() + " " + itr.nextToken());
>>>         context.write(word, one);
>>>       }
>>>     }
>>>   }
>>>
>>> Note: add your filtering code here;
>>>
>>> and then when running the command, use your input path as the param.
>>>
>>>  Regards,
>>> *Stanley Shi,*
>>>
>>>
>>> ----------
>>> From: *Ranjini Rathinam* <ra...@gmail.com>
>>> Date: Fri, Mar 21, 2014 at 9:57 AM
>>> To: ranjini.r@polarisft.com
>>>
>>>
>>>
>>>
>>>  ---------- Forwarded message ----------
>>> From: Stanley Shi <ss...@gopivotal.com>
>>> Date: Fri, Mar 21, 2014 at 7:43 AM
>>> Subject: Re: Need FileName with Content
>>>
>>>
>>> ----------
>>> From: *Ranjini Rathinam* <ra...@gmail.com>
>>> Date: Fri, Mar 21, 2014 at 9:58 AM
>>> To: ranjini.r@polarisft.com
>>>
>>>
>>>
>>>
>>>
>>
>
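
Taken together, the suggestions quoted above reduce to a mapper like the one
below. This is a sketch rather than code from the thread: the undefined log
call is dropped, value.toString() is used instead of new String(value.getBytes())
(getBytes() returns the Text's backing array, which may carry stale bytes past
its current length), and the class name is made up for illustration.

import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class PerFileTokenizerMapper
    extends Mapper<LongWritable, Text, Text, IntWritable> {

  private final static IntWritable one = new IntWritable(1);
  private final Text word = new Text();

  @Override
  public void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    // The input split already knows which file this record came from,
    // so there is no need to re-open the INPUT directory inside the mapper.
    Path pp = ((FileSplit) context.getInputSplit()).getPath();
    // pp.getParent().getName() would give the directory name instead
    // (see Felix's two blog posts quoted above).
    StringTokenizer itr = new StringTokenizer(value.toString());
    while (itr.hasMoreTokens()) {
      // Prefixing the file name makes the key unique per (file, word),
      // so the reducer sums occurrences per file, not across files.
      word.set(pp.getName() + " " + itr.nextToken());
      context.write(word, one);
    }
  }
}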

Re: Need FileName with Content

Posted by Shahab Yunus <sh...@gmail.com>.
If this parameter is at the job level (i.e. for the whole run), then
you can set this value in the Configuration object to pass it on to the
mappers.
http://www.thecloudavenue.com/2011/11/passing-parameters-to-mappers-and.html
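
A minimal sketch of that approach (the key name "mapper.skills" is an
assumption for illustration, not from the thread; Configuration.set()/
getStrings() and Mapper.setup() are the standard APIs):

import java.util.HashSet;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;

public class SkillCount {

  public static class TokenizerMapper
      extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final Set<String> skills = new HashSet<String>();

    @Override
    protected void setup(Context context) {
      // getStrings() splits the comma-separated value; "C","JAVA" is the default.
      for (String s : context.getConfiguration()
                             .getStrings("mapper.skills", "C", "JAVA")) {
        skills.add(s.toUpperCase());
      }
    }

    // In map(), the hard-coded equalsIgnoreCase("C") / equalsIgnoreCase("JAVA")
    // checks then become: if (skills.contains(sp[k].toUpperCase())) { ... }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // e.g. hadoop jar filecount.jar SkillCount <in> <out> C,JAVA
    conf.set("mapper.skills", args.length > 2 ? args[2] : "C,JAVA");
    Job job = new Job(conf, "skillcount");  // set the conf BEFORE creating the Job
    job.setJarByClass(SkillCount.class);
    // ... remaining job setup as in the FileCount main() quoted below ...
  }
}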

Regards,
Shahab


On Fri, Mar 21, 2014 at 7:08 AM, Ranjini Rathinam <ra...@gmail.com>wrote:

> Hi,
>
> Thanks for the great support, I have fixed the issue. I have now got
> the output.
>
> But I have one query: is it possible to give a runtime argument for the
> mapper class,
>
> like,
>
> giving the values C,JAVA at runtime?
>
>
>
> * if((sp[k].equalsIgnoreCase("C"))){*
>                                     while (itr.hasMoreTokens()) {
>                                            word.set(pp.getName() + " " +
> itr.nextToken());
>
>                                         context.write(word, one);
>                                         }
>                                     }
>
> *    if((sp[k].equalsIgnoreCase("JAVA"))){*
>                                          while (itr.hasMoreTokens()) {
>                                            word.set(pp.getName() + " " +
> itr.nextToken());
>
>                                         context.write(word, one);
>
>  Thanks a lot.
>
> Ranjini
>
>
>
> On Fri, Mar 21, 2014 at 11:45 AM, Ranjini Rathinam <ranjinibecse@gmail.com
> > wrote:
>
>> Hi,
>>
>>
>> Thanks a lot for the great support. I am just learning Hadoop and
>> MapReduce.
>>
>> I have used the way you have guided me.
>>
>> But the output is coming without aggregating:
>>
>> vinitha.txt C    1
>> vinitha.txt Java    1
>> vinitha.txt Java    1
>> vinitha.txt Java    1
>> vinitha.txt Java    1
>>
>>
>> *I need the output as *
>>
>>  *vinitha       C    1*
>>
>> *vinitha      Java  4*
>>
>>
>> I have a reduce class but am still not able to fix it; I am still trying.
>>
>> I have given my code below. Please let me know where I have gone wrong.
>>
>>
>> my code
>>
>>
>> import org.apache.hadoop.conf.Configuration;
>> import org.apache.hadoop.fs.Path;
>> import org.apache.hadoop.io.LongWritable;
>> import org.apache.hadoop.io.*;
>> import org.apache.hadoop.io.Text;
>> import org.apache.hadoop.mapreduce.InputSplit;
>> import org.apache.hadoop.mapreduce.lib.input.FileSplit;
>> import org.apache.hadoop.mapreduce.*;
>> import org.apache.hadoop.mapreduce.Job;
>> import org.apache.hadoop.mapreduce.Mapper;
>> import org.apache.hadoop.mapreduce.Reducer;
>> import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
>> import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
>> import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
>> import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
>>
>> import java.io.IOException;
>> import org.apache.hadoop.fs.Path;
>> import org.apache.hadoop.conf.Configuration;
>> import org.apache.hadoop.fs.FileSystem;
>> import org.apache.hadoop.fs.FileStatus;
>> import java.util.*;
>> import java.util.logging.Level;
>> import java.util.logging.Logger;
>>
>>  public class FileCount {
>>     public static class TokenizerMapper extends Mapper<LongWritable,
>> Text, Text, IntWritable> {
>>
>>
>>     private final static IntWritable one = new IntWritable(1);
>>
>>     private Text word = new Text();
>>
>>
>>     public void map(LongWritable key, Text value, Context context) throws
>> IOException, InterruptedException {
>>
>>             FileSplit fileSplit;
>>               InputSplit is = context.getInputSplit();
>>               FileSystem fs = FileSystem.get(context.getConfiguration());
>>               fileSplit = (FileSplit) is;
>>               Path pp = fileSplit.getPath();
>>                     String line=value.toString();
>>                     int i=0;int k=0;
>>                     //Path pp = ((FileSplit)
>> context.getInputSplit()).getPath();
>>
>>                     String[] splited = line.split("\\s+");
>>                         for( i=0;i<splited.length;i++)
>>                             {
>>                                  String sp[]=splited[i].split(",");
>>                          for( k=0;k<sp.length;k++)
>>                             {
>>
>>                                if(!sp[k].isEmpty())
>>                             {
>>
>>                                   StringTokenizer itr = new
>> StringTokenizer(sp[k]);
>>
>>                                   //log.info("map on string: " + new
>> String(value.getBytes()));
>>
>>                                 if((sp[k].equalsIgnoreCase("C"))){
>>                                     while (itr.hasMoreTokens()) {
>>                                            word.set(pp.getName() + " " +
>> itr.nextToken());
>>
>>                                         context.write(word, one);
>>                                         }
>>                                     }
>>                                 if((sp[k].equalsIgnoreCase("JAVA"))){
>>                                          while (itr.hasMoreTokens()) {
>>                                            word.set(pp.getName() + " " +
>> itr.nextToken());
>>
>>                                         context.write(word, one);
>>                                         }
>>                                 }
>>                              }
>>                             }
>>                         }
>>
>>           }
>>
>>   }
>>
>>   public static class Reduce extends Reducer<Text, IntWritable, Text,
>> IntWritable> {
>>
>>     public void reduce(Text key, Iterator<IntWritable> values, Context
>> context) throws IOException, InterruptedException {
>>
>>
>>         int sum = 0;
>>         while (values.hasNext()) {
>>           sum += values.next().get();
>>         }
>>        context.write(key, new IntWritable(sum));
>>
>>       }
>>     }
>>     public static void main(String[] args) throws Exception {
>>             Configuration conf = new Configuration();
>> Job job = new Job(conf, "jobName");
>>
>> String input="/user/hduser/INPUT/";
>> String output="/user/hduser/OUTPUT/";
>> FileInputFormat.setInputPaths(job, input);
>> job.setJarByClass(FileCount.class);
>> job.setMapperClass(TokenizerMapper.class);
>> job.setReducerClass(Reduce.class);
>> job.setCombinerClass(Reduce.class);
>> job.setInputFormatClass(TextInputFormat.class);
>> job.setOutputKeyClass(Text.class);
>> job.setOutputValueClass(IntWritable.class);
>> Path outPath = new Path(output);
>> FileOutputFormat.setOutputPath(job, outPath);
>> FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
>> if (dfs.exists(outPath)) {
>> dfs.delete(outPath, true);
>> }
>>
>>
>> try {
>>
>> job.waitForCompletion(true);
>>
>> } catch (InterruptedException ex) {
>> //Logger.getLogger(FileCOunt.class.getName()).log(Level.SEVERE, null, ex);
>> } catch (ClassNotFoundException ex) {
>> //Logger.getLogger(FileCount.class.getName()).log(Level.SEVERE, null, ex);
>> }
>>
>> }
>>
>> }
>>
>>
>> Thanks in advance for the great help and support to fix the issue .
>>
>> Please help to fix it.
>>
>> Thanks a lot.
>>
>> Regards,
>> Ranjini
>>
>

Re: Need FileName with Content

Posted by Ranjini Rathinam <ra...@gmail.com>.
Hi,

Thanks for the great support, I have fixed the issue. I have now got
the output.

But I have one query: is it possible to give a runtime argument for the
mapper class,

like,

giving the values C,JAVA at runtime?



* if((sp[k].equalsIgnoreCase("C"))){*
                                    while (itr.hasMoreTokens()) {
                                           word.set(pp.getName() + " " +
itr.nextToken());

                                        context.write(word, one);
                                        }
                                    }

*    if((sp[k].equalsIgnoreCase("JAVA"))){*
                                         while (itr.hasMoreTokens()) {
                                           word.set(pp.getName() + " " +
itr.nextToken());

                                        context.write(word, one);

 Thanks a lot.

Ranjini
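
For what it's worth, the missing aggregation in the FileCount code quoted
above has a likely one-line explanation: the new-API Reducer's method is
reduce(Text, Iterable<IntWritable>, Context), and a method declared with
Iterator instead never overrides it, so Hadoop falls back to the default
identity reduce and every (filename word, 1) pair passes through uncounted.
A sketch of the corrected inner class (a drop-in for the Reduce class quoted
above; its imports are already present there):

public static class Reduce
    extends Reducer<Text, IntWritable, Text, IntWritable> {

  @Override  // would not compile with Iterator -- @Override catches the bug
  protected void reduce(Text key, Iterable<IntWritable> values, Context context)
      throws IOException, InterruptedException {
    int sum = 0;
    for (IntWritable val : values) {
      sum += val.get();  // add up the 1s collected for this (filename word) key
    }
    context.write(key, new IntWritable(sum));  // e.g. "vinitha.txt Java  4"
  }
}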




Re: Need FileName with Content

Posted by Ranjini Rathinam <ra...@gmail.com>.
Hi,

Thanks for the great support i have fixed the issue. I have now got
the output.

But , i have one query ,Possible to give runtime argument for mapper class

like,

Giving the value C,JAVA in runtime.



* if((sp[k].equalsIgnoreCase("C"))){*
                                    while (itr.hasMoreTokens()) {
                                           word.set(pp.getName() + " " +
itr.nextToken());

                                        context.write(word, one);
                                        }
                                    }

*    if((sp[k].equalsIgnoreCase("JAVA"))){*
                                         while (itr.hasMoreTokens()) {
                                           word.set(pp.getName() + " " +
itr.nextToken());

                                        context.write(word, one);

 Thanks a lot .

Ranjini



On Fri, Mar 21, 2014 at 11:45 AM, Ranjini Rathinam
<ra...@gmail.com>wrote:

> Hi,
>
>
> Thanks a lot for the great support. I am just learning hadoop and
> mapreduce.
>
> I have used the way you have guided me.
>
> But the output is coming without Aggreating
>
> vinitha.txt C    1
> vinitha.txt Java    1
> vinitha.txt Java    1
> vinitha.txt Java    1
> vinitha.txt Java    1
>
>
> *I need the output has *
>
>  *vinitha       C    1*
>
> *vinitha      Java  4*
>
>
> I have reduce class but still not able to fix it, I am still trying .
>
> I have given my code below, Please let me know where i have gone wrong.
>
>
> my code
>
>
> import org.apache.hadoop.conf.Configuration;
> import org.apache.hadoop.fs.Path;
> import org.apache.hadoop.io.LongWritable;
> import org.apache.hadoop.io.*;
> import org.apache.hadoop.io.Text;
> import org.apache.hadoop.mapreduce.InputSplit;
> import org.apache.hadoop.mapreduce.lib.input.FileSplit;
> import org.apache.hadoop.mapreduce.*;
> import org.apache.hadoop.mapreduce.Job;
> import org.apache.hadoop.mapreduce.Mapper;
> import org.apache.hadoop.mapreduce.Reducer;
> import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
> import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
> import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
> import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
>
> import java.io.IOException;
> import org.apache.hadoop.fs.Path;
> import org.apache.hadoop.conf.Configuration;
> import org.apache.hadoop.fs.FileSystem;
> import org.apache.hadoop.fs.FileStatus;
> import java.util.*;
> import java.util.logging.Level;
> import java.util.logging.Logger;
>
>  public class FileCount {
>     public static class TokenizerMapper extends Mapper<LongWritable, Text,
> Text, IntWritable> {
>
>
>     private final static IntWritable one = new IntWritable(1);
>
>     private Text word = new Text();
>
>
>     public void map(LongWritable key, Text value, Context context) throws
> IOException, InterruptedException {
>
>             FileSplit fileSplit;
>               InputSplit is = context.getInputSplit();
>               FileSystem fs = FileSystem.get(context.getConfiguration());
>               fileSplit = (FileSplit) is;
>               Path pp = fileSplit.getPath();
>                     String line=value.toString();
>                     int i=0;int k=0;
>                     //Path pp = ((FileSplit)
> context.getInputSplit()).getPath();
>
>                     String[] splited = line.split("\\s+");
>                         for( i=0;i<splited.length;i++)
>                             {
>                                  String sp[]=splited[i].split(",");
>                          for( k=0;k<sp.length;k++)
>                             {
>
>                                if(!sp[k].isEmpty())
>                             {
>
>                                   StringTokenizer itr = new
> StringTokenizer(sp[k]);
>
>                                   //log.info("map on string: " + new
> String(value.getBytes()));
>
>                                 if((sp[k].equalsIgnoreCase("C"))){
>                                     while (itr.hasMoreTokens()) {
>                                            word.set(pp.getName() + " " +
> itr.nextToken());
>
>                                         context.write(word, one);
>                                         }
>                                     }
>                                 if((sp[k].equalsIgnoreCase("JAVA"))){
>                                          while (itr.hasMoreTokens()) {
>                                            word.set(pp.getName() + " " +
> itr.nextToken());
>
>                                         context.write(word, one);
>                                         }
>                                 }
>                              }
>                             }
>                         }
>
>           }
>
>   }
>
>   public static class Reduce extends Reducer<Text, IntWritable, Text,
> IntWritable> {
>
>     public void reduce(Text key, Iterator<IntWritable> values, Context
> context) throws IOException, InterruptedException {
>
>
>         int sum = 0;
>         while (values.hasNext()) {
>           sum += values.next().get();
>         }
>        context.write(key, new IntWritable(sum));
>
>       }
>     }
>     public static void main(String[] args) throws Exception {
>             Configuration conf = new Configuration();
> Job job = new Job(conf, "jobName");
>
> String input="/user/hduser/INPUT/";
> String output="/user/hduser/OUTPUT/";
> FileInputFormat.setInputPaths(job, input);
> job.setJarByClass(FileCount.class);
> job.setMapperClass(TokenizerMapper.class);
> job.setReducerClass(Reduce.class);
> job.setCombinerClass(Reduce.class);
> job.setInputFormatClass(TextInputFormat.class);
> job.setOutputKeyClass(Text.class);
> job.setOutputValueClass(IntWritable.class);
> Path outPath = new Path(output);
> FileOutputFormat.setOutputPath(job, outPath);
> FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
> if (dfs.exists(outPath)) {
> dfs.delete(outPath, true);
> }
>
>
> try {
>
> job.waitForCompletion(true);
>
> } catch (InterruptedException ex) {
> //Logger.getLogger(FileCOunt.class.getName()).log(Level.SEVERE, null, ex);
> } catch (ClassNotFoundException ex) {
> //Logger.getLogger(FileCount.class.getName()).log(Level.SEVERE, null, ex);
> }
>
> }
>
> }
>
>
> Thanks in advance for the great help and support to fix the issue .
>
> Please help to fix it.
>
> Thanks a lot.
>
> Regards,
> Ranjini
>
>
>> Hi,
>>
>> I have folder named INPUT.
>>
>> Inside INPUT i have 5 resume are there.
>>
>> hduser@localhost:~/Ranjini$ hadoop fs -ls /user/hduser/INPUT
>> Found 5 items
>> -rw-r--r--   1 hduser supergroup       5438 2014-03-18 15:20
>> /user/hduser/INPUT/Rakesh Chowdary_Microstrategy.txt
>> -rw-r--r--   1 hduser supergroup       6022 2014-03-18 15:22
>> /user/hduser/INPUT/Ramarao Devineni_Microstrategy.txt
>> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
>> /user/hduser/INPUT/vinitha.txt
>> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
>> /user/hduser/INPUT/sony.txt
>> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
>> /user/hduser/INPUT/ravi.txt
>> hduser@localhost:~/Ranjini$
>>
>> I have to process the folder and the content .
>>
>> I need ouput has
>>
>> filename   word   occurance
>> vinitha       java       4
>> sony          oracle      3
>>
>>
>>
>> But iam not getting the filename.  Has the input file content are merged
>> file name is not getting correct .
>>
>>
>> please help in this issue to fix.  I have given by code below
>>
>>
>>  import java.io.IOException;
>>  import java.util.*;
>>  import org.apache.hadoop.fs.Path;
>>  import org.apache.hadoop.conf.*;
>>  import org.apache.hadoop.io.*;
>>  import org.apache.hadoop.mapred.*;
>>  import org.apache.hadoop.util.*;
>> import java.io.File;
>> import java.io.FileReader;
>> import java.io.FileWriter;
>> import java.io.IOException;
>> import org.apache.hadoop.fs.Path;
>> import org.apache.hadoop.conf.Configuration;
>> import org.apache.hadoop.fs.FileSystem;
>> import org.apache.hadoop.fs.FileStatus;
>> import org.apache.hadoop.conf.*;
>> import org.apache.hadoop.io.*;
>> import org.apache.hadoop.mapred.*;
>> import org.apache.hadoop.util.*;
>> import org.apache.hadoop.mapred.lib.*;
>>
>>  public class WordCount {
>>     public static class Map extends MapReduceBase implements
>> Mapper<LongWritable, Text, Text, IntWritable> {
>>      private final static IntWritable one = new IntWritable(1);
>>       private Text word = new Text();
>>       public void map(LongWritable key, Text value, OutputCollector<Text,
>> IntWritable> output, Reporter reporter) throws IOException {
>>    FSDataInputStream fs=null;
>>    FileSystem hdfs = null;
>>    String line = value.toString();
>>          int i=0,k=0;
>>   try{
>>    Configuration configuration = new Configuration();
>>       configuration.set("fs.default.name", "hdfs://localhost:4440/");
>>
>>    Path srcPath = new Path("/user/hduser/INPUT/");
>>
>>    hdfs = FileSystem.get(configuration);
>>    FileStatus[] status = hdfs.listStatus(srcPath);
>>    fs=hdfs.open(srcPath);
>>    BufferedReader br=new BufferedReader(new
>> InputStreamReader(hdfs.open(srcPath)));
>>
>> String[] splited = line.split("\\s+");
>>     for( i=0;i<splited.length;i++)
>>  {
>>      String sp[]=splited[i].split(",");
>>      for( k=0;k<sp.length;k++)
>>  {
>>
>>    if(!sp[k].isEmpty()){
>> StringTokenizer tokenizer = new StringTokenizer(sp[k]);
>> if((sp[k].equalsIgnoreCase("C"))){
>>         while (tokenizer.hasMoreTokens()) {
>>           word.set(tokenizer.nextToken());
>>           output.collect(word, one);
>>         }
>> }
>> if((sp[k].equalsIgnoreCase("JAVA"))){
>>         while (tokenizer.hasMoreTokens()) {
>>           word.set(tokenizer.nextToken());
>>           output.collect(word, one);
>>         }
>> }
>>       }
>>     }
>> }
>>  } catch (IOException e) {
>>     e.printStackTrace();
>>  }
>> }
>> }
>>     public static class Reduce extends MapReduceBase implements
>> Reducer<Text, IntWritable, Text, IntWritable> {
>>       public void reduce(Text key, Iterator<IntWritable> values,
>> OutputCollector<Text, IntWritable> output, Reporter reporter) throws
>> IOException {
>>         int sum = 0;
>>         while (values.hasNext()) {
>>           sum += values.next().get();
>>         }
>>         output.collect(key, new IntWritable(sum));
>>       }
>>     }
>>     public static void main(String[] args) throws Exception {
>>
>>
>>       JobConf conf = new JobConf(WordCount.class);
>>       conf.setJobName("wordcount");
>>       conf.setOutputKeyClass(Text.class);
>>       conf.setOutputValueClass(IntWritable.class);
>>       conf.setMapperClass(Map.class);
>>       conf.setCombinerClass(Reduce.class);
>>       conf.setReducerClass(Reduce.class);
>>       conf.setInputFormat(TextInputFormat.class);
>>       conf.setOutputFormat(TextOutputFormat.class);
>>       FileInputFormat.setInputPaths(conf, new Path(args[0]));
>>       FileOutputFormat.setOutputPath(conf, new Path(args[1]));
>>       JobClient.runJob(conf);
>>     }
>>  }
>>
>>
>>
>> Please help
>>
>> Thanks in advance.
>>
>> Ranjini
>>
>>
>>
>> ----------
>> From: *Stanley Shi* <ss...@gopivotal.com>
>> Date: Thu, Mar 20, 2014 at 7:39 AM
>> To: user@hadoop.apache.org
>>
>>
>> You want to do a word count for each file, but the code give you a word
>> count for all the files, right?
>>
>> =====
>>  word.set(tokenizer.nextToken());
>>           output.collect(word, one);
>> ======
>> change it to:
>> word.set("filename"+"    "+tokenizer.nextToken());
>> output.collect(word,one);
>>
>>
>>
>>
>>  Regards,
>> *Stanley Shi,*
>>
>>
>> ----------
>> From: *Ranjini Rathinam* <ra...@gmail.com>
>> Date: Thu, Mar 20, 2014 at 10:56 AM
>> To: ranjini.r@polarisft.com
>>
>>
>>
>> ----------
>> From: *Ranjini Rathinam* <ra...@gmail.com>
>> Date: Thu, Mar 20, 2014 at 11:20 AM
>> To: user@hadoop.apache.org, sshi@gopivotal.com
>>
>>
>> Hi,
>>
>> If we give the below code,
>> =======================
>>  word.set("filename"+"    "+tokenizer.nextToken());
>> output.collect(word,one);
>> ======================
>>
>> The output is wrong. because it shows the
>>
>>  filename   word   occurance
>> vinitha       java       4
>> vinitha         oracle      3
>> sony           java       4
>> sony          oracle      3
>>
>>
>> Here vinitha does not have oracle word . Similarlly sony does not have
>> java has word. File name is merging for  all words.
>>
>> I need the output has given below
>>
>>  filename   word   occurance
>>
>> vinitha       java       4
>> vinitha         C++    3
>> sony           ETL     4
>> sony          oracle      3
>>
>>
>>  Need fileaName along with the word in that particular file only. No
>> merge should happen.
>>
>> Please help me out for this issue.
>>
>> Please help.
>>
>> Thanks in advance.
>>
>> Ranjini
>>
>> ----------
>> From: *Felix Chern* <id...@gmail.com>
>> Date: Thu, Mar 20, 2014 at 11:25 PM
>> To: user@hadoop.apache.org
>> Cc: sshi@gopivotal.com
>>
>>
>>  I've written two blog post of how to get directory context in hadoop
>> mapper.
>>
>>
>> http://www.idryman.org/blog/2014/01/26/capture-directory-context-in-hadoop-mapper/
>>
>> http://www.idryman.org/blog/2014/01/27/capture-path-info-in-hadoop-inputformat-class/
>>
>> Cheers,
>> Felix
>>
>> ----------
>> From: *Stanley Shi* <ss...@gopivotal.com>
>> Date: Fri, Mar 21, 2014 at 7:02 AM
>>
>> To: Ranjini Rathinam <ra...@gmail.com>
>> Cc: user@hadoop.apache.org
>>
>>
>> Just reviewed the code again, you are not really using map-reduce. you
>> are reading all files in one map process, this is not a normal map-reduce
>> job works.
>>
>>
>>  Regards,
>> *Stanley Shi,*
>>
>>
>> ----------
>> From: *Stanley Shi* <ss...@gopivotal.com>
>> Date: Fri, Mar 21, 2014 at 7:43 AM
>> To: Ranjini Rathinam <ra...@gmail.com>
>> Cc: user@hadoop.apache.org
>>
>>
>> Change you mapper to be something like this:
>>
>>  public static class TokenizerMapper extends
>>
>>       Mapper<Object, Text, Text, IntWritable> {
>>
>>
>>     private final static IntWritable one = new IntWritable(1);
>>
>>     private Text word = new Text();
>>
>>
>>     public void map(Object key, Text value, Context context)
>>
>>         throws IOException, InterruptedException {
>>
>>       Path pp = ((FileSplit) context.getInputSplit()).getPath();
>>
>>       StringTokenizer itr = new StringTokenizer(value.toString());
>>
>>       log.info("map on string: " + new String(value.getBytes()));
>>
>>       while (itr.hasMoreTokens()) {
>>
>>         word.set(pp.getName() + " " + itr.nextToken());
>>
>>         context.write(word, one);
>>
>>       }
>>
>>     }
>>
>>   }
>>
>> Note: add your filtering code here;
>>
>> and then when running the command, use you input path as param;
>>
>>  Regards,
>> *Stanley Shi,*
>>
>>
>> ----------
>> From: *Ranjini Rathinam* <ra...@gmail.com>
>> Date: Fri, Mar 21, 2014 at 9:57 AM
>> To: ranjini.r@polarisft.com
>>
>>
>>
>>
>>  ---------- Forwarded message ----------
>> From: Stanley Shi <ss...@gopivotal.com>
>> Date: Fri, Mar 21, 2014 at 7:43 AM
>> Subject: Re: Need FileName with Content
>>
>>
>> ----------
>> From: *Ranjini Rathinam* <ra...@gmail.com>
>> Date: Fri, Mar 21, 2014 at 9:58 AM
>> To: ranjini.r@polarisft.com
>>
>>
>>
>>
>>
>

Re: Need FileName with Content

Posted by Ranjini Rathinam <ra...@gmail.com>.
Hi,

Thanks for the great support i have fixed the issue. I have now got
the output.

But , i have one query ,Possible to give runtime argument for mapper class

like,

Giving the value C,JAVA in runtime.



* if((sp[k].equalsIgnoreCase("C"))){*
                                    while (itr.hasMoreTokens()) {
                                           word.set(pp.getName() + " " +
itr.nextToken());

                                        context.write(word, one);
                                        }
                                    }

*    if((sp[k].equalsIgnoreCase("JAVA"))){*
                                         while (itr.hasMoreTokens()) {
                                           word.set(pp.getName() + " " +
itr.nextToken());

                                        context.write(word, one);

 Thanks a lot .

Ranjini



On Fri, Mar 21, 2014 at 11:45 AM, Ranjini Rathinam
<ra...@gmail.com>wrote:

> Hi,
>
>
> Thanks a lot for the great support. I am just learning hadoop and
> mapreduce.
>
> I have used the way you have guided me.
>
> But the output is coming without Aggreating
>
> vinitha.txt C    1
> vinitha.txt Java    1
> vinitha.txt Java    1
> vinitha.txt Java    1
> vinitha.txt Java    1
>
>
> *I need the output has *
>
>  *vinitha       C    1*
>
> *vinitha      Java  4*
>
>
> I have reduce class but still not able to fix it, I am still trying .
>
> I have given my code below, Please let me know where i have gone wrong.
>
>
> my code
>
>
> import org.apache.hadoop.conf.Configuration;
> import org.apache.hadoop.fs.Path;
> import org.apache.hadoop.io.LongWritable;
> import org.apache.hadoop.io.*;
> import org.apache.hadoop.io.Text;
> import org.apache.hadoop.mapreduce.InputSplit;
> import org.apache.hadoop.mapreduce.lib.input.FileSplit;
> import org.apache.hadoop.mapreduce.*;
> import org.apache.hadoop.mapreduce.Job;
> import org.apache.hadoop.mapreduce.Mapper;
> import org.apache.hadoop.mapreduce.Reducer;
> import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
> import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
> import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
> import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
>
> import java.io.IOException;
> import org.apache.hadoop.fs.Path;
> import org.apache.hadoop.conf.Configuration;
> import org.apache.hadoop.fs.FileSystem;
> import org.apache.hadoop.fs.FileStatus;
> import java.util.*;
> import java.util.logging.Level;
> import java.util.logging.Logger;
>
>  public class FileCount {
>     public static class TokenizerMapper extends Mapper<LongWritable, Text,
> Text, IntWritable> {
>
>
>     private final static IntWritable one = new IntWritable(1);
>
>     private Text word = new Text();
>
>
>     public void map(LongWritable key, Text value, Context context) throws
> IOException, InterruptedException {
>
>             FileSplit fileSplit;
>               InputSplit is = context.getInputSplit();
>               FileSystem fs = FileSystem.get(context.getConfiguration());
>               fileSplit = (FileSplit) is;
>               Path pp = fileSplit.getPath();
>                     String line=value.toString();
>                     int i=0;int k=0;
>                     //Path pp = ((FileSplit)
> context.getInputSplit()).getPath();
>
>                     String[] splited = line.split("\\s+");
>                         for( i=0;i<splited.length;i++)
>                             {
>                                  String sp[]=splited[i].split(",");
>                          for( k=0;k<sp.length;k++)
>                             {
>
>                                if(!sp[k].isEmpty())
>                             {
>
>                                   StringTokenizer itr = new
> StringTokenizer(sp[k]);
>
>                                   //log.info("map on string: " + new
> String(value.getBytes()));
>
>                                 if((sp[k].equalsIgnoreCase("C"))){
>                                     while (itr.hasMoreTokens()) {
>                                            word.set(pp.getName() + " " +
> itr.nextToken());
>
>                                         context.write(word, one);
>                                         }
>                                     }
>                                 if((sp[k].equalsIgnoreCase("JAVA"))){
>                                          while (itr.hasMoreTokens()) {
>                                            word.set(pp.getName() + " " +
> itr.nextToken());
>
>                                         context.write(word, one);
>                                         }
>                                 }
>                              }
>                             }
>                         }
>
>           }
>
>   }
>
>   public static class Reduce extends Reducer<Text, IntWritable, Text,
> IntWritable> {
>
>     public void reduce(Text key, Iterator<IntWritable> values, Context
> context) throws IOException, InterruptedException {
>
>
>         int sum = 0;
>         while (values.hasNext()) {
>           sum += values.next().get();
>         }
>        context.write(key, new IntWritable(sum));
>
>       }
>     }
>     public static void main(String[] args) throws Exception {
>             Configuration conf = new Configuration();
> Job job = new Job(conf, "jobName");
>
> String input="/user/hduser/INPUT/";
> String output="/user/hduser/OUTPUT/";
> FileInputFormat.setInputPaths(job, input);
> job.setJarByClass(FileCount.class);
> job.setMapperClass(TokenizerMapper.class);
> job.setReducerClass(Reduce.class);
> job.setCombinerClass(Reduce.class);
> job.setInputFormatClass(TextInputFormat.class);
> job.setOutputKeyClass(Text.class);
> job.setOutputValueClass(IntWritable.class);
> Path outPath = new Path(output);
> FileOutputFormat.setOutputPath(job, outPath);
> FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
> if (dfs.exists(outPath)) {
> dfs.delete(outPath, true);
> }
>
>
> try {
>
> job.waitForCompletion(true);
>
> } catch (InterruptedException ex) {
> //Logger.getLogger(FileCOunt.class.getName()).log(Level.SEVERE, null, ex);
> } catch (ClassNotFoundException ex) {
> //Logger.getLogger(FileCount.class.getName()).log(Level.SEVERE, null, ex);
> }
>
> }
>
> }
>
>
> Thanks in advance for the great help and support to fix the issue .
>
> Please help to fix it.
>
> Thanks a lot.
>
> Regards,
> Ranjini
>
>

Re: Need FileName with Content

Posted by Ranjini Rathinam <ra...@gmail.com>.
Hi,

Thanks for the great support; I have fixed the issue and now get the
expected output.

But I have one query: is it possible to give a runtime argument to the
mapper class, i.e. to supply the values C and JAVA at runtime instead of
hard-coding them as below? (One way to do this is sketched just below.)



    if (sp[k].equalsIgnoreCase("C")) {
        while (itr.hasMoreTokens()) {
            word.set(pp.getName() + " " + itr.nextToken());
            context.write(word, one);
        }
    }

    if (sp[k].equalsIgnoreCase("JAVA")) {
        while (itr.hasMoreTokens()) {
            word.set(pp.getName() + " " + itr.nextToken());
            context.write(word, one);
        }
    }

Thanks a lot.

Ranjini
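
One common way to do that (a minimal sketch, not from this thread; the
property name "skills" and the use of args[0] are illustrative choices)
is to put the comma-separated keywords into the job Configuration in
main() and read them back once in the mapper's setup():

// In main(), before creating the Job: take the keyword list from the
// command line (e.g. args[0] = "C,JAVA") and store it in the configuration.
Configuration conf = new Configuration();
conf.set("skills", args[0]);
Job job = new Job(conf, "jobName");

// In the mapper: read the property once per task and match against it.
// (Assumes java.util.Set and java.util.HashSet are imported.)
public static class TokenizerMapper
        extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final Set<String> skills = new HashSet<String>();

    @Override
    protected void setup(Context context) {
        // Turn "C,JAVA" into an upper-cased lookup set.
        for (String s : context.getConfiguration().get("skills", "").split(",")) {
            if (!s.trim().isEmpty()) {
                skills.add(s.trim().toUpperCase());
            }
        }
    }

    // ... and in map(), the two hard-coded checks become:
    //     if (skills.contains(sp[k].toUpperCase())) { ... }
}

If the job is launched through ToolRunner/GenericOptionsParser, the same
value can instead be passed as a generic option on the command line, e.g.
hadoop jar filecount.jar FileCount -D skills=C,JAVA.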



On Fri, Mar 21, 2014 at 11:45 AM, Ranjini Rathinam
<ra...@gmail.com> wrote:

> Hi,
>
> Thanks a lot for the great support. I am just learning Hadoop and
> MapReduce, and I have used the approach you suggested.
>
> But the output is coming out without aggregating:
>
> vinitha.txt C       1
> vinitha.txt Java    1
> vinitha.txt Java    1
> vinitha.txt Java    1
> vinitha.txt Java    1
>
> I need the output as:
>
> vinitha    C       1
> vinitha    Java    4
>
> I have a reduce class but am still not able to fix it; I am still trying.
>
> I have given my code below. Please let me know where I have gone wrong.
>
>
> My code:
>
> import java.io.IOException;
> import java.util.*;
>
> import org.apache.hadoop.conf.Configuration;
> import org.apache.hadoop.fs.FileSystem;
> import org.apache.hadoop.fs.Path;
> import org.apache.hadoop.io.IntWritable;
> import org.apache.hadoop.io.LongWritable;
> import org.apache.hadoop.io.Text;
> import org.apache.hadoop.mapreduce.Job;
> import org.apache.hadoop.mapreduce.Mapper;
> import org.apache.hadoop.mapreduce.Reducer;
> import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
> import org.apache.hadoop.mapreduce.lib.input.FileSplit;
> import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
> import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
>
> public class FileCount {
>
>     public static class TokenizerMapper
>             extends Mapper<LongWritable, Text, Text, IntWritable> {
>
>         private final static IntWritable one = new IntWritable(1);
>         private Text word = new Text();
>
>         public void map(LongWritable key, Text value, Context context)
>                 throws IOException, InterruptedException {
>             // The input split tells us which file this line came from.
>             Path pp = ((FileSplit) context.getInputSplit()).getPath();
>             String line = value.toString();
>             String[] splited = line.split("\\s+");
>             for (int i = 0; i < splited.length; i++) {
>                 String[] sp = splited[i].split(",");
>                 for (int k = 0; k < sp.length; k++) {
>                     if (!sp[k].isEmpty()) {
>                         StringTokenizer itr = new StringTokenizer(sp[k]);
>                         if (sp[k].equalsIgnoreCase("C")) {
>                             while (itr.hasMoreTokens()) {
>                                 word.set(pp.getName() + " " + itr.nextToken());
>                                 context.write(word, one);
>                             }
>                         }
>                         if (sp[k].equalsIgnoreCase("JAVA")) {
>                             while (itr.hasMoreTokens()) {
>                                 word.set(pp.getName() + " " + itr.nextToken());
>                                 context.write(word, one);
>                             }
>                         }
>                     }
>                 }
>             }
>         }
>     }
>
>     public static class Reduce
>             extends Reducer<Text, IntWritable, Text, IntWritable> {
>
>         public void reduce(Text key, Iterator<IntWritable> values, Context context)
>                 throws IOException, InterruptedException {
>             int sum = 0;
>             while (values.hasNext()) {
>                 sum += values.next().get();
>             }
>             context.write(key, new IntWritable(sum));
>         }
>     }
>
>     public static void main(String[] args) throws Exception {
>         Configuration conf = new Configuration();
>         Job job = new Job(conf, "jobName");
>
>         String input = "/user/hduser/INPUT/";
>         String output = "/user/hduser/OUTPUT/";
>         FileInputFormat.setInputPaths(job, input);
>         job.setJarByClass(FileCount.class);
>         job.setMapperClass(TokenizerMapper.class);
>         job.setReducerClass(Reduce.class);
>         job.setCombinerClass(Reduce.class);
>         job.setInputFormatClass(TextInputFormat.class);
>         job.setOutputKeyClass(Text.class);
>         job.setOutputValueClass(IntWritable.class);
>         Path outPath = new Path(output);
>         FileOutputFormat.setOutputPath(job, outPath);
>         // Clear any previous output so FileOutputFormat does not fail.
>         FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
>         if (dfs.exists(outPath)) {
>             dfs.delete(outPath, true);
>         }
>
>         job.waitForCompletion(true);
>     }
> }
>
>
> Thanks in advance for the great help and support in fixing this issue.
>
> Please help me fix it.
>
> Thanks a lot.
>
> Regards,
> Ranjini

Re: Need FileName with Content

Posted by Ranjini Rathinam <ra...@gmail.com>.
Hi,

Thanks a lot for the great support. I am just learning Hadoop and
MapReduce, and I have used the approach you suggested.

But the output is coming out without aggregating:

vinitha.txt C       1
vinitha.txt Java    1
vinitha.txt Java    1
vinitha.txt Java    1
vinitha.txt Java    1

I need the output as:

vinitha    C       1
vinitha    Java    4

I have a reduce class but am still not able to fix it; I am still trying.
(A likely cause and a corrected Reduce class are sketched at the end of
this message.)

I have given my code below. Please let me know where I have gone wrong.


My code:

import java.io.IOException;
import java.util.*;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FileCount {

    public static class TokenizerMapper
            extends Mapper<LongWritable, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // The input split tells us which file this line came from.
            Path pp = ((FileSplit) context.getInputSplit()).getPath();
            String line = value.toString();
            String[] splited = line.split("\\s+");
            for (int i = 0; i < splited.length; i++) {
                String[] sp = splited[i].split(",");
                for (int k = 0; k < sp.length; k++) {
                    if (!sp[k].isEmpty()) {
                        StringTokenizer itr = new StringTokenizer(sp[k]);
                        if (sp[k].equalsIgnoreCase("C")) {
                            while (itr.hasMoreTokens()) {
                                word.set(pp.getName() + " " + itr.nextToken());
                                context.write(word, one);
                            }
                        }
                        if (sp[k].equalsIgnoreCase("JAVA")) {
                            while (itr.hasMoreTokens()) {
                                word.set(pp.getName() + " " + itr.nextToken());
                                context.write(word, one);
                            }
                        }
                    }
                }
            }
        }
    }

    public static class Reduce
            extends Reducer<Text, IntWritable, Text, IntWritable> {

        public void reduce(Text key, Iterator<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "jobName");

        String input = "/user/hduser/INPUT/";
        String output = "/user/hduser/OUTPUT/";
        FileInputFormat.setInputPaths(job, input);
        job.setJarByClass(FileCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setReducerClass(Reduce.class);
        job.setCombinerClass(Reduce.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        Path outPath = new Path(output);
        FileOutputFormat.setOutputPath(job, outPath);
        // Clear any previous output so FileOutputFormat does not fail.
        FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
        if (dfs.exists(outPath)) {
            dfs.delete(outPath, true);
        }

        job.waitForCompletion(true);
    }
}


Thanks in advance for the great help and support in fixing this issue.

Please help me fix it.

Thanks a lot.

Regards,
Ranjini
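
The likely cause of the missing aggregation: in the new
org.apache.hadoop.mapreduce API, Reducer's reduce method takes an
Iterable<IntWritable>, not an Iterator<IntWritable>. Because the reduce
above declares Iterator, it only overloads (and never overrides) the
framework's reduce, so Hadoop falls back to the default identity reduce
and writes every (filename word, 1) pair out unaggregated, which is
exactly the symptom shown. A minimal corrected Reduce, with @Override
added so the compiler would catch this kind of mismatch:

public static class Reduce
        extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        // Total up the 1s emitted for this "filename word" key.
        for (IntWritable val : values) {
            sum += val.get();
        }
        context.write(key, new IntWritable(sum));
    }
}

With that one change both the combiner and the reducer aggregate, so keys
such as "vinitha.txt Java" collapse to a single line with the summed
count. If the ".txt" suffix should not appear in the output, the mapper
could emit pp.getName().replaceAll("\\.[^.]*$", "") instead of
pp.getName(); that tweak is illustrative, not something from the thread.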



Re: Need FileName with Content

Posted by Ranjini Rathinam <ra...@gmail.com>.
Hi,


Thanks a lot for the great support. I am just learning hadoop and
mapreduce.

I have used the way you have guided me.

But the output is coming without Aggreating

vinitha.txt C    1
vinitha.txt Java    1
vinitha.txt Java    1
vinitha.txt Java    1
vinitha.txt Java    1


*I need the output has *

 *vinitha       C    1*

*vinitha      Java  4*


I have reduce class but still not able to fix it, I am still trying .

I have given my code below, Please let me know where i have gone wrong.


my code


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.*;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;

 public class FileCount {
    public static class TokenizerMapper extends Mapper<LongWritable, Text,
Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);

    private Text word = new Text();


    public void map(LongWritable key, Text value, Context context) throws
IOException, InterruptedException {

            FileSplit fileSplit;
              InputSplit is = context.getInputSplit();
              FileSystem fs = FileSystem.get(context.getConfiguration());
              fileSplit = (FileSplit) is;
              Path pp = fileSplit.getPath();
                    String line=value.toString();
                    int i=0;int k=0;
                    //Path pp = ((FileSplit)
context.getInputSplit()).getPath();

                    String[] splited = line.split("\\s+");
                        for( i=0;i<splited.length;i++)
                            {
                                 String sp[]=splited[i].split(",");
                         for( k=0;k<sp.length;k++)
                            {

                               if(!sp[k].isEmpty())
                            {

                                  StringTokenizer itr = new
StringTokenizer(sp[k]);

                                  //log.info("map on string: " + new
String(value.getBytes()));
                                if((sp[k].equalsIgnoreCase("C"))){
                                    while (itr.hasMoreTokens()) {
                                           word.set(pp.getName() + " " +
itr.nextToken());

                                        context.write(word, one);
                                        }
                                    }
                                if((sp[k].equalsIgnoreCase("JAVA"))){
                                         while (itr.hasMoreTokens()) {
                                           word.set(pp.getName() + " " +
itr.nextToken());

                                        context.write(word, one);
                                        }
                                }
                             }
                            }
                        }

          }

  }

  public static class Reduce extends Reducer<Text, IntWritable, Text,
IntWritable> {

    public void reduce(Text key, Iterator<IntWritable> values, Context
context) throws IOException, InterruptedException {

        int sum = 0;
        while (values.hasNext()) {
          sum += values.next().get();
        }
       context.write(key, new IntWritable(sum));
      }
    }
    public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
Job job = new Job(conf, "jobName");

String input="/user/hduser/INPUT/";
String output="/user/hduser/OUTPUT/";
FileInputFormat.setInputPaths(job, input);
job.setJarByClass(FileCount.class);
job.setMapperClass(TokenizerMapper.class);
job.setReducerClass(Reduce.class);
job.setCombinerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
Path outPath = new Path(output);
FileOutputFormat.setOutputPath(job, outPath);
FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
if (dfs.exists(outPath)) {
dfs.delete(outPath, true);
}


try {

job.waitForCompletion(true);

} catch (InterruptedException ex) {
//Logger.getLogger(FileCOunt.class.getName()).log(Level.SEVERE, null, ex);
} catch (ClassNotFoundException ex) {
//Logger.getLogger(FileCount.class.getName()).log(Level.SEVERE, null, ex);
}

}

}


Thanks in advance for the great help and support to fix the issue .

Please help to fix it.

Thanks a lot.

Regards,
Ranjini


> Hi,
>
> I have folder named INPUT.
>
> Inside INPUT i have 5 resume are there.
>
> hduser@localhost:~/Ranjini$ hadoop fs -ls /user/hduser/INPUT
> Found 5 items
> -rw-r--r--   1 hduser supergroup       5438 2014-03-18 15:20
> /user/hduser/INPUT/Rakesh Chowdary_Microstrategy.txt
> -rw-r--r--   1 hduser supergroup       6022 2014-03-18 15:22
> /user/hduser/INPUT/Ramarao Devineni_Microstrategy.txt
> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
> /user/hduser/INPUT/vinitha.txt
> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
> /user/hduser/INPUT/sony.txt
> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
> /user/hduser/INPUT/ravi.txt
> hduser@localhost:~/Ranjini$
>
> I have to process the folder and the content .
>
> I need ouput has
>
> filename   word   occurance
> vinitha       java       4
> sony          oracle      3
>
>
>
> But iam not getting the filename.  Has the input file content are merged
> file name is not getting correct .
>
>
> please help in this issue to fix.  I have given by code below
>
>
>  import java.io.IOException;
>  import java.util.*;
>  import org.apache.hadoop.fs.Path;
>  import org.apache.hadoop.conf.*;
>  import org.apache.hadoop.io.*;
>  import org.apache.hadoop.mapred.*;
>  import org.apache.hadoop.util.*;
> import java.io.File;
> import java.io.FileReader;
> import java.io.FileWriter;
> import java.io.IOException;
> import org.apache.hadoop.fs.Path;
> import org.apache.hadoop.conf.Configuration;
> import org.apache.hadoop.fs.FileSystem;
> import org.apache.hadoop.fs.FileStatus;
> import org.apache.hadoop.conf.*;
> import org.apache.hadoop.io.*;
> import org.apache.hadoop.mapred.*;
> import org.apache.hadoop.util.*;
> import org.apache.hadoop.mapred.lib.*;
>
>  public class WordCount {
>     public static class Map extends MapReduceBase implements
> Mapper<LongWritable, Text, Text, IntWritable> {
>      private final static IntWritable one = new IntWritable(1);
>       private Text word = new Text();
>       public void map(LongWritable key, Text value, OutputCollector<Text,
> IntWritable> output, Reporter reporter) throws IOException {
>    FSDataInputStream fs=null;
>    FileSystem hdfs = null;
>    String line = value.toString();
>          int i=0,k=0;
>   try{
>    Configuration configuration = new Configuration();
>       configuration.set("fs.default.name", "hdfs://localhost:4440/");
>
>    Path srcPath = new Path("/user/hduser/INPUT/");
>
>    hdfs = FileSystem.get(configuration);
>    FileStatus[] status = hdfs.listStatus(srcPath);
>    fs=hdfs.open(srcPath);
>    BufferedReader br=new BufferedReader(new
> InputStreamReader(hdfs.open(srcPath)));
>
> String[] splited = line.split("\\s+");
>     for( i=0;i<splited.length;i++)
>  {
>      String sp[]=splited[i].split(",");
>      for( k=0;k<sp.length;k++)
>  {
>
>    if(!sp[k].isEmpty()){
> StringTokenizer tokenizer = new StringTokenizer(sp[k]);
> if((sp[k].equalsIgnoreCase("C"))){
>         while (tokenizer.hasMoreTokens()) {
>           word.set(tokenizer.nextToken());
>           output.collect(word, one);
>         }
> }
> if((sp[k].equalsIgnoreCase("JAVA"))){
>         while (tokenizer.hasMoreTokens()) {
>           word.set(tokenizer.nextToken());
>           output.collect(word, one);
>         }
> }
>       }
>     }
> }
>  } catch (IOException e) {
>     e.printStackTrace();
>  }
> }
> }
>     public static class Reduce extends MapReduceBase implements
> Reducer<Text, IntWritable, Text, IntWritable> {
>       public void reduce(Text key, Iterator<IntWritable> values,
> OutputCollector<Text, IntWritable> output, Reporter reporter) throws
> IOException {
>         int sum = 0;
>         while (values.hasNext()) {
>           sum += values.next().get();
>         }
>         output.collect(key, new IntWritable(sum));
>       }
>     }
>     public static void main(String[] args) throws Exception {
>
>
>       JobConf conf = new JobConf(WordCount.class);
>       conf.setJobName("wordcount");
>       conf.setOutputKeyClass(Text.class);
>       conf.setOutputValueClass(IntWritable.class);
>       conf.setMapperClass(Map.class);
>       conf.setCombinerClass(Reduce.class);
>       conf.setReducerClass(Reduce.class);
>       conf.setInputFormat(TextInputFormat.class);
>       conf.setOutputFormat(TextOutputFormat.class);
>       FileInputFormat.setInputPaths(conf, new Path(args[0]));
>       FileOutputFormat.setOutputPath(conf, new Path(args[1]));
>       JobClient.runJob(conf);
>     }
>  }
>
>
>
> Please help
>
> Thanks in advance.
>
> Ranjini
>
>
>
> ----------
> From: *Stanley Shi* <ss...@gopivotal.com>
> Date: Thu, Mar 20, 2014 at 7:39 AM
> To: user@hadoop.apache.org
>
>
> You want to do a word count for each file, but the code give you a word
> count for all the files, right?
>
> =====
>  word.set(tokenizer.nextToken());
>           output.collect(word, one);
> ======
> change it to:
> word.set("filename"+"    "+tokenizer.nextToken());
> output.collect(word,one);
>
>
>
>
>  Regards,
> *Stanley Shi,*
>
>
> ----------
> From: *Ranjini Rathinam* <ra...@gmail.com>
> Date: Thu, Mar 20, 2014 at 10:56 AM
> To: ranjini.r@polarisft.com
>
>
>
> ----------
> From: *Ranjini Rathinam* <ra...@gmail.com>
> Date: Thu, Mar 20, 2014 at 11:20 AM
> To: user@hadoop.apache.org, sshi@gopivotal.com
>
>
> Hi,
>
> If we give the below code,
> =======================
>  word.set("filename"+"    "+tokenizer.nextToken());
> output.collect(word,one);
> ======================
>
> The output is wrong. because it shows the
>
>  filename   word   occurance
> vinitha       java       4
> vinitha         oracle      3
> sony           java       4
> sony          oracle      3
>
>
> Here vinitha does not have oracle word . Similarlly sony does not have
> java has word. File name is merging for  all words.
>
> I need the output has given below
>
>  filename   word   occurance
>
> vinitha       java       4
> vinitha         C++    3
> sony           ETL     4
> sony          oracle      3
>
>
>  Need fileaName along with the word in that particular file only. No merge
> should happen.
>
> Please help me out for this issue.
>
> Please help.
>
> Thanks in advance.
>
> Ranjini
>
> ----------
> From: *Felix Chern* <id...@gmail.com>
> Date: Thu, Mar 20, 2014 at 11:25 PM
> To: user@hadoop.apache.org
> Cc: sshi@gopivotal.com
>
>
>  I've written two blog post of how to get directory context in hadoop
> mapper.
>
>
> http://www.idryman.org/blog/2014/01/26/capture-directory-context-in-hadoop-mapper/
>
> http://www.idryman.org/blog/2014/01/27/capture-path-info-in-hadoop-inputformat-class/
>
> Cheers,
> Felix
>
> ----------
> From: *Stanley Shi* <ss...@gopivotal.com>
> Date: Fri, Mar 21, 2014 at 7:02 AM
>
> To: Ranjini Rathinam <ra...@gmail.com>
> Cc: user@hadoop.apache.org
>
>
> Just reviewed the code again, you are not really using map-reduce. you are
> reading all files in one map process, this is not a normal map-reduce job
> works.
>
>
>  Regards,
> *Stanley Shi,*
>
>
> ----------
> From: *Stanley Shi* <ss...@gopivotal.com>
> Date: Fri, Mar 21, 2014 at 7:43 AM
> To: Ranjini Rathinam <ra...@gmail.com>
> Cc: user@hadoop.apache.org
>
>
> Change you mapper to be something like this:
>
>  public static class TokenizerMapper extends
>
>       Mapper<Object, Text, Text, IntWritable> {
>
>
>     private final static IntWritable one = new IntWritable(1);
>
>     private Text word = new Text();
>
>
>     public void map(Object key, Text value, Context context)
>
>         throws IOException, InterruptedException {
>
>       Path pp = ((FileSplit) context.getInputSplit()).getPath();
>
>       StringTokenizer itr = new StringTokenizer(value.toString());
>
>       log.info("map on string: " + new String(value.getBytes()));
>
>       while (itr.hasMoreTokens()) {
>
>         word.set(pp.getName() + " " + itr.nextToken());
>
>         context.write(word, one);
>
>       }
>
>     }
>
>   }
>
> Note: add your filtering code here;
>
> and then when running the command, use you input path as param;
>
>  Regards,
> *Stanley Shi,*
>
>
> ----------
> From: *Ranjini Rathinam* <ra...@gmail.com>
> Date: Fri, Mar 21, 2014 at 9:57 AM
> To: ranjini.r@polarisft.com
>
>
>
>
>  ---------- Forwarded message ----------
> From: Stanley Shi <ss...@gopivotal.com>
> Date: Fri, Mar 21, 2014 at 7:43 AM
> Subject: Re: Need FileName with Content
>
>
> ----------
> From: *Ranjini Rathinam* <ra...@gmail.com>
> Date: Fri, Mar 21, 2014 at 9:58 AM
> To: ranjini.r@polarisft.com
>
>
>
>
>

Re: Need FileName with Content

Posted by Ranjini Rathinam <ra...@gmail.com>.
Hi,


Thanks a lot for the great support. I am just learning hadoop and
mapreduce.

I have used the way you have guided me.

But the output is coming without Aggreating

vinitha.txt C    1
vinitha.txt Java    1
vinitha.txt Java    1
vinitha.txt Java    1
vinitha.txt Java    1


*I need the output has *

 *vinitha       C    1*

*vinitha      Java  4*


I have reduce class but still not able to fix it, I am still trying .

I have given my code below, Please let me know where i have gone wrong.


my code


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.*;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;

 public class FileCount {
    public static class TokenizerMapper extends Mapper<LongWritable, Text,
Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);

    private Text word = new Text();


    public void map(LongWritable key, Text value, Context context) throws
IOException, InterruptedException {

            FileSplit fileSplit;
              InputSplit is = context.getInputSplit();
              FileSystem fs = FileSystem.get(context.getConfiguration());
              fileSplit = (FileSplit) is;
              Path pp = fileSplit.getPath();
                    String line=value.toString();
                    int i=0;int k=0;
                    //Path pp = ((FileSplit)
context.getInputSplit()).getPath();

                    String[] splited = line.split("\\s+");
                        for( i=0;i<splited.length;i++)
                            {
                                 String sp[]=splited[i].split(",");
                         for( k=0;k<sp.length;k++)
                            {

                               if(!sp[k].isEmpty())
                            {

                                  StringTokenizer itr = new
StringTokenizer(sp[k]);

                                  //log.info("map on string: " + new
String(value.getBytes()));
                                if((sp[k].equalsIgnoreCase("C"))){
                                    while (itr.hasMoreTokens()) {
                                           word.set(pp.getName() + " " +
itr.nextToken());

                                        context.write(word, one);
                                        }
                                    }
                                if((sp[k].equalsIgnoreCase("JAVA"))){
                                         while (itr.hasMoreTokens()) {
                                           word.set(pp.getName() + " " +
itr.nextToken());

                                        context.write(word, one);
                                        }
                                }
                             }
                            }
                        }

          }

  }

  public static class Reduce extends Reducer<Text, IntWritable, Text,
IntWritable> {

    public void reduce(Text key, Iterator<IntWritable> values, Context
context) throws IOException, InterruptedException {

        int sum = 0;
        while (values.hasNext()) {
          sum += values.next().get();
        }
       context.write(key, new IntWritable(sum));
      }
    }
    public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
Job job = new Job(conf, "jobName");

String input="/user/hduser/INPUT/";
String output="/user/hduser/OUTPUT/";
FileInputFormat.setInputPaths(job, input);
job.setJarByClass(FileCount.class);
job.setMapperClass(TokenizerMapper.class);
job.setReducerClass(Reduce.class);
job.setCombinerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
Path outPath = new Path(output);
FileOutputFormat.setOutputPath(job, outPath);
FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
if (dfs.exists(outPath)) {
dfs.delete(outPath, true);
}


try {

job.waitForCompletion(true);

} catch (InterruptedException ex) {
//Logger.getLogger(FileCOunt.class.getName()).log(Level.SEVERE, null, ex);
} catch (ClassNotFoundException ex) {
//Logger.getLogger(FileCount.class.getName()).log(Level.SEVERE, null, ex);
}

}

}


Thanks in advance for the great help and support to fix the issue .

Please help to fix it.

Thanks a lot.

Regards,
Ranjini


> ----------
> From: *Stanley Shi* <ss...@gopivotal.com>
> Date: Thu, Mar 20, 2014 at 7:39 AM
> To: user@hadoop.apache.org
>
>
> You want to do a word count for each file, but the code gives you a word
> count for all the files, right?
>
> =====
>  word.set(tokenizer.nextToken());
>           output.collect(word, one);
> ======
> change it to:
> word.set("filename"+"    "+tokenizer.nextToken());
> output.collect(word,one);
>
>
>
>
>  Regards,
> *Stanley Shi,*
>
>
> ----------
> From: *Ranjini Rathinam* <ra...@gmail.com>
> Date: Thu, Mar 20, 2014 at 11:20 AM
> To: user@hadoop.apache.org, sshi@gopivotal.com
>
>
> Hi,
>
> If we give the below code,
> =======================
>  word.set("filename"+"    "+tokenizer.nextToken());
> output.collect(word,one);
> ======================
>
> The output is wrong, because it shows:
>
>  filename   word   occurrence
> vinitha       java       4
> vinitha         oracle      3
> sony           java       4
> sony          oracle      3
>
>
> Here vinitha does not have the word oracle. Similarly, sony does not have
> the word java. The file name is being merged across all the words.
>
> I need the output as given below
>
>  filename   word   occurrence
>
> vinitha       java       4
> vinitha         C++    3
> sony           ETL     4
> sony          oracle      3
>
>
>  Need the fileName along with the words from that particular file only. No
> merging should happen.
>
> Please help me out for this issue.
>
> Please help.
>
> Thanks in advance.
>
> Ranjini
>
> ----------
> From: *Felix Chern* <id...@gmail.com>
> Date: Thu, Mar 20, 2014 at 11:25 PM
> To: user@hadoop.apache.org
> Cc: sshi@gopivotal.com
>
>
>  I've written two blog posts on how to get the directory context in a hadoop
> mapper.
>
>
> http://www.idryman.org/blog/2014/01/26/capture-directory-context-in-hadoop-mapper/
>
> http://www.idryman.org/blog/2014/01/27/capture-path-info-in-hadoop-inputformat-class/
>
> Cheers,
> Felix
>
> ----------
> From: *Stanley Shi* <ss...@gopivotal.com>
> Date: Fri, Mar 21, 2014 at 7:02 AM
>
> To: Ranjini Rathinam <ra...@gmail.com>
> Cc: user@hadoop.apache.org
>
>
> Just reviewed the code again: you are not really using map-reduce. You are
> reading all the files in one map process, which is not how a normal
> map-reduce job works.
>
>
>  Regards,
> *Stanley Shi,*
>
>
> ----------
> From: *Stanley Shi* <ss...@gopivotal.com>
> Date: Fri, Mar 21, 2014 at 7:43 AM
> To: Ranjini Rathinam <ra...@gmail.com>
> Cc: user@hadoop.apache.org
>
>
> Change your mapper to be something like this:
>
>  public static class TokenizerMapper extends
>
>       Mapper<Object, Text, Text, IntWritable> {
>
>
>     private final static IntWritable one = new IntWritable(1);
>
>     private Text word = new Text();
>
>
>     public void map(Object key, Text value, Context context)
>
>         throws IOException, InterruptedException {
>
>       Path pp = ((FileSplit) context.getInputSplit()).getPath();
>
>       StringTokenizer itr = new StringTokenizer(value.toString());
>
>       log.info("map on string: " + new String(value.getBytes()));
>
>       while (itr.hasMoreTokens()) {
>
>         word.set(pp.getName() + " " + itr.nextToken());
>
>         context.write(word, one);
>
>       }
>
>     }
>
>   }
>
> Note: add your filtering code here;
>
> and then, when running the command, pass your input path as the parameter.
>
>  Regards,
> *Stanley Shi,*
>
>


Re: Need FileName with Content

Posted by Stanley Shi <ss...@gopivotal.com>.
Change your mapper to be something like this:

public static class TokenizerMapper extends
      Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      // Path of the file this split was taken from.
      Path pp = ((FileSplit) context.getInputSplit()).getPath();
      StringTokenizer itr = new StringTokenizer(value.toString());
      // log is assumed to be a Logger declared elsewhere in the class.
      // value.toString() is used instead of new String(value.getBytes())
      // because Text's backing byte array can be longer than its contents.
      log.info("map on string: " + value.toString());
      while (itr.hasMoreTokens()) {
        word.set(pp.getName() + " " + itr.nextToken());
        context.write(word, one);
      }
    }
  }

Note: add your filtering code here (a sketch with the filtering folded in
appears below);

and then, when running the command, pass your input path as the parameter.

Regards,
*Stanley Shi,*
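
For concreteness, here is a minimal sketch of that mapper with the C/JAVA
keyword filtering folded in. The class name FilteredTokenizerMapper and the
KEYWORDS set are illustrative only, not something from this thread:

import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class FilteredTokenizerMapper extends
    Mapper<Object, Text, Text, IntWritable> {

  // Only these skills are counted; extend the set as needed.
  private static final Set<String> KEYWORDS =
      new HashSet<String>(Arrays.asList("c", "java"));

  private final static IntWritable one = new IntWritable(1);
  private Text word = new Text();

  @Override
  public void map(Object key, Text value, Context context)
      throws IOException, InterruptedException {
    // Name of the file this split came from.
    Path pp = ((FileSplit) context.getInputSplit()).getPath();
    // Split on whitespace and commas in one pass.
    for (String token : value.toString().split("[\\s,]+")) {
      if (KEYWORDS.contains(token.toLowerCase())) {
        word.set(pp.getName() + " " + token);
        context.write(word, one);
      }
    }
  }
}

Each map() call handles only the lines of its own split, so once the reducer
sums the counts, the totals come out per file.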




Re: Need FileName with Content

Posted by Stanley Shi <ss...@gopivotal.com>.
Just reviewed the code again: you are not really using map-reduce. You are
reading all the files in one map process, which is not how a normal
map-reduce job works.


Regards,
*Stanley Shi,*
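
To make that point concrete, a bare-bones driver sketch (class name and paths
are assumed from earlier in this thread; the mapper/reducer wiring is
deliberately left out):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class DriverSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "per-file word count");
    job.setJarByClass(DriverSketch.class);
    // The InputFormat enumerates the directory and creates splits, so
    // each map() call already sees lines from exactly one file; the
    // mapper never needs to list or open /user/hduser/INPUT itself.
    FileInputFormat.setInputPaths(job, new Path("/user/hduser/INPUT/"));
    FileOutputFormat.setOutputPath(job, new Path("/user/hduser/OUTPUT/"));
    // Mapper/reducer wiring goes here, as in the FileCount driver above;
    // with none set, Hadoop runs the identity mapper and reducer.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

The point is that FileInputFormat does the directory listing; each mapper
then receives one split, from one file, through its map() method.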




Re: Need FileName with Content

Posted by Felix Chern <id...@gmail.com>.
I've written two blog posts on how to get the directory context in a Hadoop mapper.

http://www.idryman.org/blog/2014/01/26/capture-directory-context-in-hadoop-mapper/
http://www.idryman.org/blog/2014/01/27/capture-path-info-in-hadoop-inputformat-class/

Cheers,
Felix
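
Judging from the titles, the gist is that the mapper's input split already
carries the path information. Below is a minimal sketch with the newer
mapreduce API; the class and field names are illustrative, and this is one
common way to do it rather than necessarily the exact code from those posts:

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class PathAwareMapper
    extends Mapper<LongWritable, Text, Text, IntWritable> {
  private String fileName;
  private String parentDir;

  @Override
  protected void setup(Context context)
      throws IOException, InterruptedException {
    // The split knows exactly which file this mapper instance is reading.
    FileSplit split = (FileSplit) context.getInputSplit();
    fileName = split.getPath().getName();              // e.g. "vinitha.txt"
    parentDir = split.getPath().getParent().getName(); // directory context, if needed
  }

  @Override
  protected void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    // Prefix every word with the file it came from.
    for (String token : value.toString().split("\\s+")) {
      if (!token.isEmpty()) {
        context.write(new Text(fileName + "\t" + token), new IntWritable(1));
      }
    }
  }
}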




Re: Need FileName with Content

Posted by Ranjini Rathinam <ra...@gmail.com>.
Hi,

If we use the code below,
=======================
word.set("filename"+"    "+tokenizer.nextToken());
output.collect(word,one);
======================

the output is wrong, because it shows:

filename   word     occurrence
vinitha    java     4
vinitha    oracle   3
sony       java     4
sony       oracle   3

Here vinitha does not contain the word oracle, and likewise sony does not
contain the word java. Every file name is being paired with the words of
all the files.

The output I need is given below:

filename   word     occurrence
vinitha    java     4
vinitha    C++      3
sony       ETL      4
sony       oracle   3

I need the file name paired only with the words that occur in that
particular file; no merging across files should happen.

Please help me out with this issue.

Thanks in advance.

Ranjini




On Thu, Mar 20, 2014 at 10:56 AM, Ranjini Rathinam <ra...@gmail.com> wrote:

>
>
> ---------- Forwarded message ----------
> From: Stanley Shi <ss...@gopivotal.com>
> Date: Thu, Mar 20, 2014 at 7:39 AM
> Subject: Re: Need FileName with Content
> To: user@hadoop.apache.org
>
>
> You want to do a word count for each file, but the code gives you a word
> count for all the files, right?
>
> =====
>  word.set(tokenizer.nextToken());
>           output.collect(word, one);
> ======
> change it to:
> word.set("filename"+"    "+tokenizer.nextToken());
> output.collect(word,one);
>
>
>
>
>  Regards,
> *Stanley Shi,*
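
Putting the two suggestions in this thread together, here is a minimal
end-to-end sketch of the per-file word count. It assumes the literal
"filename" placeholder is replaced by the real name taken from each mapper's
input split, and that map() only tokenizes the single line it is handed; the
class name and tokenizing choices are illustrative, not from the original
code:

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;

public class PerFileWordCount {
  public static class Map extends MapReduceBase
      implements Mapper<LongWritable, Text, Text, IntWritable> {
    private static final IntWritable one = new IntWritable(1);
    private final Text word = new Text();

    public void map(LongWritable key, Text value,
        OutputCollector<Text, IntWritable> output, Reporter reporter)
        throws IOException {
      // Real file name for this split, e.g. "vinitha.txt"
      String fileName =
          ((FileSplit) reporter.getInputSplit()).getPath().getName();
      StringTokenizer tokenizer = new StringTokenizer(value.toString());
      while (tokenizer.hasMoreTokens()) {
        // <file, word> composite key: counts can never merge across files.
        word.set(fileName + "\t" + tokenizer.nextToken());
        output.collect(word, one);
      }
    }
  }

  public static class Reduce extends MapReduceBase
      implements Reducer<Text, IntWritable, Text, IntWritable> {
    public void reduce(Text key, Iterator<IntWritable> values,
        OutputCollector<Text, IntWritable> output, Reporter reporter)
        throws IOException {
      int sum = 0;
      while (values.hasNext()) {
        sum += values.next().get();
      }
      output.collect(key, new IntWritable(sum));
    }
  }

  public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(PerFileWordCount.class);
    conf.setJobName("perfilewordcount");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);
    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));
    JobClient.runJob(conf);
  }
}

Run it the same way as the original job, with the input directory and an
output directory as arguments; each output line should then look like
"vinitha.txt  java  4", one entry per file and word, with no merging across
files.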
Re: Need FileName with Content

Posted by Ranjini Rathinam <ra...@gmail.com>.
Hi,

If we give the below code,
=======================
word.set("filename"+"    "+tokenizer.nextToken());
output.collect(word,one);
======================

The output is wrong, because it shows:

filename   word     occurrence
vinitha    java     4
vinitha    oracle   3
sony       java     4
sony       oracle   3


Here vinitha does not contain the word oracle, and similarly sony does not
contain the word java: the file name is being paired with every word from
every file.

I need the output as given below:

filename   word     occurrence

vinitha    java     4
vinitha    C++      3
sony       ETL      4
sony       oracle   3


I need the file name paired only with the words that occur in that
particular file; counts from different files must not be merged.
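
The root cause is that a fixed label gets attached to every word; the
mapper has to learn the name of the file it is actually reading. A minimal
sketch of such a mapper with the old mapred API (assuming the job keeps
TextInputFormat, which populates the map.input.file property; the
"unknown" fallback is illustrative, and the C/JAVA keyword filtering from
the original code is omitted to keep the sketch short):

 public static class Map extends MapReduceBase
     implements Mapper<LongWritable, Text, Text, IntWritable> {
   private final static IntWritable one = new IntWritable(1);
   private final Text word = new Text();
   private String fileName = "unknown";

   @Override
   public void configure(JobConf job) {
     // map.input.file is set per map task to the path being read
     String p = job.get("map.input.file");
     if (p != null) {
       fileName = new Path(p).getName();
     }
   }

   public void map(LongWritable key, Text value,
       OutputCollector<Text, IntWritable> output, Reporter reporter)
       throws IOException {
     StringTokenizer tokenizer = new StringTokenizer(value.toString());
     while (tokenizer.hasMoreTokens()) {
       // key = "<file>\t<word>", so counts never merge across files
       word.set(fileName + "\t" + tokenizer.nextToken());
       output.collect(word, one);
     }
   }
 }

With keys of that shape the existing Reduce class works unchanged: it sums
the ones per (file, word) pair.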

Please help me out with this issue.

Thanks in advance.

Ranjini




On Thu, Mar 20, 2014 at 10:56 AM, Ranjini Rathinam
<ra...@gmail.com>wrote:

>
>
> ---------- Forwarded message ----------
> From: Stanley Shi <ss...@gopivotal.com>
> Date: Thu, Mar 20, 2014 at 7:39 AM
> Subject: Re: Need FileName with Content
> To: user@hadoop.apache.org
>
>
> You want to do a word count for each file, but the code gives you a word
> count for all the files, right?
>
> =====
>  word.set(tokenizer.nextToken());
>           output.collect(word, one);
> ======
> change it to:
> word.set("filename"+"    "+tokenizer.nextToken());
> output.collect(word,one);
>
>
>
>
>  Regards,
> *Stanley Shi,*
>
>
>
> On Wed, Mar 19, 2014 at 8:50 PM, Ranjini Rathinam <ra...@gmail.com>wrote:
>
>> Hi,
>>
>> I have folder named INPUT.
>>
>> Inside INPUT i have 5 resume are there.
>>
>> hduser@localhost:~/Ranjini$ hadoop fs -ls /user/hduser/INPUT
>> Found 5 items
>> -rw-r--r--   1 hduser supergroup       5438 2014-03-18 15:20
>> /user/hduser/INPUT/Rakesh Chowdary_Microstrategy.txt
>> -rw-r--r--   1 hduser supergroup       6022 2014-03-18 15:22
>> /user/hduser/INPUT/Ramarao Devineni_Microstrategy.txt
>> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
>> /user/hduser/INPUT/vinitha.txt
>> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
>> /user/hduser/INPUT/sony.txt
>> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
>> /user/hduser/INPUT/ravi.txt
>> hduser@localhost:~/Ranjini$
>>
>> I have to process the folder and the content .
>>
>> I need ouput has
>>
>> filename   word   occurance
>> vinitha       java       4
>> sony          oracle      3
>>
>>
>>
>> But iam not getting the filename.  Has the input file content are merged
>> file name is not getting correct .
>>
>>
>> please help in this issue to fix.  I have given by code below
>>
>>
>>  import java.io.IOException;
>>  import java.util.*;
>>  import org.apache.hadoop.fs.Path;
>>  import org.apache.hadoop.conf.*;
>>  import org.apache.hadoop.io.*;
>>  import org.apache.hadoop.mapred.*;
>>  import org.apache.hadoop.util.*;
>> import java.io.File;
>> import java.io.FileReader;
>> import java.io.FileWriter;
>> import java.io.IOException;
>> import org.apache.hadoop.fs.Path;
>> import org.apache.hadoop.conf.Configuration;
>> import org.apache.hadoop.fs.FileSystem;
>> import org.apache.hadoop.fs.FileStatus;
>> import org.apache.hadoop.conf.*;
>> import org.apache.hadoop.io.*;
>> import org.apache.hadoop.mapred.*;
>> import org.apache.hadoop.util.*;
>> import org.apache.hadoop.mapred.lib.*;
>>
>>  public class WordCount {
>>     public static class Map extends MapReduceBase implements
>> Mapper<LongWritable, Text, Text, IntWritable> {
>>      private final static IntWritable one = new IntWritable(1);
>>       private Text word = new Text();
>>       public void map(LongWritable key, Text value, OutputCollector<Text,
>> IntWritable> output, Reporter reporter) throws IOException {
>>    FSDataInputStream fs=null;
>>    FileSystem hdfs = null;
>>    String line = value.toString();
>>          int i=0,k=0;
>>   try{
>>    Configuration configuration = new Configuration();
>>       configuration.set("fs.default.name", "hdfs://localhost:4440/");
>>
>>    Path srcPath = new Path("/user/hduser/INPUT/");
>>
>>    hdfs = FileSystem.get(configuration);
>>    FileStatus[] status = hdfs.listStatus(srcPath);
>>    fs=hdfs.open(srcPath);
>>    BufferedReader br=new BufferedReader(new
>> InputStreamReader(hdfs.open(srcPath)));
>>
>> String[] splited = line.split("\\s+");
>>     for( i=0;i<splited.length;i++)
>>  {
>>      String sp[]=splited[i].split(",");
>>      for( k=0;k<sp.length;k++)
>>  {
>>
>>    if(!sp[k].isEmpty()){
>> StringTokenizer tokenizer = new StringTokenizer(sp[k]);
>> if((sp[k].equalsIgnoreCase("C"))){
>>         while (tokenizer.hasMoreTokens()) {
>>           word.set(tokenizer.nextToken());
>>           output.collect(word, one);
>>         }
>> }
>> if((sp[k].equalsIgnoreCase("JAVA"))){
>>         while (tokenizer.hasMoreTokens()) {
>>           word.set(tokenizer.nextToken());
>>           output.collect(word, one);
>>         }
>> }
>>       }
>>     }
>> }
>>  } catch (IOException e) {
>>     e.printStackTrace();
>>  }
>> }
>> }
>>     public static class Reduce extends MapReduceBase implements
>> Reducer<Text, IntWritable, Text, IntWritable> {
>>       public void reduce(Text key, Iterator<IntWritable> values,
>> OutputCollector<Text, IntWritable> output, Reporter reporter) throws
>> IOException {
>>         int sum = 0;
>>         while (values.hasNext()) {
>>           sum += values.next().get();
>>         }
>>         output.collect(key, new IntWritable(sum));
>>       }
>>     }
>>     public static void main(String[] args) throws Exception {
>>
>>
>>       JobConf conf = new JobConf(WordCount.class);
>>       conf.setJobName("wordcount");
>>       conf.setOutputKeyClass(Text.class);
>>       conf.setOutputValueClass(IntWritable.class);
>>       conf.setMapperClass(Map.class);
>>       conf.setCombinerClass(Reduce.class);
>>       conf.setReducerClass(Reduce.class);
>>       conf.setInputFormat(TextInputFormat.class);
>>       conf.setOutputFormat(TextOutputFormat.class);
>>       FileInputFormat.setInputPaths(conf, new Path(args[0]));
>>       FileOutputFormat.setOutputPath(conf, new Path(args[1]));
>>       JobClient.runJob(conf);
>>     }
>>  }
>>
>>
>>
>> Please help
>>
>> Thanks in advance.
>>
>> Ranjini
>>
>>
>>
>
>


Re: Need FileName with Content

Posted by Stanley Shi <ss...@gopivotal.com>.
You want to do a word count for each file, but the code gives you a word
count for all the files, right?

=====
word.set(tokenizer.nextToken());
          output.collect(word, one);
======
change it to:
word.set("filename"+"    "+tokenizer.nextToken());
output.collect(word,one);
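
Note that "filename" above is a placeholder: it must be replaced with the
name of the file the map task is reading, otherwise the same literal label
is attached to every word. With the old mapred API one way to obtain it is
from the input split; a minimal sketch (assuming TextInputFormat, whose
splits are FileSplit instances):

=====
// inside map(): the Reporter exposes the split this task is processing
FileSplit split = (FileSplit) reporter.getInputSplit();
String fileName = split.getPath().getName();
word.set(fileName + "\t" + tokenizer.nextToken());
output.collect(word, one);
=====

FileSplit lives in org.apache.hadoop.mapred, which the posted code already
imports via a wildcard.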




Regards,
*Stanley Shi,*



On Wed, Mar 19, 2014 at 8:50 PM, Ranjini Rathinam <ra...@gmail.com>wrote:

> Hi,
>
> I have folder named INPUT.
>
> Inside INPUT i have 5 resume are there.
>
> hduser@localhost:~/Ranjini$ hadoop fs -ls /user/hduser/INPUT
> Found 5 items
> -rw-r--r--   1 hduser supergroup       5438 2014-03-18 15:20
> /user/hduser/INPUT/Rakesh Chowdary_Microstrategy.txt
> -rw-r--r--   1 hduser supergroup       6022 2014-03-18 15:22
> /user/hduser/INPUT/Ramarao Devineni_Microstrategy.txt
> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
> /user/hduser/INPUT/vinitha.txt
> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
> /user/hduser/INPUT/sony.txt
> -rw-r--r--   1 hduser supergroup       3517 2014-03-18 15:21
> /user/hduser/INPUT/ravi.txt
> hduser@localhost:~/Ranjini$
>
> I have to process the folder and the content .
>
> I need ouput has
>
> filename   word   occurance
> vinitha       java       4
> sony          oracle      3
>
>
>
> But iam not getting the filename.  Has the input file content are merged
> file name is not getting correct .
>
>
> please help in this issue to fix.  I have given by code below
>
>
>  import java.io.IOException;
>  import java.util.*;
>  import org.apache.hadoop.fs.Path;
>  import org.apache.hadoop.conf.*;
>  import org.apache.hadoop.io.*;
>  import org.apache.hadoop.mapred.*;
>  import org.apache.hadoop.util.*;
> import java.io.File;
> import java.io.FileReader;
> import java.io.FileWriter;
> import java.io.IOException;
> import org.apache.hadoop.fs.Path;
> import org.apache.hadoop.conf.Configuration;
> import org.apache.hadoop.fs.FileSystem;
> import org.apache.hadoop.fs.FileStatus;
> import org.apache.hadoop.conf.*;
> import org.apache.hadoop.io.*;
> import org.apache.hadoop.mapred.*;
> import org.apache.hadoop.util.*;
> import org.apache.hadoop.mapred.lib.*;
>
>  public class WordCount {
>     public static class Map extends MapReduceBase implements
> Mapper<LongWritable, Text, Text, IntWritable> {
>      private final static IntWritable one = new IntWritable(1);
>       private Text word = new Text();
>       public void map(LongWritable key, Text value, OutputCollector<Text,
> IntWritable> output, Reporter reporter) throws IOException {
>    FSDataInputStream fs=null;
>    FileSystem hdfs = null;
>    String line = value.toString();
>          int i=0,k=0;
>   try{
>    Configuration configuration = new Configuration();
>       configuration.set("fs.default.name", "hdfs://localhost:4440/");
>
>    Path srcPath = new Path("/user/hduser/INPUT/");
>
>    hdfs = FileSystem.get(configuration);
>    FileStatus[] status = hdfs.listStatus(srcPath);
>    fs=hdfs.open(srcPath);
>    BufferedReader br=new BufferedReader(new
> InputStreamReader(hdfs.open(srcPath)));
>
> String[] splited = line.split("\\s+");
>     for( i=0;i<splited.length;i++)
>  {
>      String sp[]=splited[i].split(",");
>      for( k=0;k<sp.length;k++)
>  {
>
>    if(!sp[k].isEmpty()){
> StringTokenizer tokenizer = new StringTokenizer(sp[k]);
> if((sp[k].equalsIgnoreCase("C"))){
>         while (tokenizer.hasMoreTokens()) {
>           word.set(tokenizer.nextToken());
>           output.collect(word, one);
>         }
> }
> if((sp[k].equalsIgnoreCase("JAVA"))){
>         while (tokenizer.hasMoreTokens()) {
>           word.set(tokenizer.nextToken());
>           output.collect(word, one);
>         }
> }
>       }
>     }
> }
>  } catch (IOException e) {
>     e.printStackTrace();
>  }
> }
> }
>     public static class Reduce extends MapReduceBase implements
> Reducer<Text, IntWritable, Text, IntWritable> {
>       public void reduce(Text key, Iterator<IntWritable> values,
> OutputCollector<Text, IntWritable> output, Reporter reporter) throws
> IOException {
>         int sum = 0;
>         while (values.hasNext()) {
>           sum += values.next().get();
>         }
>         output.collect(key, new IntWritable(sum));
>       }
>     }
>     public static void main(String[] args) throws Exception {
>
>
>       JobConf conf = new JobConf(WordCount.class);
>       conf.setJobName("wordcount");
>       conf.setOutputKeyClass(Text.class);
>       conf.setOutputValueClass(IntWritable.class);
>       conf.setMapperClass(Map.class);
>       conf.setCombinerClass(Reduce.class);
>       conf.setReducerClass(Reduce.class);
>       conf.setInputFormat(TextInputFormat.class);
>       conf.setOutputFormat(TextOutputFormat.class);
>       FileInputFormat.setInputPaths(conf, new Path(args[0]));
>       FileOutputFormat.setOutputPath(conf, new Path(args[1]));
>       JobClient.runJob(conf);
>     }
>  }
>
>
>
> Please help
>
> Thanks in advance.
>
> Ranjini
>
>
>
