Posted to common-user@hadoop.apache.org by Chhaya Vishwakarma <Ch...@lntinfotech.com> on 2013/11/28 07:45:25 UTC

XML parsing in Hadoop

Hi,


The code below parses an XML file. The output of the code is correct, but the job takes a long time to complete:
it took 20 hours to parse a 2 MB file.
Kindly suggest what changes could be made to improve the performance.



package xml;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;


import org.apache.log4j.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;


public class ReadXmlMR {

    static Logger log = Logger.getLogger(ReadXmlMR.class.getName());
    public static String fileName = new String();
    public static Document dom;

    public void configure(JobConf job) {
        fileName = job.get("map.input.file");
    }

    public static class Map extends Mapper<LongWritable, Text, Text, Text> {

        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            try {
                FileSplit fileSplit = (FileSplit) context.getInputSplit();
                Configuration conf = context.getConfiguration();

                DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();

                FSDataInputStream fstream1;
                Path file = fileSplit.getPath();
                FileSystem fs = file.getFileSystem(conf);
                fstream1 = fs.open(fileSplit.getPath());
                DocumentBuilder db = dbf.newDocumentBuilder();
                dom = db.parse(fstream1);
                Element docEle = null;
                docEle = dom.getDocumentElement();

                XPath xpath = XPathFactory.newInstance().newXPath();

                Object result = xpath.compile("//*").evaluate(dom, XPathConstants.NODESET);

                NodeList nodes = (NodeList) result;

                for (int n = 2; n < nodes.getLength(); n++) {
                    Text colvalue = new Text("");
                    Text nodename = new Text("");

                    nodename = new Text(nodes.item(n).getNodeName());
                    try {
                        colvalue = new Text(nodes.item(n).getFirstChild().getNodeValue());
                    } catch (Exception e) {
                    }
                    if (colvalue.toString().equalsIgnoreCase(null)) {
                        colvalue = new Text("");
                    }
                    context.write(nodename, colvalue);
                }

            } catch (ParserConfigurationException e) {
                e.printStackTrace();
            } catch (SAXException e) {
                e.printStackTrace();
            } catch (XPathExpressionException e) {
                e.printStackTrace();
            }
        }

    }

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();

        Job job = new Job(conf, "XmlParsing");
        job.setJarByClass(ReadXmlMR.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setMapperClass(Map.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.submit();
        job.waitForCompletion(true);
    }
}



Regards,
Chhaya Vishwakarma



Re: XML parsing in Hadoop

Posted by Adam Kawa <ka...@gmail.com>.
Alternatively, you can try an input format called WholeFileInputFormat
(nicely explained in "Hadoop: The Definitive Guide" by Tom White), where
you process a whole file as a single record in one map() call. Refer to
the book for a code example.
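
In outline, the pattern looks like the sketch below. This is a minimal,
illustrative version written against the new org.apache.hadoop.mapreduce
API, not the book's exact code: the input format declares files
non-splittable, and the record reader hands the whole file to a single
map() call as one BytesWritable record.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class WholeFileInputFormat
        extends FileInputFormat<NullWritable, BytesWritable> {

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return false; // never split the file: one map task sees the whole thing
    }

    @Override
    public RecordReader<NullWritable, BytesWritable> createRecordReader(
            InputSplit split, TaskAttemptContext context) {
        return new RecordReader<NullWritable, BytesWritable>() {
            private FileSplit fileSplit;
            private Configuration conf;
            private final BytesWritable value = new BytesWritable();
            private boolean processed = false;

            @Override
            public void initialize(InputSplit s, TaskAttemptContext ctx) {
                fileSplit = (FileSplit) s;
                conf = ctx.getConfiguration();
            }

            @Override
            public boolean nextKeyValue() throws IOException {
                if (processed) {
                    return false; // the single record was already emitted
                }
                byte[] contents = new byte[(int) fileSplit.getLength()];
                FileSystem fs = fileSplit.getPath().getFileSystem(conf);
                FSDataInputStream in = null;
                try {
                    in = fs.open(fileSplit.getPath());
                    IOUtils.readFully(in, contents, 0, contents.length);
                    value.set(contents, 0, contents.length);
                } finally {
                    IOUtils.closeStream(in);
                }
                processed = true;
                return true; // exactly one (key, value) pair per file
            }

            @Override
            public NullWritable getCurrentKey() { return NullWritable.get(); }

            @Override
            public BytesWritable getCurrentValue() { return value; }

            @Override
            public float getProgress() { return processed ? 1.0f : 0.0f; }

            @Override
            public void close() { }
        };
    }
}

The job would then call job.setInputFormatClass(WholeFileInputFormat.class)
and use a Mapper<NullWritable, BytesWritable, Text, Text> that parses the
bytes once per file.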


2013/11/28 Devaraj K <de...@apache.org>

> Hi,
>
> Here the map() function is called for every (key, value) pair, i.e. for
> every line of the split in your job, because of TextInputFormat. The XML
> parsing code you have written in map() is therefore executed once for
> every line of your input, and that is what is causing the problem.
>
> You can customize your InputFormat to read the XML file instead of
> parsing it in map(), or you could place the parsing code in the run()
> method by overriding Mapper.run(Context context).

Re: XML parsing in Hadoop

Posted by Devaraj K <de...@apache.org>.
Hi,

Here the map() function is called for every (key, value) pair, i.e. for
every line of the split in your job, because of TextInputFormat. The XML
parsing code you have written in map() is therefore executed once for every
line of your input, and that is what is causing the problem.

You can customize your InputFormat to read the XML file instead of parsing
it in map(), or you could place the parsing code in the run() method by
overriding Mapper.run(Context context).
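
For the second option, a minimal sketch could look like the following.
The class name XmlOnceMapper is illustrative, and the sketch assumes the
same imports as your posted class; it keeps your parsing logic but runs
it once per input split instead of once per line.

    public static class XmlOnceMapper extends Mapper<LongWritable, Text, Text, Text> {

        // Override run() so the split's file is read and parsed exactly
        // once per task, instead of once for every line delivered to map().
        @Override
        public void run(Context context) throws IOException, InterruptedException {
            setup(context);
            try {
                FileSplit fileSplit = (FileSplit) context.getInputSplit();
                FileSystem fs = fileSplit.getPath()
                        .getFileSystem(context.getConfiguration());
                FSDataInputStream in = fs.open(fileSplit.getPath());

                Document dom = DocumentBuilderFactory.newInstance()
                        .newDocumentBuilder().parse(in);
                NodeList nodes = (NodeList) XPathFactory.newInstance().newXPath()
                        .compile("//*").evaluate(dom, XPathConstants.NODESET);

                for (int n = 2; n < nodes.getLength(); n++) {
                    org.w3c.dom.Node child = nodes.item(n).getFirstChild();
                    String value = (child == null || child.getNodeValue() == null)
                            ? "" : child.getNodeValue();
                    context.write(new Text(nodes.item(n).getNodeName()),
                            new Text(value));
                }
                in.close();
            } catch (Exception e) {
                throw new IOException(e); // wrap checked parser/XPath exceptions
            } finally {
                cleanup(context);
            }
        }
    }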





-- 


Thanks
Devaraj K


RE: XML parsing in Hadoop

Posted by Chhaya Vishwakarma <Ch...@lntinfotech.com>.
Hi
Thank you all, finally I am able to do it.


Regards,
Chhaya Vishwakarma



From: Sofia Georgiakaki [mailto:geosofie_tuc@yahoo.com]
Sent: Thursday, November 28, 2013 2:58 PM
To: user@hadoop.apache.org
Subject: Re: XML parsing in Hadoop

Hello Chhaya,

I'm not sure why the job launches 4 map tasks, since your input file's size is 2 MB, which is less than one HDFS block (64 MB by default). I would expect only one mapper to be initialized, unless you have changed the default HDFS block size.

As I see in the code, you use TextInputFormat.class to read your input file. This means that your map function is executed once per line of your input. However, inside your map function you still read the whole input split:
FileSplit fileSplit = (FileSplit) context.getInputSplit();
This means that if you have many lines in your input (I guess you do), you read and parse the same input split multiple times, which I suspect is not what you intend.
Moreover, you might want to revise the line
if ( colvalue.toString().equalsIgnoreCase(null) )
Do you mean
if ( colvalue == null ) ?
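
For illustration, a guard along these lines (an assumed rewrite of the loop body, not tested against the original job) sidesteps the issue, since String.equalsIgnoreCase(null) always returns false and getFirstChild() itself can return null:

    // Illustrative replacement for the loop body: test the node value
    // directly instead of comparing the string against null.
    org.w3c.dom.Node child = nodes.item(n).getFirstChild();
    String nodeValue = (child == null) ? null : child.getNodeValue();
    Text colvalue = new Text(nodeValue == null ? "" : nodeValue);
    context.write(new Text(nodes.item(n).getNodeName()), colvalue);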

I think it would be helpful to read through the MapReduce programming model once more, in order to better understand when and how each map and reduce function is executed. You can use this link, http://developer.yahoo.com/hadoop/tutorial/module4.html, or the official Apache Hadoop website.
This will help you fit your algorithm into the MapReduce paradigm more easily. If you need further clarifications, I would be happy to help!

Regards,
Sofia


On Thursday, November 28, 2013 11:03 AM, Chhaya Vishwakarma <Ch...@lntinfotech.com> wrote:
2 MB file

From: unmesha sreeveni [mailto:unmeshabiju@gmail.com]
Sent: Thursday, November 28, 2013 2:23 PM
To: User Hadoop
Subject: Re: XML parsing in Hadoop

What is the size of your input file?

On Thu, Nov 28, 2013 at 2:17 PM, Chhaya Vishwakarma <Ch...@lntinfotech.com> wrote:
Hi,

Yes, I have run it without MR and it takes a few seconds, so I think it is an MR issue.
I have a single-node cluster, and it is launching 4 map tasks. I am trying with only one file.


Regards,
Chhaya Vishwakarma



From: Mirko Kämpf [mailto:mirko.kaempf@gmail.com]
Sent: Thursday, November 28, 2013 12:53 PM
To: user@hadoop.apache.org
Subject: Re: XML parsing in Hadoop

Chhaya,

did you run the same code in standalone mode, without the MapReduce framework?
How long does the code in your map() function take standalone?
Compare those two times (t_0 in MR mode, t_1 in standalone mode) to find out
whether it is an MR issue or something that comes from the XML-parser logic or the data.

Usually it should not be that slow. But what cluster do you have, how many mappers/reducers, and how many such 2 MB files do you have?
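
For instance, a tiny standalone harness like this (illustrative; the class name StandaloneParseTimer is made up, and it repeats the same DOM/XPath work on a local copy of the file) gives you t_1:

    import java.io.FileInputStream;
    import javax.xml.parsers.DocumentBuilderFactory;
    import javax.xml.xpath.XPathConstants;
    import javax.xml.xpath.XPathFactory;
    import org.w3c.dom.Document;
    import org.w3c.dom.NodeList;

    public class StandaloneParseTimer {
        public static void main(String[] args) throws Exception {
            long start = System.currentTimeMillis();
            // Same parse-and-select logic as the map() code, run locally.
            Document dom = DocumentBuilderFactory.newInstance()
                    .newDocumentBuilder().parse(new FileInputStream(args[0]));
            NodeList nodes = (NodeList) XPathFactory.newInstance().newXPath()
                    .compile("//*").evaluate(dom, XPathConstants.NODESET);
            long elapsed = System.currentTimeMillis() - start;
            System.out.println(nodes.getLength() + " nodes parsed in " + elapsed + " ms");
        }
    }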

Best wishes
Mirko


2013/11/28 Chhaya Vishwakarma <Ch...@lntinfotech.com>>
Hi,


The code below parses an XML file. The output is correct, but the job takes a long time to complete:
it took 20 hours to parse a 2MB file.
Kindly suggest what changes could be made to improve the performance.



package xml;

import java.io.IOException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.log4j.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

public class ReadXmlMR {

    static Logger log = Logger.getLogger(ReadXmlMR.class.getName());
    public static String fileName = new String();
    public static Document dom;

    public void configure(JobConf job) {
        fileName = job.get("map.input.file");
    }

    public static class Map extends Mapper<LongWritable, Text, Text, Text> {

        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            try {
                // NOTE: with TextInputFormat, map() is called once per input
                // line, and every call below re-opens and re-parses the whole
                // split -- see the discussion in the replies.
                FileSplit fileSplit = (FileSplit) context.getInputSplit();
                Configuration conf = context.getConfiguration();

                DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
                Path file = fileSplit.getPath();
                FileSystem fs = file.getFileSystem(conf);
                FSDataInputStream fstream1 = fs.open(fileSplit.getPath());
                DocumentBuilder db = dbf.newDocumentBuilder();
                dom = db.parse(fstream1);
                Element docEle = dom.getDocumentElement();

                XPath xpath = XPathFactory.newInstance().newXPath();
                Object result = xpath.compile("//*").evaluate(dom, XPathConstants.NODESET);
                NodeList nodes = (NodeList) result;

                for (int n = 2; n < nodes.getLength(); n++) {
                    Text colvalue = new Text("");
                    Text nodename = new Text(nodes.item(n).getNodeName());
                    try {
                        colvalue = new Text(nodes.item(n).getFirstChild().getNodeValue());
                    } catch (Exception e) {
                    }
                    // NOTE: equalsIgnoreCase(null) is always false, so this
                    // branch never fires -- see the replies.
                    if (colvalue.toString().equalsIgnoreCase(null)) {
                        colvalue = new Text("");
                    }
                    context.write(nodename, colvalue);
                }
            } catch (ParserConfigurationException e) {
                e.printStackTrace();
            } catch (SAXException e) {
                e.printStackTrace();
            } catch (XPathExpressionException e) {
                e.printStackTrace();
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        Job job = new Job(conf, "XmlParsing");
        job.setJarByClass(ReadXmlMR.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setMapperClass(Map.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // waitForCompletion() submits the job itself; the explicit submit()
        // is redundant but harmless.
        job.submit();
        job.waitForCompletion(true);
    }
}



Regards,
Chhaya Vishwakarma






--
Thanks & Regards

Unmesha Sreeveni U.B
Junior Developer




Re: XML parsing in Hadoop

Posted by Sofia Georgiakaki <ge...@yahoo.com>.
Hello Chhaya,

I'm not sure why the job launches 4 map tasks, since your input file's size is 2MB, which is less than one HDFS block (64MB by default). I would expect only 1 mapper to be initialized, unless you have changed the default HDFS block size.

As I see in the code, you use TextInputFormat.class to read your input file. This means that your map function will be executed once per line of your input. However, inside your map function you still read the whole input split:
FileSplit fileSplit = (FileSplit)context.getInputSplit();
This means that if you have many lines in your input (I guess you do), you read the same input split multiple times, which I suspect is wrong.
Moreover, you might want to revise the line
if ( colvalue.toString().equalsIgnoreCase(null) ) .
Do you mean
if ( colvalue==null) ?

I think it would be helpful to revisit the MapReduce programming model, to better understand when and how each map and reduce function is executed. You can use this link  http://developer.yahoo.com/hadoop/tutorial/module4.html , or the official Apache Hadoop website.
This will help you fit your algorithm in the MapReduce paradigm more easily. If you need further clarifications, I would be happy to help!

Regards,
Sofia






Re: XML parsing in Hadoop

Posted by Sofia Georgiakaki <ge...@yahoo.com>.
Hello Chhaya,

I'm not sure why the job launches 4 map tasks, since your input file's size is 2MB, which is less than 1 HDFS block (64MB by default) - I would expect to initialize only 1 mapper, unless you have changed the default HDFS block size value.

As I see in the code, you use TextInputFormat.class to read your input file. This means that your map function will be executed once per line of your input. However, inside your map function you still read all the input split:
FileSplit fileSplit = (FileSplit)context.getInputSplit();
This means that if you have many lines in your input (I guess you do), you read multiple time the same input split, which I suspect is wrong?
Moreover, you might want to revise the line
if ( colvalue.toString().equalsIgnoreCase(null) ) .
Do you mean
if ( colvalue==null) ?

I think it would be helpful to read once more the MapReduce programming model, in order to better understand when each map & reduce function is executed and how. You can use this link  http://developer.yahoo.com/hadoop/tutorial/module4.html , or the official Apache Hadoop website.
This will help you fit your algorithm in the MapReduce paradigm more easily. If you need further clarifications, I would be happy to help!

Regards,
Sofia





On Thursday, November 28, 2013 11:03 AM, Chhaya Vishwakarma <Ch...@lntinfotech.com> wrote:
 
2mB file
> 
>From:unmesha sreeveni [mailto:unmeshabiju@gmail.com] 
>Sent: Thursday, November 28, 2013 2:23 PM
>To: User Hadoop
>Subject: Re: XML parsing in Hadoop
> 
>How much is ur size of input file?
> 
>On Thu, Nov 28, 2013 at 2:17 PM, Chhaya Vishwakarma <Ch...@lntinfotech.com> wrote:
>Hi,
> 
>Yes I have run it without MR it takes few seconds to run. So I think its MR issue only
>I have a single node cluster its launching 4 map tasks. Trying with only one file.
> 
> 
>Regards,
>Chhaya Vishwakarma
> 
> 
> 
>From:Mirko Kämpf [mailto:mirko.kaempf@gmail.com] 
>Sent: Thursday, November 28, 2013 12:53 PM
>To: user@hadoop.apache.org
>Subject: Re: XML parsing in Hadoop
> 
>Chhaya,
> 
>did you run the same code in stand alone mode without MapReduce framework?
>How long takes the code in you map() function standalone? 
>Compare those two different times (t_0 MR mode, t_1 standalone mode) to find out 
>if it is a MR issue or something which comes from the xml-parser logic or the data ...
> 
>Usually it should be not that slow. But what cluster do you have and how many mappers / reducers and how many of such 2NB files do you have?
> 
>Best wishes
>Mirko
> 
> 
>2013/11/28 Chhaya Vishwakarma <Ch...@lntinfotech.com>
>Hi,
> 
> 
>The below code parses XML file, Here the output of the code is correct but the job takes long time for completion.
>It took 20 hours to parse 2MB file.
>Kindly suggest what changes could be done to increase the performance.
> 
> 
> 
>package xml;
> 
>import java.io.FileInputStream;
>import java.io.FileNotFoundException;
>import java.io.IOException;
>import java.util.*;
> 
>import javax.xml.parsers.DocumentBuilder;
>import javax.xml.parsers.DocumentBuilderFactory;
>import javax.xml.parsers.ParserConfigurationException;
>import javax.xml.xpath.XPath;
>import javax.xml.xpath.XPathConstants;
>import javax.xml.xpath.XPathExpressionException;
>import javax.xml.xpath.XPathFactory;
>        
>import org.apache.hadoop.fs.FSDataInputStream;
>import org.apache.hadoop.fs.FSInputStream;
>import org.apache.hadoop.fs.FileSystem;
>import org.apache.hadoop.fs.Path;
> 
>import org.apache.hadoop.conf.*;
>import org.apache.hadoop.io.*;
> 
>import org.apache.hadoop.mapred.JobConf;
>import org.apache.hadoop.mapreduce.*;
>import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
>import org.apache.hadoop.mapreduce.lib.input.FileSplit;
>import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
>import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
>import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
> 
> 
>import org.apache.log4j.Logger;
>import org.w3c.dom.Document;
>import org.w3c.dom.Element;
>import org.w3c.dom.NodeList;
>import org.xml.sax.SAXException;
> 
> 
>public class ReadXmlMR 
>{
>                static Logger log = Logger.getLogger(ReadXmlMR.class.getName());
>                 public static String fileName = new String();
>                 public static Document dom;
>                 public void configure(JobConf job) {
>         fileName = job.get("map.input.file");
>}
> 
>   
>                public static class Map extends Mapper<LongWritable,Text,Text,Text>
>               { 
>                                
>                                public void map(LongWritable key, Text value,Context context ) throws IOException, InterruptedException  
>                                { 
>                                                try {
>                                                                FileSplit fileSplit = (FileSplit)context.getInputSplit();
>                                                                Configuration conf = context.getConfiguration();
>                                                                
>                                                                DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
>                                                
>                                                                FSDataInputStream fstream1;
>                                                                Path file = fileSplit.getPath();
>                                                FileSystem fs = file.getFileSystem(conf);
>                                                fstream1 = fs.open(fileSplit.getPath());
>                                                                DocumentBuilder db = dbf.newDocumentBuilder();
>                                                                dom = db.parse(fstream1);
>                                                                Element docEle = null;
>                                                                docEle = dom.getDocumentElement();
>                                                
>                                                                XPath xpath = XPathFactory.newInstance().newXPath();
>                                                
>                                                                Object result =  xpath.compile("//*").evaluate(dom, XPathConstants.NODESET);
> 
>                                                                NodeList nodes = (NodeList) result;
>                                                
>                                                
>                                                                for (int n = 2; n < nodes.getLength(); n++) 
>                                                                
>                                                                {
>                                                                                Text colvalue=new Text("");
>                                                                                Text nodename= new Text("");
> 
>                                                                                nodename = new Text(nodes.item(n).getNodeName());
>                                                                                try{colvalue = new Text(nodes.item(n).getFirstChild().getNodeValue());}catch(Exception e){}
>                                                                                if(colvalue.toString().equalsIgnoreCase(null)){colvalue=new Text("");}
>                                                                                context.write(nodename, colvalue);      
>                                                                }
>                                                
> 
>                                                                } catch (ParserConfigurationException e) {
>                                                                // TODO Auto-generated catch block
>                                                                e.printStackTrace();
>                                                                } catch (SAXException e) {
>                                                                // TODO Auto-generated catch block
>                                                                e.printStackTrace();
>                                
>                                                                } catch (XPathExpressionException e) {
>                                                                // TODO Auto-generated catch block
>                                                                e.printStackTrace();
>                                                                }
>                                                
>                                                                }
>                
>                                }
>                
>                                      
>                
>                public static void main(String[] args) throws Exception
>                
>                {
>                
>                Configuration conf = new Configuration();
>                   
>        Job job = new Job(conf, "XmlParsing");
>        job.setJarByClass(ReadXmlMR.class);
>                job.setOutputKeyClass(Text.class);
>                job.setOutputValueClass(Text.class);
> 
>         
>                job.setMapperClass(Map.class);
>   
>        
>                job.setInputFormatClass(TextInputFormat.class);
>                job.setOutputFormatClass(TextOutputFormat.class);
>        
>                FileInputFormat.addInputPath(job, new Path(args[0]));
>                FileOutputFormat.setOutputPath(job, new Path(args[1]));
>       
>    
>                job.submit();
>        
>                job.waitForCompletion(true);
>                                
> 
>                }
> 
>}
> 
> 
> 
>Regards,
>Chhaya Vishwakarma
> 
> 
>
>________________________________
>
>The contents of this e-mail and any attachment(s) may contain confidential or privileged information for the intended recipient(s). Unintended recipients are prohibited from taking action on the basis of information in this e-mail and using or disseminating the information, and must notify the sender and delete it from their system. L&T Infotech will not accept responsibility or liability for the accuracy or completeness of, or the presence of any virus or disabling code in this e-mail"
> 
>
>
>
> 
>-- 
>Thanks & Regards 
> 
>Unmesha Sreeveni U.B
>Junior Developer
> 
> 
>
>

Re: XML parsing in Hadoop

Posted by Sofia Georgiakaki <ge...@yahoo.com>.
Hello Chhaya,

I'm not sure why the job launches 4 map tasks, since your input file's size is 2MB, which is less than 1 HDFS block (64MB by default) - I would expect only 1 mapper to be initialized, unless you have changed the default HDFS block size.
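
For example, you could double-check the block size and block count of the input file with (the path here is only illustrative):

    hadoop fsck /user/chhaya/input.xml -files -blocks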

As I see in the code, you use TextInputFormat.class to read your input file. This means that your map function will be executed once per line of your input. However, inside your map function you still read the whole input split:
FileSplit fileSplit = (FileSplit)context.getInputSplit();
This means that if you have many lines in your input (and I guess you do), you parse the same input split multiple times, which I suspect is wrong.
Moreover, you might want to revise the line
if ( colvalue.toString().equalsIgnoreCase(null) )
since String.equalsIgnoreCase(null) always returns false, so that check can never be true. Do you mean
if ( colvalue == null ) ?
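
For instance (an untested sketch), a null-safe version of the value extraction could look like:

    org.w3c.dom.Node first = nodes.item(n).getFirstChild();
    String v = (first == null) ? null : first.getNodeValue();
    Text colvalue = new Text(v == null ? "" : v);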

I think it would be helpful to read through the MapReduce programming model once more, in order to better understand when and how each map and reduce function is executed. You can use this link http://developer.yahoo.com/hadoop/tutorial/module4.html , or the official Apache Hadoop website.
This will help you fit your algorithm into the MapReduce paradigm more easily. If you need further clarifications, I would be happy to help!
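
For illustration, here is a minimal, untested sketch of the "parse each file only once" idea. The class name is made up, and it assumes each input file is a single well-formed XML document:

public static class ParseOnceMapper extends Mapper<LongWritable, Text, Text, Text> {

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // setup() runs once per task, so the file is parsed once
        // instead of once per input line.
        FileSplit split = (FileSplit) context.getInputSplit();
        FileSystem fs = split.getPath().getFileSystem(context.getConfiguration());
        FSDataInputStream in = fs.open(split.getPath());
        try {
            DocumentBuilder db = DocumentBuilderFactory.newInstance().newDocumentBuilder();
            Document dom = db.parse(in);
            NodeList nodes = dom.getElementsByTagName("*"); // all elements, like //*
            for (int n = 0; n < nodes.getLength(); n++) {
                org.w3c.dom.Node first = nodes.item(n).getFirstChild();
                String value = (first == null || first.getNodeValue() == null)
                        ? "" : first.getNodeValue();
                context.write(new Text(nodes.item(n).getNodeName()), new Text(value));
            }
        } catch (Exception e) {
            throw new IOException("Failed to parse XML input", e);
        } finally {
            in.close();
        }
    }

    public void map(LongWritable key, Text value, Context context) {
        // Intentionally empty: all the work happens once in setup().
    }
}

With this, TextInputFormat can stay as it is; the per-line map() calls simply do nothing. A cleaner long-term fix would be a whole-file or XML-aware InputFormat (for example Mahout's XmlInputFormat), but the sketch above is the smallest change.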

Regards,
Sofia





On Thursday, November 28, 2013 11:03 AM, Chhaya Vishwakarma <Ch...@lntinfotech.com> wrote:
 
2MB file
> 
>From:unmesha sreeveni [mailto:unmeshabiju@gmail.com] 
>Sent: Thursday, November 28, 2013 2:23 PM
>To: User Hadoop
>Subject: Re: XML parsing in Hadoop
> 
>What is the size of your input file?
> 
>On Thu, Nov 28, 2013 at 2:17 PM, Chhaya Vishwakarma <Ch...@lntinfotech.com> wrote:
>Hi,
> 
>Yes, I have run it without MR and it takes a few seconds, so I think it is an MR issue.
>I have a single-node cluster and it is launching 4 map tasks. I am trying with only one file.
> 
> 
>Regards,
>Chhaya Vishwakarma
> 
> 
> 
>From:Mirko Kämpf [mailto:mirko.kaempf@gmail.com] 
>Sent: Thursday, November 28, 2013 12:53 PM
>To: user@hadoop.apache.org
>Subject: Re: XML parsing in Hadoop
> 
>Chhaya,
> 
>did you run the same code in standalone mode, without the MapReduce framework?
>How long does the code in your map() function take standalone?
>Compare those two times (t_0 MR mode, t_1 standalone mode) to find out
>if it is an MR issue or something that comes from the XML-parser logic or the data ...
> 
>Usually it should not be that slow. But what cluster do you have, how many mappers / reducers, and how many of such 2MB files do you have?
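> 
>For example, a rough way to time just the parsing step (illustrative only):
> 
>    long t0 = System.nanoTime();
>    dom = db.parse(fstream1);
>    log.info("parse took " + (System.nanoTime() - t0) / 1e6 + " ms");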
> 
>Best wishes
>Mirko
> 
> 
>2013/11/28 Chhaya Vishwakarma <Ch...@lntinfotech.com>
>Hi,
> 
> 
>The code below parses an XML file. The output of the code is correct, but the job takes a long time to complete:
>it took 20 hours to parse a 2MB file.
>Kindly suggest what changes could be made to improve the performance.
> 
> 
> 
>package xml;
> 
>import java.io.FileInputStream;
>import java.io.FileNotFoundException;
>import java.io.IOException;
>import java.util.*;
> 
>import javax.xml.parsers.DocumentBuilder;
>import javax.xml.parsers.DocumentBuilderFactory;
>import javax.xml.parsers.ParserConfigurationException;
>import javax.xml.xpath.XPath;
>import javax.xml.xpath.XPathConstants;
>import javax.xml.xpath.XPathExpressionException;
>import javax.xml.xpath.XPathFactory;
>        
>import org.apache.hadoop.fs.FSDataInputStream;
>import org.apache.hadoop.fs.FSInputStream;
>import org.apache.hadoop.fs.FileSystem;
>import org.apache.hadoop.fs.Path;
> 
>import org.apache.hadoop.conf.*;
>import org.apache.hadoop.io.*;
> 
>import org.apache.hadoop.mapred.JobConf;
>import org.apache.hadoop.mapreduce.*;
>import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
>import org.apache.hadoop.mapreduce.lib.input.FileSplit;
>import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
>import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
>import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
> 
> 
>import org.apache.log4j.Logger;
>import org.w3c.dom.Document;
>import org.w3c.dom.Element;
>import org.w3c.dom.NodeList;
>import org.xml.sax.SAXException;
> 
> 
>public class ReadXmlMR 
>{
>                static Logger log = Logger.getLogger(ReadXmlMR.class.getName());
>                 public static String fileName = new String();
>                 public static Document dom;
>                 public void configure(JobConf job) {
>         fileName = job.get("map.input.file");
>}
> 
>   
>                public static class Map extends Mapper<LongWritable,Text,Text,Text>
>               { 
>                                
>                                public void map(LongWritable key, Text value,Context context ) throws IOException, InterruptedException  
>                                { 
>                                                try {
>                                                                FileSplit fileSplit = (FileSplit)context.getInputSplit();
>                                                                Configuration conf = context.getConfiguration();
>                                                                
>                                                                DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
>                                                
>                                                                FSDataInputStream fstream1;
>                                                                Path file = fileSplit.getPath();
>                                                FileSystem fs = file.getFileSystem(conf);
>                                                fstream1 = fs.open(fileSplit.getPath());
>                                                                DocumentBuilder db = dbf.newDocumentBuilder();
>                                                                dom = db.parse(fstream1);
>                                                                Element docEle = null;
>                                                                docEle = dom.getDocumentElement();
>                                                
>                                                                XPath xpath = XPathFactory.newInstance().newXPath();
>                                                
>                                                                Object result =  xpath.compile("//*").evaluate(dom, XPathConstants.NODESET);
> 
>                                                                NodeList nodes = (NodeList) result;
>                                                
>                                                
>                                                                for (int n = 2; n < nodes.getLength(); n++) 
>                                                                
>                                                                {
>                                                                                Text colvalue=new Text("");
>                                                                                Text nodename= new Text("");
> 
>                                                                                nodename = new Text(nodes.item(n).getNodeName());
>                                                                                try{colvalue = new Text(nodes.item(n).getFirstChild().getNodeValue());}catch(Exception e){}
>                                                                                if(colvalue.toString().equalsIgnoreCase(null)){colvalue=new Text("");}
>                                                                                context.write(nodename, colvalue);      
>                                                                }
>                                                
> 
>                                                                } catch (ParserConfigurationException e) {
>                                                                // TODO Auto-generated catch block
>                                                                e.printStackTrace();
>                                                                } catch (SAXException e) {
>                                                                // TODO Auto-generated catch block
>                                                                e.printStackTrace();
>                                
>                                                                } catch (XPathExpressionException e) {
>                                                                // TODO Auto-generated catch block
>                                                                e.printStackTrace();
>                                                                }
>                                                
>                                                                }
>                
>                                }
>                
>                                      
>                
>                public static void main(String[] args) throws Exception
>                
>                {
>                
>                Configuration conf = new Configuration();
>                   
>        Job job = new Job(conf, "XmlParsing");
>        job.setJarByClass(ReadXmlMR.class);
>                job.setOutputKeyClass(Text.class);
>                job.setOutputValueClass(Text.class);
> 
>         
>                job.setMapperClass(Map.class);
>   
>        
>                job.setInputFormatClass(TextInputFormat.class);
>                job.setOutputFormatClass(TextOutputFormat.class);
>        
>                FileInputFormat.addInputPath(job, new Path(args[0]));
>                FileOutputFormat.setOutputPath(job, new Path(args[1]));
>       
>    
>                job.submit();
>        
>                job.waitForCompletion(true);
>                                
> 
>                }
> 
>}
> 
> 
> 
>Regards,
>Chhaya Vishwakarma
> 
> 
>
>________________________________
>
>The contents of this e-mail and any attachment(s) may contain confidential or privileged information for the intended recipient(s). Unintended recipients are prohibited from taking action on the basis of information in this e-mail and using or disseminating the information, and must notify the sender and delete it from their system. L&T Infotech will not accept responsibility or liability for the accuracy or completeness of, or the presence of any virus or disabling code in this e-mail"
> 
>
>
>
> 
>-- 
>Thanks & Regards 
> 
>Unmesha Sreeveni U.B
>Junior Developer
> 
> 
>
>

RE: XML parsing in Hadoop

Posted by Chhaya Vishwakarma <Ch...@lntinfotech.com>.
2MB file

From: unmesha sreeveni [mailto:unmeshabiju@gmail.com]
Sent: Thursday, November 28, 2013 2:23 PM
To: User Hadoop
Subject: Re: XML parsing in Hadoop

What is the size of your input file?

On Thu, Nov 28, 2013 at 2:17 PM, Chhaya Vishwakarma <Ch...@lntinfotech.com>> wrote:
Hi,

Yes, I have run it without MR and it takes a few seconds, so I think it is an MR issue.
I have a single-node cluster and it is launching 4 map tasks. I am trying with only one file.


Regards,
Chhaya Vishwakarma



From: Mirko Kämpf [mailto:mirko.kaempf@gmail.com<ma...@gmail.com>]
Sent: Thursday, November 28, 2013 12:53 PM
To: user@hadoop.apache.org<ma...@hadoop.apache.org>
Subject: Re: XML parsing in Hadoop

Chhaya,

did you run the same code in standalone mode, without the MapReduce framework?
How long does the code in your map() function take standalone?
Compare those two times (t_0 MR mode, t_1 standalone mode) to find out
if it is an MR issue or something that comes from the XML-parser logic or the data ...

Usually it should not be that slow. But what cluster do you have, how many mappers / reducers, and how many of such 2MB files do you have?

Best wishes
Mirko


2013/11/28 Chhaya Vishwakarma <Ch...@lntinfotech.com>>
Hi,


The code below parses an XML file. The output of the code is correct, but the job takes a long time to complete:
it took 20 hours to parse a 2MB file.
Kindly suggest what changes could be made to improve the performance.



package xml;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;


import org.apache.log4j.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;


public class ReadXmlMR
{
                static Logger log = Logger.getLogger(ReadXmlMR.class.getName());
                 public static String fileName = new String();
                 public static Document dom;
                 public void configure(JobConf job) {
         fileName = job.get("map.input.file");
}


                public static class Map extends Mapper<LongWritable,Text,Text,Text>
               {

                                public void map(LongWritable key, Text value,Context context ) throws IOException, InterruptedException
                                {
                                                try {
                                                                FileSplit fileSplit = (FileSplit)context.getInputSplit();
                                                                Configuration conf = context.getConfiguration();

                                                                DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();

                                                                FSDataInputStream fstream1;
                                                                Path file = fileSplit.getPath();
                                                FileSystem fs = file.getFileSystem(conf);
                                                fstream1 = fs.open(fileSplit.getPath());
                                                                DocumentBuilder db = dbf.newDocumentBuilder();
                                                                dom = db.parse(fstream1);
                                                                Element docEle = null;
                                                                docEle = dom.getDocumentElement();

                                                                XPath xpath = XPathFactory.newInstance().newXPath();

                                                                Object result =  xpath.compile("//*").evaluate(dom, XPathConstants.NODESET);

                                                                NodeList nodes = (NodeList) result;


                                                                for (int n = 2; n < nodes.getLength(); n++)

                                                                {
                                                                                Text colvalue=new Text("");
                                                                                Text nodename= new Text("");

                                                                                nodename = new Text(nodes.item(n).getNodeName());
                                                                                try{colvalue = new Text(nodes.item(n).getFirstChild().getNodeValue());}catch(Exception e){}
                                                                                if(colvalue.toString().equalsIgnoreCase(null)){colvalue=new Text("");}
                                                                                context.write(nodename, colvalue);
                                                                }


                                                                } catch (ParserConfigurationException e) {
                                                                // TODO Auto-generated catch block
                                                                e.printStackTrace();
                                                                } catch (SAXException e) {
                                                                // TODO Auto-generated catch block
                                                                e.printStackTrace();

                                                                } catch (XPathExpressionException e) {
                                                                // TODO Auto-generated catch block
                                                                e.printStackTrace();
                                                                }

                                                                }

                                }



                public static void main(String[] args) throws Exception

                {

                Configuration conf = new Configuration();

        Job job = new Job(conf, "XmlParsing");
        job.setJarByClass(ReadXmlMR.class);
                job.setOutputKeyClass(Text.class);
                job.setOutputValueClass(Text.class);


                job.setMapperClass(Map.class);


                job.setInputFormatClass(TextInputFormat.class);
                job.setOutputFormatClass(TextOutputFormat.class);

                FileInputFormat.addInputPath(job, new Path(args[0]));
                FileOutputFormat.setOutputPath(job, new Path(args[1]));


                job.submit();

                job.waitForCompletion(true);


                }

}



Regards,
Chhaya Vishwakarma


________________________________
The contents of this e-mail and any attachment(s) may contain confidential or privileged information for the intended recipient(s). Unintended recipients are prohibited from taking action on the basis of information in this e-mail and using or disseminating the information, and must notify the sender and delete it from their system. L&T Infotech will not accept responsibility or liability for the accuracy or completeness of, or the presence of any virus or disabling code in this e-mail"




--
Thanks & Regards

Unmesha Sreeveni U.B
Junior Developer



Re: XML parsing in Hadoop

Posted by unmesha sreeveni <un...@gmail.com>.
What is the size of your input file?


On Thu, Nov 28, 2013 at 2:17 PM, Chhaya Vishwakarma <
Chhaya.Vishwakarma@lntinfotech.com> wrote:

> Hi,
>
>
>
> Yes, I have run it without MR and it takes a few seconds, so I think it is
> an MR issue.
>
> I have a single-node cluster and it is launching 4 map tasks. I am trying
> with only one file.
>
>
>
>
>
> Regards,
>
> Chhaya Vishwakarma
>
>
>
>
>
>
>
> *From:* Mirko Kämpf [mailto:mirko.kaempf@gmail.com]
> *Sent:* Thursday, November 28, 2013 12:53 PM
> *To:* user@hadoop.apache.org
> *Subject:* Re: XML parsing in Hadoop
>
>
>
> Chhaya,
>
>
>
> did you run the same code in standalone mode, without the MapReduce framework?
>
> How long does the code in your map() function take standalone?
>
> Compare those two times (t_0 MR mode, t_1 standalone mode) to find out
>
> if it is an MR issue or something that comes from the XML-parser logic or
> the data ...
>
>
>
> Usually it should not be that slow. But what cluster do you have, how many
> mappers / reducers, and how many of such 2MB files do you have?
>
>
>
> Best wishes
>
> Mirko
>
>
>
>
>
> 2013/11/28 Chhaya Vishwakarma <Ch...@lntinfotech.com>
>
> Hi,
>
>
>
>
>
> The code below parses an XML file. The output of the code is correct, but
> the job takes a long time to complete:
>
> it took 20 hours to parse a 2MB file.
>
> Kindly suggest what changes could be made to improve the performance.
>
>
>
>
>
>
>
> package xml;
>
>
>
> import java.io.FileInputStream;
>
> import java.io.FileNotFoundException;
>
> import java.io.IOException;
>
> import java.util.*;
>
>
>
> import javax.xml.parsers.DocumentBuilder;
>
> import javax.xml.parsers.DocumentBuilderFactory;
>
> import javax.xml.parsers.ParserConfigurationException;
>
> import javax.xml.xpath.XPath;
>
> import javax.xml.xpath.XPathConstants;
>
> import javax.xml.xpath.XPathExpressionException;
>
> import javax.xml.xpath.XPathFactory;
>
>
>
> import org.apache.hadoop.fs.FSDataInputStream;
>
> import org.apache.hadoop.fs.FSInputStream;
>
> import org.apache.hadoop.fs.FileSystem;
>
> import org.apache.hadoop.fs.Path;
>
>
>
> import org.apache.hadoop.conf.*;
>
> import org.apache.hadoop.io.*;
>
>
>
> import org.apache.hadoop.mapred.JobConf;
>
> import org.apache.hadoop.mapreduce.*;
>
> import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
>
> import org.apache.hadoop.mapreduce.lib.input.FileSplit;
>
> import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
>
> import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
>
> import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
>
>
>
>
>
> import org.apache.log4j.Logger;
>
> import org.w3c.dom.Document;
>
> import org.w3c.dom.Element;
>
> import org.w3c.dom.NodeList;
>
> import org.xml.sax.SAXException;
>
>
>
>
>
> public class ReadXmlMR
>
> {
>
>                 static Logger log =
> Logger.getLogger(ReadXmlMR.class.getName());
>
>                  public static String fileName = new String();
>
>                  public static Document dom;
>
>                  public void configure(JobConf job) {
>
>          fileName = job.get("map.input.file");
>
> }
>
>
>
>
>
>                 public static class Map extends
> Mapper<LongWritable,Text,Text,Text>
>
>                {
>
>
>
>                                 public void map(LongWritable key, Text
> value,Context context ) throws IOException, InterruptedException
>
>                                 {
>
>                                                 try {
>
>                                                                 FileSplit
> fileSplit = (FileSplit)context.getInputSplit();
>
>
> Configuration conf = context.getConfiguration();
>
>
>
>
> DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
>
>
>
>
> FSDataInputStream fstream1;
>
>                                                                 Path file
> = fileSplit.getPath();
>
>                                                 FileSystem fs =
> file.getFileSystem(conf);
>
>                                                 fstream1 =
> fs.open(fileSplit.getPath());
>
>
> DocumentBuilder db = dbf.newDocumentBuilder();
>
>                                                                 dom =
> db.parse(fstream1);
>
>                                                                 Element
> docEle = null;
>
>                                                                 docEle =
> dom.getDocumentElement();
>
>
>
>                                                                 XPath
> xpath = XPathFactory.newInstance().newXPath();
>
>
>
>                                                                 Object
> result =  xpath.compile("//*").evaluate(dom, XPathConstants.NODESET);
>
>
>
>                                                                 NodeList
> nodes = (NodeList) result;
>
>
>
>
>
>                                                                 for (int n
> = 2; n < nodes.getLength(); n++)
>
>
>
>                                                                 {
>
>
> Text colvalue=new Text("");
>
>
> Text nodename= new Text("");
>
>
>
>
> nodename = new Text(nodes.item(n).getNodeName());
>
>
> try{colvalue = new
> Text(nodes.item(n).getFirstChild().getNodeValue());}catch(Exception e){}
>
>
> if(colvalue.toString().equalsIgnoreCase(null)){colvalue=new Text("");}
>
>
> context.write(nodename, colvalue);
>
>                                                                 }
>
>
>
>
>
>                                                                 } catch
> (ParserConfigurationException e) {
>
>                                                                 // TODO
> Auto-generated catch block
>
>
> e.printStackTrace();
>
>                                                                 } catch
> (SAXException e) {
>
>                                                                 // TODO
> Auto-generated catch block
>
>
> e.printStackTrace();
>
>
>
>                                                                 } catch
> (XPathExpressionException e) {
>
>                                                                 // TODO
> Auto-generated catch block
>
>
> e.printStackTrace();
>
>                                                                 }
>
>
>
>                                                                 }
>
>
>
>                                 }
>
>
>
>
>
>
>
>                 public static void main(String[] args) throws Exception
>
>
>
>                 {
>
>
>
>                 Configuration conf = new Configuration();
>
>
>
>         Job job = new Job(conf, "XmlParsing");
>
>         job.setJarByClass(ReadXmlMR.class);
>
>                 job.setOutputKeyClass(Text.class);
>
>                 job.setOutputValueClass(Text.class);
>
>
>
>
>
>                 job.setMapperClass(Map.class);
>
>
>
>
>
>                 job.setInputFormatClass(TextInputFormat.class);
>
>                 job.setOutputFormatClass(TextOutputFormat.class);
>
>
>
>                 FileInputFormat.addInputPath(job, new Path(args[0]));
>
>                 FileOutputFormat.setOutputPath(job, new Path(args[1]));
>
>
>
>
>
>                 job.submit();
>
>
>
>                 job.waitForCompletion(true);
>
>
>
>
>
>                 }
>
>
>
> }
>
>
>
>
>
>
>
> Regards,
>
> Chhaya Vishwakarma
>
>
>
>
> ------------------------------
>
> The contents of this e-mail and any attachment(s) may contain confidential
> or privileged information for the intended recipient(s). Unintended
> recipients are prohibited from taking action on the basis of information in
> this e-mail and using or disseminating the information, and must notify the
> sender and delete it from their system. L&T Infotech will not accept
> responsibility or liability for the accuracy or completeness of, or the
> presence of any virus or disabling code in this e-mail"
>
>
>



-- 
Thanks & Regards

Unmesha Sreeveni U.B
Junior Developer


RE: XML parsing in Hadoop

Posted by Vinayakumar B <vi...@huawei.com>.
Hi Chhaya,
Looking at your MapReduce job, you are still using TextInputFormat, which reads the input file line by line and calls the map() method once per line.

So your job is actually doing the following:


1.       It takes the XML file as input.

2.       For each line of the XML file, MapReduce calls your map() method.

3.       Inside map(), you parse the entire file and write every node name and value to the output.

That means if your XML file has 1000 lines, the same file will be parsed 1000 times.
This is why your job is taking so long.

You may need to write a custom input format so that each file is handed to map() exactly once; a sketch follows below.
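
A minimal sketch of such an input format, assuming the new org.apache.hadoop.mapreduce API; the class names WholeFileInputFormat and WholeFileRecordReader are illustrative, following the widely used whole-file pattern rather than code from this thread:

import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class WholeFileInputFormat extends FileInputFormat<NullWritable, BytesWritable> {

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return false; // one split per file, so each file is parsed exactly once
    }

    @Override
    public RecordReader<NullWritable, BytesWritable> createRecordReader(
            InputSplit split, TaskAttemptContext context) {
        return new WholeFileRecordReader();
    }

    static class WholeFileRecordReader extends RecordReader<NullWritable, BytesWritable> {
        private FileSplit split;
        private TaskAttemptContext context;
        private final BytesWritable value = new BytesWritable();
        private boolean processed = false;

        @Override
        public void initialize(InputSplit split, TaskAttemptContext context) {
            this.split = (FileSplit) split;
            this.context = context;
        }

        @Override
        public boolean nextKeyValue() throws IOException {
            if (processed) {
                return false; // the single record was already emitted
            }
            // Read the whole file into one value
            byte[] contents = new byte[(int) split.getLength()];
            Path file = split.getPath();
            FileSystem fs = file.getFileSystem(context.getConfiguration());
            FSDataInputStream in = null;
            try {
                in = fs.open(file);
                IOUtils.readFully(in, contents, 0, contents.length);
                value.set(contents, 0, contents.length);
            } finally {
                IOUtils.closeStream(in);
            }
            processed = true;
            return true;
        }

        @Override
        public NullWritable getCurrentKey() { return NullWritable.get(); }

        @Override
        public BytesWritable getCurrentValue() { return value; }

        @Override
        public float getProgress() { return processed ? 1.0f : 0.0f; }

        @Override
        public void close() { }
    }
}

With this in place, the driver would call job.setInputFormatClass(WholeFileInputFormat.class), and the mapper would become Mapper<NullWritable, BytesWritable, Text, Text>, parsing the bytes it receives instead of re-opening the file from the split.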

Thanks and Regards,
Vinay

From: Chhaya Vishwakarma [mailto:Chhaya.Vishwakarma@lntinfotech.com]
Sent: 28 November 2013 14:18
To: user@hadoop.apache.org; mirko.kaempf@gmail.com
Subject: RE: XML parsing in Hadoop

[earlier messages and original code quoted in full; snipped, see above]


RE: XML parsing in Hadoop

Posted by Vinayakumar B <vi...@huawei.com>.
Hi Chhaya,
When I see your mapreduce Job, you are still using the TextInputFormat, which reads the input file line by line and executes map() method.

You are actually doing the following things.


1.       Takes XML files as in input file.

2.       For each line of the XML file, MapReduce will call map() method.

3.       In your map() method, you are parsing the entire file and storing node key and value to output.

That means suppose if your XML have 1000 lines, then 1000 times same XML file will be parsed.
This is the reason your Job is taking lot of time.

You may need to write the custom input format to identify your input.

Thanks and Regards,
Vinay

From: Chhaya Vishwakarma [mailto:Chhaya.Vishwakarma@lntinfotech.com]
Sent: 28 November 2013 14:18
To: user@hadoop.apache.org; mirko.kaempf@gmail.com
Subject: RE: XML parsing in Hadoop

Hi,

Yes I have run it without MR it takes few seconds to run. So I think its MR issue only
I have a single node cluster its launching 4 map tasks. Trying with only one file.


Regards,
Chhaya Vishwakarma



From: Mirko Kämpf [mailto:mirko.kaempf@gmail.com]
Sent: Thursday, November 28, 2013 12:53 PM
To: user@hadoop.apache.org<ma...@hadoop.apache.org>
Subject: Re: XML parsing in Hadoop

Chhaya,

did you run the same code in stand alone mode without MapReduce framework?
How long takes the code in you map() function standalone?
Compare those two different times (t_0 MR mode, t_1 standalone mode) to find out
if it is a MR issue or something which comes from the xml-parser logic or the data ...

Usually it should be not that slow. But what cluster do you have and how many mappers / reducers and how many of such 2NB files do you have?

Best wishes
Mirko


2013/11/28 Chhaya Vishwakarma <Ch...@lntinfotech.com>>
Hi,


The below code parses XML file, Here the output of the code is correct but the job takes long time for completion.
It took 20 hours to parse 2MB file.
Kindly suggest what changes could be done to increase the performance.



package xml;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;


import org.apache.log4j.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;


public class ReadXmlMR
{
                static Logger log = Logger.getLogger(ReadXmlMR.class.getName());
                 public static String fileName = new String();
                 public static Document dom;
                 public void configure(JobConf job) {
         fileName = job.get("map.input.file");
}


                public static class Map extends Mapper<LongWritable,Text,Text,Text>
               {

                                public void map(LongWritable key, Text value,Context context ) throws IOException, InterruptedException
                                {
                                                try {
                                                                FileSplit fileSplit = (FileSplit)context.getInputSplit();
                                                                Configuration conf = context.getConfiguration();

                                                                DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();

                                                                FSDataInputStream fstream1;
                                                                Path file = fileSplit.getPath();
                                                FileSystem fs = file.getFileSystem(conf);
                                                fstream1 = fs.open(fileSplit.getPath());
                                                                DocumentBuilder db = dbf.newDocumentBuilder();
                                                                dom = db.parse(fstream1);
                                                                Element docEle = null;
                                                                docEle = dom.getDocumentElement();

                                                                XPath xpath = XPathFactory.newInstance().newXPath();

                                                                Object result =  xpath.compile("//*").evaluate(dom, XPathConstants.NODESET);

                                                                NodeList nodes = (NodeList) result;


                                                                for (int n = 2; n < nodes.getLength(); n++)

                                                                {
                                                                                Text colvalue=new Text("");
                                                                                Text nodename= new Text("");

                                                                                nodename = new Text(nodes.item(n).getNodeName());
                                                                                try{colvalue = new Text(nodes.item(n).getFirstChild().getNodeValue());}catch(Exception e){}
                                                                                if(colvalue.toString().equalsIgnoreCase(null)){colvalue=new Text("");}
                                                                                context.write(nodename, colvalue);
                                                                }


                                                                } catch (ParserConfigurationException e) {
                                                                // TODO Auto-generated catch block
                                                                e.printStackTrace();
                                                                } catch (SAXException e) {
                                                                // TODO Auto-generated catch block
                                                                e.printStackTrace();

                                                                } catch (XPathExpressionException e) {
                                                                // TODO Auto-generated catch block
                                                                e.printStackTrace();
                                                                }

                                                                }

                                }



                public static void main(String[] args) throws Exception

                {

                Configuration conf = new Configuration();

        Job job = new Job(conf, "XmlParsing");
        job.setJarByClass(ReadXmlMR.class);
                job.setOutputKeyClass(Text.class);
                job.setOutputValueClass(Text.class);


                job.setMapperClass(Map.class);


                job.setInputFormatClass(TextInputFormat.class);
                job.setOutputFormatClass(TextOutputFormat.class);

                FileInputFormat.addInputPath(job, new Path(args[0]));
                FileOutputFormat.setOutputPath(job, new Path(args[1]));


                job.submit();

                job.waitForCompletion(true);


                }

}



Regards,
Chhaya Vishwakarma


________________________________
The contents of this e-mail and any attachment(s) may contain confidential or privileged information for the intended recipient(s). Unintended recipients are prohibited from taking action on the basis of information in this e-mail and using or disseminating the information, and must notify the sender and delete it from their system. L&T Infotech will not accept responsibility or liability for the accuracy or completeness of, or the presence of any virus or disabling code in this e-mail"


Re: XML parsing in Hadoop

Posted by unmesha sreeveni <un...@gmail.com>.
How much is ur size of input file?


On Thu, Nov 28, 2013 at 2:17 PM, Chhaya Vishwakarma <
Chhaya.Vishwakarma@lntinfotech.com> wrote:

> Hi,
>
>
>
> Yes I have run it without MR it takes few seconds to run. So I think its
> MR issue only
>
> I have a single node cluster its launching 4 map tasks. Trying with only
> one file.
>
>
>
>
>
> Regards,
>
> Chhaya Vishwakarma
>
>
>
>
>
>
>
> *From:* Mirko Kämpf [mailto:mirko.kaempf@gmail.com]
> *Sent:* Thursday, November 28, 2013 12:53 PM
> *To:* user@hadoop.apache.org
> *Subject:* Re: XML parsing in Hadoop
>
>
>
> Chhaya,
>
>
>
> did you run the same code in stand alone mode without MapReduce framework?
>
> How long takes the code in you map() function standalone?
>
> Compare those two different times (t_0 MR mode, t_1 standalone mode) to
> find out
>
> if it is a MR issue or something which comes from the xml-parser logic or
> the data ...
>
>
>
> Usually it should be not that slow. But what cluster do you have and how
> many mappers / reducers and how many of such 2NB files do you have?
>
>
>
> Best wishes
>
> Mirko
>
>
>
>
>
> 2013/11/28 Chhaya Vishwakarma <Ch...@lntinfotech.com>
>
> Hi,
>
>
>
>
>
> The below code parses XML file, Here the output of the code is correct but
> the job takes long time for completion.
>
> It took 20 hours to parse 2MB file.
>
> Kindly suggest what changes could be done to increase the performance.
>
>
>
>
>
>
>
> package xml;
>
>
>
> import java.io.FileInputStream;
>
> import java.io.FileNotFoundException;
>
> import java.io.IOException;
>
> import java.util.*;
>
>
>
> import javax.xml.parsers.DocumentBuilder;
>
> import javax.xml.parsers.DocumentBuilderFactory;
>
> import javax.xml.parsers.ParserConfigurationException;
>
> import javax.xml.xpath.XPath;
>
> import javax.xml.xpath.XPathConstants;
>
> import javax.xml.xpath.XPathExpressionException;
>
> import javax.xml.xpath.XPathFactory;
>
>
>
> import org.apache.hadoop.fs.FSDataInputStream;
>
> import org.apache.hadoop.fs.FSInputStream;
>
> import org.apache.hadoop.fs.FileSystem;
>
> import org.apache.hadoop.fs.Path;
>
>
>
> import org.apache.hadoop.conf.*;
>
> import org.apache.hadoop.io.*;
>
>
>
> import org.apache.hadoop.mapred.JobConf;
>
> import org.apache.hadoop.mapreduce.*;
>
> import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
>
> import org.apache.hadoop.mapreduce.lib.input.FileSplit;
>
> import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
>
> import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
>
> import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
>
>
>
>
>
> import org.apache.log4j.Logger;
>
> import org.w3c.dom.Document;
>
> import org.w3c.dom.Element;
>
> import org.w3c.dom.NodeList;
>
> import org.xml.sax.SAXException;
>
>
>
>
>
> public class ReadXmlMR
>
> {
>
>                 static Logger log =
> Logger.getLogger(ReadXmlMR.class.getName());
>
>                  public static String fileName = new String();
>
>                  public static Document dom;
>
>                  public void configure(JobConf job) {
>
>          fileName = job.get("map.input.file");
>
> }
>
>
>
>
>
>                 public static class Map extends
> Mapper<LongWritable,Text,Text,Text>
>
>                {
>
>
>
>                                 public void map(LongWritable key, Text
> value,Context context ) throws IOException, InterruptedException
>
>                                 {
>
>                                                 try {
>
>                                                                 FileSplit
> fileSplit = (FileSplit)context.getInputSplit();
>
>
> Configuration conf = context.getConfiguration();
>
>
>
>
> DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
>
>
>
>
> FSDataInputStream fstream1;
>
>                                                                 Path file
> = fileSplit.getPath();
>
>                                                 FileSystem fs =
> file.getFileSystem(conf);
>
>                                                 fstream1 =
> fs.open(fileSplit.getPath());
>
>
> DocumentBuilder db = dbf.newDocumentBuilder();
>
>                                                                 dom =
> db.parse(fstream1);
>
>                                                                 Element
> docEle = null;
>
>                                                                 docEle =
> dom.getDocumentElement();
>
>
>
>                                                                 XPath
> xpath = XPathFactory.newInstance().newXPath();
>
>
>
>                                                                 Object
> result =  xpath.compile("//*").evaluate(dom, XPathConstants.NODESET);
>
>
>
>                                                                 NodeList
> nodes = (NodeList) result;
>
>
>
>
>
>                                                                 for (int n
> = 2; n < nodes.getLength(); n++)
>
>
>
>                                                                 {
>
>
> Text colvalue=new Text("");
>
>
> Text nodename= new Text("");
>
>
>
>
> nodename = new Text(nodes.item(n).getNodeName());
>
>
> try{colvalue = new
> Text(nodes.item(n).getFirstChild().getNodeValue());}catch(Exception e){}
>
>
> if(colvalue.toString().equalsIgnoreCase(null)){colvalue=new Text("");}
>
>
> context.write(nodename, colvalue);
>
>                                                                 }
>
>
>
>
>
>                                                                 } catch
> (ParserConfigurationException e) {
>
>                                                                 // TODO
> Auto-generated catch block
>
>
> e.printStackTrace();
>
>                                                                 } catch
> (SAXException e) {
>
>                                                                 // TODO
> Auto-generated catch block
>
>
> e.printStackTrace();
>
>
>
>                                                                 } catch
> (XPathExpressionException e) {
>
>                                                                 // TODO
> Auto-generated catch block
>
>
> e.printStackTrace();
>
>                                                                 }
>
>
>
>                                                                 }
>
>
>
>                                 }
>
>
>
>
>
>
>
>                 public static void main(String[] args) throws Exception
>
>
>
>                 {
>
>
>
>                 Configuration conf = new Configuration();
>
>
>
>         Job job = new Job(conf, "XmlParsing");
>
>         job.setJarByClass(ReadXmlMR.class);
>
>                 job.setOutputKeyClass(Text.class);
>
>                 job.setOutputValueClass(Text.class);
>
>
>
>
>
>                 job.setMapperClass(Map.class);
>
>
>
>
>
>                 job.setInputFormatClass(TextInputFormat.class);
>
>                 job.setOutputFormatClass(TextOutputFormat.class);
>
>
>
>                 FileInputFormat.addInputPath(job, new Path(args[0]));
>
>                 FileOutputFormat.setOutputPath(job, new Path(args[1]));
>
>
>
>
>
>                 job.submit();
>
>
>
>                 job.waitForCompletion(true);
>
>
>
>
>
>                 }
>
>
>
> }
>
>
>
>
>
>
>
> Regards,
>
> Chhaya Vishwakarma
>
>
>
>
> ------------------------------
>
> The contents of this e-mail and any attachment(s) may contain confidential
> or privileged information for the intended recipient(s). Unintended
> recipients are prohibited from taking action on the basis of information in
> this e-mail and using or disseminating the information, and must notify the
> sender and delete it from their system. L&T Infotech will not accept
> responsibility or liability for the accuracy or completeness of, or the
> presence of any virus or disabling code in this e-mail"
>
>
>



-- 
*Thanks & Regards*

Unmesha Sreeveni U.B

*Junior Developer*

Re: XML parsing in Hadoop

Posted by unmesha sreeveni <un...@gmail.com>.
How much is ur size of input file?


On Thu, Nov 28, 2013 at 2:17 PM, Chhaya Vishwakarma <
Chhaya.Vishwakarma@lntinfotech.com> wrote:

> Hi,
>
>
>
> Yes I have run it without MR it takes few seconds to run. So I think its
> MR issue only
>
> I have a single node cluster its launching 4 map tasks. Trying with only
> one file.
>
>
>
>
>
> Regards,
>
> Chhaya Vishwakarma
>
>
>
>
>
>
>
> *From:* Mirko Kämpf [mailto:mirko.kaempf@gmail.com]
> *Sent:* Thursday, November 28, 2013 12:53 PM
> *To:* user@hadoop.apache.org
> *Subject:* Re: XML parsing in Hadoop
>
>
>
> Chhaya,
>
>
>
> did you run the same code in stand alone mode without MapReduce framework?
>
> How long takes the code in you map() function standalone?
>
> Compare those two different times (t_0 MR mode, t_1 standalone mode) to
> find out
>
> if it is a MR issue or something which comes from the xml-parser logic or
> the data ...
>
>
>
> Usually it should be not that slow. But what cluster do you have and how
> many mappers / reducers and how many of such 2NB files do you have?
>
>
>
> Best wishes
>
> Mirko
>
>
>
>
>
> 2013/11/28 Chhaya Vishwakarma <Ch...@lntinfotech.com>
>
> Hi,
>
>
>
>
>
> The below code parses XML file, Here the output of the code is correct but
> the job takes long time for completion.
>
> It took 20 hours to parse 2MB file.
>
> Kindly suggest what changes could be done to increase the performance.
>
>
>
>
>
>
>
> package xml;
>
>
>
> import java.io.FileInputStream;
>
> import java.io.FileNotFoundException;
>
> import java.io.IOException;
>
> import java.util.*;
>
>
>
> import javax.xml.parsers.DocumentBuilder;
>
> import javax.xml.parsers.DocumentBuilderFactory;
>
> import javax.xml.parsers.ParserConfigurationException;
>
> import javax.xml.xpath.XPath;
>
> import javax.xml.xpath.XPathConstants;
>
> import javax.xml.xpath.XPathExpressionException;
>
> import javax.xml.xpath.XPathFactory;
>
>
>
> import org.apache.hadoop.fs.FSDataInputStream;
>
> import org.apache.hadoop.fs.FSInputStream;
>
> import org.apache.hadoop.fs.FileSystem;
>
> import org.apache.hadoop.fs.Path;
>
>
>
> import org.apache.hadoop.conf.*;
>
> import org.apache.hadoop.io.*;
>
>
>
> import org.apache.hadoop.mapred.JobConf;
>
> import org.apache.hadoop.mapreduce.*;
>
> import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
>
> import org.apache.hadoop.mapreduce.lib.input.FileSplit;
>
> import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
>
> import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
>
> import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
>
>
>
>
>
> import org.apache.log4j.Logger;
>
> import org.w3c.dom.Document;
>
> import org.w3c.dom.Element;
>
> import org.w3c.dom.NodeList;
>
> import org.xml.sax.SAXException;
>
>
>
>
>
> public class ReadXmlMR
>
> {
>
>                 static Logger log =
> Logger.getLogger(ReadXmlMR.class.getName());
>
>                  public static String fileName = new String();
>
>                  public static Document dom;
>
>                  public void configure(JobConf job) {
>
>          fileName = job.get("map.input.file");
>
> }
>
>
>
>
>
>                 public static class Map extends
> Mapper<LongWritable,Text,Text,Text>
>
>                {
>
>
>
>                                 public void map(LongWritable key, Text
> value,Context context ) throws IOException, InterruptedException
>
>                                 {
>
>                                                 try {
>
>                                                                 FileSplit
> fileSplit = (FileSplit)context.getInputSplit();
>
>
> Configuration conf = context.getConfiguration();
>
>
>
>
> DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
>
>
>
>
> FSDataInputStream fstream1;
>
>                                                                 Path file
> = fileSplit.getPath();
>
>                                                 FileSystem fs =
> file.getFileSystem(conf);
>
>                                                 fstream1 =
> fs.open(fileSplit.getPath());
>
>
> DocumentBuilder db = dbf.newDocumentBuilder();
>
>                                                                 dom =
> db.parse(fstream1);
>
>                                                                 Element
> docEle = null;
>
>                                                                 docEle =
> dom.getDocumentElement();
>
>
>
>                                                                 XPath
> xpath = XPathFactory.newInstance().newXPath();
>
>
>
>                                                                 Object
> result =  xpath.compile("//*").evaluate(dom, XPathConstants.NODESET);
>
>
>
>                                                                 NodeList
> nodes = (NodeList) result;
>
>
>
>
>
>                                                                 for (int n
> = 2; n < nodes.getLength(); n++)
>
>
>
>                                                                 {
>
>
> Text colvalue=new Text("");
>
>
> Text nodename= new Text("");
>
>
>
>
> nodename = new Text(nodes.item(n).getNodeName());
>
>
> try{colvalue = new
> Text(nodes.item(n).getFirstChild().getNodeValue());}catch(Exception e){}
>
>
> if(colvalue.toString().equalsIgnoreCase(null)){colvalue=new Text("");}
>
>
> context.write(nodename, colvalue);
>
>                                                                 }
>
>
>
>
>
>                                                                 } catch
> (ParserConfigurationException e) {
>
>                                                                 // TODO
> Auto-generated catch block
>
>
> e.printStackTrace();
>
>                                                                 } catch
> (SAXException e) {
>
>                                                                 // TODO
> Auto-generated catch block
>
>
> e.printStackTrace();
>
>
>
>                                                                 } catch
> (XPathExpressionException e) {
>
>                                                                 // TODO
> Auto-generated catch block
>
>
> e.printStackTrace();
>
>                                                                 }
>
>
>
>                                                                 }
>
>
>
>                                 }
>
>
>
>
>
>
>
>                 public static void main(String[] args) throws Exception
>
>
>
>                 {
>
>
>
>                 Configuration conf = new Configuration();
>
>
>
>         Job job = new Job(conf, "XmlParsing");
>
>         job.setJarByClass(ReadXmlMR.class);
>
>                 job.setOutputKeyClass(Text.class);
>
>                 job.setOutputValueClass(Text.class);
>
>
>
>
>
>                 job.setMapperClass(Map.class);
>
>
>
>
>
>                 job.setInputFormatClass(TextInputFormat.class);
>
>                 job.setOutputFormatClass(TextOutputFormat.class);
>
>
>
>                 FileInputFormat.addInputPath(job, new Path(args[0]));
>
>                 FileOutputFormat.setOutputPath(job, new Path(args[1]));
>
>
>
>
>
>                 job.submit();
>
>
>
>                 job.waitForCompletion(true);
>
>
>
>
>
>                 }
>
>
>
> }
>
>
>
>
>
>
>
> Regards,
>
> Chhaya Vishwakarma
>
>
>
>
> ------------------------------
>
> The contents of this e-mail and any attachment(s) may contain confidential
> or privileged information for the intended recipient(s). Unintended
> recipients are prohibited from taking action on the basis of information in
> this e-mail and using or disseminating the information, and must notify the
> sender and delete it from their system. L&T Infotech will not accept
> responsibility or liability for the accuracy or completeness of, or the
> presence of any virus or disabling code in this e-mail"
>
>
>



-- 
*Thanks & Regards*

Unmesha Sreeveni U.B

*Junior Developer*

RE: XML parsing in Hadoop

Posted by Vinayakumar B <vi...@huawei.com>.
Hi Chhaya,
When I see your mapreduce Job, you are still using the TextInputFormat, which reads the input file line by line and executes map() method.

You are actually doing the following things.


1.       Takes XML files as in input file.

2.       For each line of the XML file, MapReduce will call map() method.

3.       In your map() method, you are parsing the entire file and storing node key and value to output.

That means suppose if your XML have 1000 lines, then 1000 times same XML file will be parsed.
This is the reason your Job is taking lot of time.

You may need to write the custom input format to identify your input.

Thanks and Regards,
Vinay

From: Chhaya Vishwakarma [mailto:Chhaya.Vishwakarma@lntinfotech.com]
Sent: 28 November 2013 14:18
To: user@hadoop.apache.org; mirko.kaempf@gmail.com
Subject: RE: XML parsing in Hadoop

Hi,

Yes I have run it without MR it takes few seconds to run. So I think its MR issue only
I have a single node cluster its launching 4 map tasks. Trying with only one file.


Regards,
Chhaya Vishwakarma



From: Mirko Kämpf [mailto:mirko.kaempf@gmail.com]
Sent: Thursday, November 28, 2013 12:53 PM
To: user@hadoop.apache.org<ma...@hadoop.apache.org>
Subject: Re: XML parsing in Hadoop

Chhaya,

did you run the same code in stand alone mode without MapReduce framework?
How long takes the code in you map() function standalone?
Compare those two different times (t_0 MR mode, t_1 standalone mode) to find out
if it is a MR issue or something which comes from the xml-parser logic or the data ...

Usually it should be not that slow. But what cluster do you have and how many mappers / reducers and how many of such 2NB files do you have?

Best wishes
Mirko


2013/11/28 Chhaya Vishwakarma <Ch...@lntinfotech.com>>
Hi,


The below code parses XML file, Here the output of the code is correct but the job takes long time for completion.
It took 20 hours to parse 2MB file.
Kindly suggest what changes could be done to increase the performance.



package xml;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;


import org.apache.log4j.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;


public class ReadXmlMR
{
                static Logger log = Logger.getLogger(ReadXmlMR.class.getName());
                 public static String fileName = new String();
                 public static Document dom;
                 public void configure(JobConf job) {
         fileName = job.get("map.input.file");
}


                public static class Map extends Mapper<LongWritable,Text,Text,Text>
               {

                                public void map(LongWritable key, Text value,Context context ) throws IOException, InterruptedException
                                {
                                                try {
                                                                FileSplit fileSplit = (FileSplit)context.getInputSplit();
                                                                Configuration conf = context.getConfiguration();

                                                                DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();

                                                                FSDataInputStream fstream1;
                                                                Path file = fileSplit.getPath();
                                                FileSystem fs = file.getFileSystem(conf);
                                                fstream1 = fs.open(fileSplit.getPath());
                                                                DocumentBuilder db = dbf.newDocumentBuilder();
                                                                dom = db.parse(fstream1);
                                                                Element docEle = null;
                                                                docEle = dom.getDocumentElement();

                                                                XPath xpath = XPathFactory.newInstance().newXPath();

                                                                Object result =  xpath.compile("//*").evaluate(dom, XPathConstants.NODESET);

                                                                NodeList nodes = (NodeList) result;


                                                                for (int n = 2; n < nodes.getLength(); n++)

                                                                {
                                                                                Text colvalue=new Text("");
                                                                                Text nodename= new Text("");

                                                                                nodename = new Text(nodes.item(n).getNodeName());
                                                                                try{colvalue = new Text(nodes.item(n).getFirstChild().getNodeValue());}catch(Exception e){}
                                                                                if(colvalue.toString().equalsIgnoreCase(null)){colvalue=new Text("");}
                                                                                context.write(nodename, colvalue);
                                                                }


                                                                } catch (ParserConfigurationException e) {
                                                                // TODO Auto-generated catch block
                                                                e.printStackTrace();
                                                                } catch (SAXException e) {
                                                                // TODO Auto-generated catch block
                                                                e.printStackTrace();

                                                                } catch (XPathExpressionException e) {
                                                                // TODO Auto-generated catch block
                                                                e.printStackTrace();
                                                                }

                                                                }

                                }



                public static void main(String[] args) throws Exception

                {

                Configuration conf = new Configuration();

        Job job = new Job(conf, "XmlParsing");
        job.setJarByClass(ReadXmlMR.class);
                job.setOutputKeyClass(Text.class);
                job.setOutputValueClass(Text.class);


                job.setMapperClass(Map.class);


                job.setInputFormatClass(TextInputFormat.class);
                job.setOutputFormatClass(TextOutputFormat.class);

                FileInputFormat.addInputPath(job, new Path(args[0]));
                FileOutputFormat.setOutputPath(job, new Path(args[1]));


                job.submit();

                job.waitForCompletion(true);


                }

}



Regards,
Chhaya Vishwakarma


________________________________
The contents of this e-mail and any attachment(s) may contain confidential or privileged information for the intended recipient(s). Unintended recipients are prohibited from taking action on the basis of information in this e-mail and using or disseminating the information, and must notify the sender and delete it from their system. L&T Infotech will not accept responsibility or liability for the accuracy or completeness of, or the presence of any virus or disabling code in this e-mail"


RE: XML parsing in Hadoop

Posted by Vinayakumar B <vi...@huawei.com>.
Hi Chhaya,
Looking at your MapReduce job, you are still using TextInputFormat, which reads the input file line by line and calls the map() method once per line.

You are actually doing the following things:


1.       The XML file is taken as the input file.

2.       For each line of the XML file, MapReduce calls your map() method.

3.       In your map() method, you parse the entire file and write every node's name and value to the output.

That means that if your XML file has 1000 lines, the same XML file will be parsed 1000 times.
This is the reason your job is taking so long.

You may need to write a custom InputFormat so that each map() call receives a whole file (or one complete record) instead of a single line; see the sketch below.
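
For example, a minimal sketch of such an input format could look like the following. The names WholeFileInputFormat and WholeFileRecordReader are illustrative, not part of Hadoop: the idea is to treat each file as a single unsplittable record, so map() runs once per file and can parse the DOM exactly once.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Reads each input file as one record: key = nothing, value = the file bytes.
public class WholeFileInputFormat extends FileInputFormat<NullWritable, BytesWritable> {

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return false;  // never split, so each mapper sees a complete XML document
    }

    @Override
    public RecordReader<NullWritable, BytesWritable> createRecordReader(
            InputSplit split, TaskAttemptContext context) {
        return new WholeFileRecordReader();
    }

    public static class WholeFileRecordReader
            extends RecordReader<NullWritable, BytesWritable> {

        private FileSplit split;
        private Configuration conf;
        private final BytesWritable value = new BytesWritable();
        private boolean processed = false;

        @Override
        public void initialize(InputSplit split, TaskAttemptContext context) {
            this.split = (FileSplit) split;
            this.conf = context.getConfiguration();
        }

        @Override
        public boolean nextKeyValue() throws IOException {
            if (processed) {
                return false;  // exactly one record per file
            }
            byte[] contents = new byte[(int) split.getLength()];
            Path file = split.getPath();
            FileSystem fs = file.getFileSystem(conf);
            FSDataInputStream in = null;
            try {
                in = fs.open(file);
                IOUtils.readFully(in, contents, 0, contents.length);
                value.set(contents, 0, contents.length);
            } finally {
                IOUtils.closeStream(in);
            }
            processed = true;
            return true;
        }

        @Override public NullWritable getCurrentKey() { return NullWritable.get(); }
        @Override public BytesWritable getCurrentValue() { return value; }
        @Override public float getProgress() { return processed ? 1.0f : 0.0f; }
        @Override public void close() { }
    }
}

In the driver you would then call job.setInputFormatClass(WholeFileInputFormat.class) and change the mapper's input types to <NullWritable, BytesWritable>, parsing the value (the whole file) instead of reopening the split's path inside map().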

Thanks and Regards,
Vinay



RE: XML parsing in Hadoop

Posted by Chhaya Vishwakarma <Ch...@lntinfotech.com>.
Hi,

Yes, I have run it without MR and it takes a few seconds, so I think it is an MR issue.
I have a single-node cluster and it is launching 4 map tasks. I am trying with only one file.


Regards,
Chhaya Vishwakarma





Re: XML parsing in Hadoop

Posted by Mirko Kämpf <mi...@gmail.com>.
Chhaya,

did you run the same code in standalone mode, without the MapReduce framework?
How long does the code in your map() function take standalone?
Compare those two times (t_0 for MR mode, t_1 for standalone mode) to find out
whether it is an MR issue or something that comes from the XML-parser logic or
the data; a quick way to measure t_1 is sketched below.

Usually it should not be that slow. But what cluster do you have, how many
mappers/reducers do you run, and how many of these 2MB files do you have?
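
For example, a minimal standalone timing harness could look like this (a sketch only: it assumes a local copy of the 2MB XML file is passed as the first argument and reuses just the DOM/XPath part of the original code):

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;

import org.w3c.dom.Document;
import org.w3c.dom.NodeList;

public class StandaloneParseTimer {
    public static void main(String[] args) throws Exception {
        long t0 = System.nanoTime();
        DocumentBuilder db =
                DocumentBuilderFactory.newInstance().newDocumentBuilder();
        Document dom = db.parse(new java.io.File(args[0]));  // local XML copy
        NodeList nodes = (NodeList) XPathFactory.newInstance().newXPath()
                .compile("//*").evaluate(dom, XPathConstants.NODESET);
        long t1 = System.nanoTime();
        System.out.printf("parsed %d nodes in %.3f s%n",
                nodes.getLength(), (t1 - t0) / 1e9);
    }
}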

Best wishes
Mirko




Re: XML parsing in Hadoop

Posted by Devaraj K <de...@apache.org>.
Hi,

Here the map() function will be called for every (key, value) pair, i.e. for
every line of the split in your job, because of TextInputFormat. The XML-parsing
code you have written in map() is therefore executed for every line of your
input, which is what is causing the problem.

You can customize your InputFormat to read the XML file as a whole instead of
parsing it in map(), or you could move the parsing code into the run() method by
overriding Mapper.run(Context context); a sketch of the second option follows.
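
A rough sketch of the run() override, as an inner class of ReadXmlMR replacing the existing Map class and reusing that file's imports (illustrative only: parseWholeFile() is a hypothetical stand-in for the DOM/XPath logic from the original map() method, and the per-line records are simply drained):

public static class Map extends Mapper<LongWritable, Text, Text, Text> {

    @Override
    public void run(Context context) throws IOException, InterruptedException {
        setup(context);
        try {
            // Drain the split's records without doing any per-line work;
            // we only need the split itself, not the individual lines.
            while (context.nextKeyValue()) {
                // intentionally empty
            }
            // Parse the file exactly once per map task.
            parseWholeFile(context);
        } finally {
            cleanup(context);
        }
    }

    private void parseWholeFile(Context context)
            throws IOException, InterruptedException {
        // Open ((FileSplit) context.getInputSplit()).getPath(), build the DOM,
        // evaluate "//*" and context.write(nodename, colvalue) exactly as in
        // the original map() method.
    }
}

Note that this still parses the file once per map task, so a file split across several tasks would still be parsed once per split; a custom InputFormat that hands each mapper a whole file remains the cleaner fix.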


On Thu, Nov 28, 2013 at 12:15 PM, Chhaya Vishwakarma <
Chhaya.Vishwakarma@lntinfotech.com> wrote:

> [original message and code quoted in full; snipped]



-- 
Thanks
Devaraj K

Re: XML parsing in Hadoop

Posted by Mirko Kämpf <mi...@gmail.com>.
Chhaya,

did you run the same code in standalone mode, without the MapReduce
framework? How long does the code in your map() function take standalone?
Compare the two times (t_0 in MR mode, t_1 in standalone mode) to find out
whether it is an MR issue or something that comes from the XML-parser logic
or the data.
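
As a rough way to measure t_1, the parsing logic can be timed from a plain
main() outside Hadoop. A minimal sketch (the timer class is mine; it takes
the path to a local copy of the file on the command line):

import java.io.FileInputStream;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;

import org.w3c.dom.Document;
import org.w3c.dom.NodeList;

public class XmlParseTimer {
    public static void main(String[] args) throws Exception {
        long start = System.currentTimeMillis();
        DocumentBuilder db =
                DocumentBuilderFactory.newInstance().newDocumentBuilder();
        // args[0]: path to a local copy of the 2MB XML file
        Document dom = db.parse(new FileInputStream(args[0]));
        XPath xpath = XPathFactory.newInstance().newXPath();
        NodeList nodes = (NodeList) xpath.compile("//*")
                .evaluate(dom, XPathConstants.NODESET);
        long elapsed = System.currentTimeMillis() - start;
        System.out.println(nodes.getLength() + " nodes parsed in "
                + elapsed + " ms");
    }
}

If that finishes in seconds, the parser itself is fine and the 20 hours
most likely come from re-running it for every input line inside MapReduce.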

Usually it should not be that slow. But what cluster do you have, how many
mappers/reducers are running, and how many such 2MB files do you have?

Best wishes
Mirko



2013/11/28 Chhaya Vishwakarma <Ch...@lntinfotech.com>

> [original message and code quoted in full; snipped]
