You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@nutch.apache.org by Amna Waqar <am...@gmail.com> on 2011/02/12 07:25:02 UTC
getting java.lang.NullPointerException while indexing
hi everybody, I wrote a plugin named "description" which only indexes those
pages containing a content-type meta-tag with the value "text/html; charset=UTF-8"
package org.apache.nutch.parse.description;
// JDK imports
import java.util.Enumeration;
import java.util.Properties;
import java.util.logging.Logger;
import java.io.*;
// Nutch imports
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.protocol.Content;
// Commons imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
// W3C imports
import org.w3c.dom.DocumentFragment;
/**
 * HTML parse filter that looks for a {@code content-type} http-equiv meta
 * tag and, when its value matches the target charset, copies it into the
 * parse content metadata so {@code DescriptionIndexer} can filter on it.
 */
public class DescriptionParser implements HtmlParseFilter {
private static final Log LOG =
LogFactory.getLog(DescriptionParser.class.getName());
private Configuration conf;
/** The metadata attribute name under which the content type is stored. */
public static final String META_DESCRIPTION_NAME = "content-type";
/** Only pages whose http-equiv content-type equals this value are tagged. */
private static final String TARGET_CONTENT_TYPE = "text/html; charset=UTF-8";
/**
 * Scan the HTML document's http-equiv meta tags for a content-type entry.
 *
 * @param content     raw fetched content (used only for its URL)
 * @param parseResult parse results, returned unchanged apart from added metadata
 * @param metaTags    the meta tags extracted by the HTML parser
 * @param doc         the DOM fragment (unused)
 * @return the (possibly annotated) {@code parseResult}
 */
public ParseResult filter(Content content, ParseResult parseResult,
HTMLMetaTags metaTags, DocumentFragment doc) {
Parse parse = parseResult.get(content.getUrl());
// Trying to find the document's content-type http-equiv tag.
String desc = null;
Properties httpMetaTags = metaTags.getHttpEquivTags();
for (Enumeration tagNames = httpMetaTags.propertyNames();
tagNames.hasMoreElements(); ) {
if (META_DESCRIPTION_NAME.equals(tagNames.nextElement())) {
desc = httpMetaTags.getProperty(META_DESCRIPTION_NAME);
if (desc == null)
LOG.info("No http-equiv tag for this page");
else if (desc.equals("")) {
LOG.info("Found an empty http-equiv tag");
} else {
LOG.info("Found an http-equiv tag; contents: " + desc);
}
}
}
// Constant-first equals() is null-safe: desc stays null when the page has
// no content-type http-equiv tag, and the old desc.equals(...) form threw
// a NullPointerException in that case.
if (TARGET_CONTENT_TYPE.equals(desc) && parse != null) {
LOG.info("Adding http-equiv; contents: " + desc);
parse.getData().getContentMeta().set(META_DESCRIPTION_NAME, desc);
}
return parseResult;
}
public void setConf(Configuration conf) {
this.conf = conf;
}
public Configuration getConf() {
return this.conf;
}
}
and DescriptionIndexer is
package org.apache.nutch.parse.description;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.io.*;
import org.apache.lucene.document.DateTools;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.indexer.lucene.LuceneWriter;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import java.net.MalformedURLException;
import java.net.URL;
import org.apache.hadoop.conf.Configuration;
/**
 * Indexing filter that drops every document whose parse metadata does not
 * carry the content-type value set by {@code DescriptionParser}.
 */
public class DescriptionIndexer implements IndexingFilter {
public static final Log LOG =
LogFactory.getLog(DescriptionIndexer.class.getName());
/** The metadata key written by the parse filter. */
private static final String META_CONTENT_TYPE = "content-type";
/** Only documents with exactly this content-type are kept for indexing. */
private static final String TARGET_CONTENT_TYPE = "text/html; charset=UTF-8";
private Configuration conf;
public DescriptionIndexer() {
}
/**
 * Keep the document only when its parse metadata carries the target
 * content-type; otherwise discard it by returning {@code null}.
 *
 * @return {@code doc} to index the page, or {@code null} to skip it
 */
public NutchDocument filter(NutchDocument doc, Parse parse, Text
url, CrawlDatum datum, Inlinks inlinks) {
// getMeta() returns null for pages the parse filter did not tag
// (e.g. charset=iso-8859-1). Constant-first equals() is null-safe;
// the old desc.equals(...) form was the source of the reported
// java.lang.NullPointerException during indexing.
String desc = parse.getData().getMeta(META_CONTENT_TYPE);
if (!TARGET_CONTENT_TYPE.equals(desc)) {
return null;
}
return doc;
}
public void addIndexBackendOptions(Configuration conf) {
LuceneWriter.addFieldOptions(META_CONTENT_TYPE, LuceneWriter.STORE.YES,
LuceneWriter.INDEX.NO, conf);
}
public void setConf(Configuration conf) {
this.conf = conf;
}
public Configuration getConf() {
return this.conf;
}
}
When I started to crawl,
the following exceptions occurred, which shows that the desc string is null,
but I have seeded two URLs: one with content type "text/html; charset=UTF-8" and
the other with "text/html; charset=iso-8859-1".
Please help me solve this problem.
How can I see what is in desc during the crawl?
Regards
Amna Waqar
Re: getting java.lang.NullPointerException while indexing
Posted by Markus Jelsma <ma...@openindex.io>.
Please include your exceptions.
On Saturday 12 February 2011 07:25:02 Amna Waqar wrote:
> hi everybody, i wrote a plugin named description which only index those
> pages containg content-type meta-tag with value "text/html; charset=UTF-8"
> package org.apache.nutch.parse.description;
>
> // JDK imports
> import java.util.Enumeration;
> import java.util.Properties;
> import java.util.logging.Logger;
> import java.io.*;
> // Nutch imports
> import org.apache.hadoop.conf.Configuration;
> import org.apache.nutch.parse.HTMLMetaTags;
> import org.apache.nutch.parse.Parse;
> import org.apache.nutch.parse.HtmlParseFilter;
> import org.apache.nutch.parse.ParseResult;
> import org.apache.nutch.protocol.Content;
>
> // Commons imports
> import org.apache.commons.logging.Log;
> import org.apache.commons.logging.LogFactory;
>
> // W3C imports
> import org.w3c.dom.DocumentFragment;
>
> public class DescriptionParser implements HtmlParseFilter {
>
> private static final Log LOG =
> LogFactory.getLog(DescriptionParser.class.getName());
>
> private Configuration conf;
>
> /** The Description meta data attribute name */
> public static final String META_DESCRIPTION_NAME = "content-type";
>
> /**
> * Scan the HTML document looking for a description meta tag.
> */
> public ParseResult filter(Content content, ParseResult parseResult,
> HTMLMetaTags metaTags, DocumentFragment doc) {
>
>
>
> Parse parse = parseResult.get(content.getUrl());
>
> // Trying to find the document's description tag
> String desc = null;
>
> Properties HttpMetaTags = metaTags.getHttpEquivTags() ;
>
> for (Enumeration tagNames = HttpMetaTags.propertyNames();
> tagNames.hasMoreElements(); ) {
> if (tagNames.nextElement().equals("content-type")) {
> desc = HttpMetaTags.getProperty("content-type");
> if(desc == null)
> LOG.info("No http-equiv tag for this page");
> else if(desc.equals("")) {
> LOG.info("Found an empty http-equiv tag");
> } else {
> LOG.info("Found an http-equiv tag; contents: " + desc);
> }
> }
> }
>
> if( desc.equals("text/html; charset=UTF-8") ) {
> LOG.info("Adding http-equiv; contents: " + desc);
> parse.getData().getContentMeta().set(META_DESCRIPTION_NAME, desc);
>
> }
>
>
> return parseResult;
> }
>
>
> public void setConf(Configuration conf) {
> this.conf = conf;
> }
>
> public Configuration getConf() {
> return this.conf;
> }
> }
> and DescriptionIndexer is
> package org.apache.nutch.parse.description;
>
> import org.apache.commons.logging.Log;
> import org.apache.commons.logging.LogFactory;
> import java.io.*;
> import org.apache.lucene.document.DateTools;
>
> import org.apache.nutch.metadata.Nutch;
> import org.apache.nutch.parse.Parse;
>
> import org.apache.nutch.indexer.IndexingFilter;
> import org.apache.nutch.indexer.IndexingException;
> import org.apache.nutch.indexer.NutchDocument;
> import org.apache.nutch.indexer.lucene.LuceneWriter;
> import org.apache.hadoop.io.Text;
>
> import org.apache.nutch.crawl.CrawlDatum;
> import org.apache.nutch.crawl.Inlinks;
>
> import java.net.MalformedURLException;
> import java.net.URL;
> import org.apache.hadoop.conf.Configuration;
>
>
> public class DescriptionIndexer implements IndexingFilter {
>
> public static final Log LOG =
> LogFactory.getLog(DescriptionIndexer.class.getName());
>
> private Configuration conf;
>
> public DescriptionIndexer() {
>
> }
>
> public NutchDocument filter(NutchDocument doc, Parse parse, Text
> url, CrawlDatum datum, Inlinks inlinks) {
>
> String desc = parse.getData().getMeta("content-type");
>
> if(!(desc.equals("text/html; charset=UTF-8")) ) {
> return null;
> }
>
> return doc;
> }
> public void addIndexBackendOptions(Configuration conf) {
> LuceneWriter.addFieldOptions("content-type", LuceneWriter.STORE.YES,
> LuceneWriter.INDEX.NO, conf);
>
> }
> public void setConf(Configuration conf) {
> this.conf = conf;
> }
>
> public Configuration getConf() {
> return this.conf;
> }
> }
>
> when i started to crawl
> the following excetptions occur which shows that desc string is null
> but i ve seed two urls one with content type "text/html; charset=UTF-8" and
> other of text/html; charset=iso-8859-1
> please help me to solve this problem
> how can i see what is in desc during the crawl
>
> Regards
> Amna Waqar
--
Markus Jelsma - CTO - Openindex
http://www.linkedin.com/in/markus17
050-8536620 / 06-50258350