Posted to user@nutch.apache.org by Amna Waqar <am...@gmail.com> on 2011/02/12 07:25:02 UTC

getting java.lang.NullPointerException while indexing

Hi everybody, I wrote a plugin named description which only indexes those
pages containing a content-type meta tag with the value "text/html; charset=UTF-8".
The DescriptionParser is:
package org.apache.nutch.parse.description;

// JDK imports
import java.util.Enumeration;
import java.util.Properties;
import java.util.logging.Logger;
import java.io.*;

// Nutch imports
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.protocol.Content;

// Commons imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

// W3C imports
import org.w3c.dom.DocumentFragment;

public class DescriptionParser implements HtmlParseFilter {

  private static final Log LOG =
      LogFactory.getLog(DescriptionParser.class.getName());

  private Configuration conf;

  /** The Description meta data attribute name */
  public static final String META_DESCRIPTION_NAME = "content-type";

  /**
   * Scan the HTML document looking for a description meta tag.
   */
  public ParseResult filter(Content content, ParseResult parseResult,
      HTMLMetaTags metaTags, DocumentFragment doc) {

    Parse parse = parseResult.get(content.getUrl());

    // Trying to find the document's description tag
    String desc = null;

    Properties HttpMetaTags = metaTags.getHttpEquivTags();

    for (Enumeration tagNames = HttpMetaTags.propertyNames();
        tagNames.hasMoreElements(); ) {
      if (tagNames.nextElement().equals("content-type")) {
        desc = HttpMetaTags.getProperty("content-type");
        if (desc == null) {
          LOG.info("No http-equiv tag for this page");
        } else if (desc.equals("")) {
          LOG.info("Found an empty http-equiv tag");
        } else {
          LOG.info("Found an http-equiv tag; contents: " + desc);
        }
      }
    }

    if (desc.equals("text/html; charset=UTF-8")) {
      LOG.info("Adding http-equiv; contents: " + desc);
      parse.getData().getContentMeta().set(META_DESCRIPTION_NAME, desc);
    }

    return parseResult;
  }

  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  public Configuration getConf() {
    return this.conf;
  }
}
and the DescriptionIndexer is:
package org.apache.nutch.parse.description;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.io.*;
import org.apache.lucene.document.DateTools;

import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.Parse;

import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.indexer.lucene.LuceneWriter;
import org.apache.hadoop.io.Text;

import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;

import java.net.MalformedURLException;
import java.net.URL;
import org.apache.hadoop.conf.Configuration;

public class DescriptionIndexer implements IndexingFilter {

  public static final Log LOG =
      LogFactory.getLog(DescriptionIndexer.class.getName());

  private Configuration conf;

  public DescriptionIndexer() {
  }

  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
      CrawlDatum datum, Inlinks inlinks) {

    String desc = parse.getData().getMeta("content-type");

    if (!(desc.equals("text/html; charset=UTF-8"))) {
      return null;
    }

    return doc;
  }

  public void addIndexBackendOptions(Configuration conf) {
    LuceneWriter.addFieldOptions("content-type", LuceneWriter.STORE.YES,
        LuceneWriter.INDEX.NO, conf);
  }

  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  public Configuration getConf() {
    return this.conf;
  }
}

When I started the crawl, the following exceptions occurred, which shows that the
desc string is null, even though I seeded two URLs, one with content type
"text/html; charset=UTF-8" and the other with "text/html; charset=iso-8859-1".
Please help me solve this problem.
How can I see what is in desc during the crawl?
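
For comparison, here is a minimal null-safe sketch of the same check (a drop-in
for the filter method of DescriptionIndexer above, not a tested fix). It assumes
the "content-type" value may simply be missing for some pages, guards the
comparison accordingly, and logs desc so its value can be seen in the crawl's
log output:

  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
      CrawlDatum datum, Inlinks inlinks) {

    // May be null if the parse filter never stored a value for this page.
    String desc = parse.getData().getMeta("content-type");

    // Log the value so it can be inspected in the crawl's log output.
    LOG.info("content-type meta for " + url + ": " + desc);

    // Put the constant first so a null desc cannot throw a NullPointerException.
    if (!"text/html; charset=UTF-8".equals(desc)) {
      return null;
    }

    return doc;
  }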

Regards
Amna Waqar

Re: getting java.lang.NullPointerException while indexing

Posted by Markus Jelsma <ma...@openindex.io>.
Please include your exceptions.


-- 
Markus Jelsma - CTO - Openindex
http://www.linkedin.com/in/markus17
050-8536620 / 06-50258350