You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2014/11/06 23:00:19 UTC

svn commit: r1637237 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java src/java/org/apache/nutch/parse/ParserChecker.java

Author: snagel
Date: Thu Nov  6 22:00:18 2014
New Revision: 1637237

URL: http://svn.apache.org/r1637237
Log:
NUTCH-1884 NullPointerException in parsechecker and indexchecker with symlinks in file URL

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
    nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1637237&r1=1637236&r2=1637237&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Nov  6 22:00:18 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1884 NullPointerException in parsechecker and indexchecker with symlinks in file URL (Mengying Wang, snagel)
+
 * NUTCH-1825 protocol-http may hang for certain web pages (Phu Kieu via snagel)
 
 * NUTCH-1483 Can't crawl filesystem with protocol-file plugin (Rogério Pereira Araújo, Mengying Wang, snagel)

Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1637237&r1=1637236&r2=1637237&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Thu Nov  6 22:00:18 2014
@@ -17,7 +17,9 @@
  
 package org.apache.nutch.indexer;
 
+import java.util.Iterator;
 import java.util.List;
+import java.util.Map;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
@@ -133,6 +135,16 @@ public class IndexingFiltersChecker exte
 
     Inlinks inlinks = null;
     Parse parse = parseResult.get(urlText);
+    if (parse == null) {
+      LOG.error("Failed to get parse from parse result");
+      LOG.error("Available parses in parse result (by URL key):");
+      for (Map.Entry<Text, Parse> entry : parseResult) {
+        LOG.error("  " + entry.getKey());
+      }
+      LOG.error("Parse result does not contain a parse for URL to be checked:");
+      LOG.error("  " + urlText);
+      return -1;
+    }
 
     byte[] signature = SignatureFactory.getSignature(conf).calculate(content,
         parse);

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1637237&r1=1637236&r2=1637237&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Thu Nov  6 22:00:18 2014
@@ -19,11 +19,13 @@ package org.apache.nutch.parse;
 
 import java.util.HashMap;
 import java.util.Iterator;
+import java.util.Map;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.crawl.CrawlDatum;
@@ -164,14 +166,15 @@ public class ParserChecker implements To
       scfilters.passScoreBeforeParsing(turl, cd, content);
     } catch (Exception e) {
       if (LOG.isWarnEnabled()) {
-        LOG.warn("Couldn't pass score, url " + turl.toString() + " (" + e + ")");
+        LOG.warn("Couldn't pass score before parsing, url " + turl + " (" + e + ")");
+        LOG.warn(StringUtils.stringifyException(e));
       }
     }    
     
     ParseResult parseResult = new ParseUtil(conf).parse(content);
 
     if (parseResult == null) {
-      LOG.error("Problem with parse - check log");
+      LOG.error("Parsing content failed!");
       return (-1);
     }
 
@@ -184,17 +187,30 @@ public class ParserChecker implements To
       LOG.info("signature: " + StringUtil.toHexString(signature));
     }
 
+    Parse parse = parseResult.get(turl);
+    if (parse == null) {
+      LOG.error("Failed to get parse from parse result");
+      LOG.error("Available parses in parse result (by URL key):");
+      for (Map.Entry<Text, Parse> entry : parseResult) {
+        LOG.error("  " + entry.getKey());
+      }
+      LOG.error("Parse result does not contain a parse for URL to be checked:");
+      LOG.error("  " + turl);
+      return -1;
+    }
+
     // call the scoring filters
     try {
-      scfilters.passScoreAfterParsing(turl, content, parseResult.get(turl));
+      scfilters.passScoreAfterParsing(turl, content, parse);
     } catch (Exception e) {
       if (LOG.isWarnEnabled()) {
-        LOG.warn("Couldn't pass score, url " + turl + " (" + e + ")");
+        LOG.warn("Couldn't pass score after parsing, url " + turl + " (" + e + ")");
+        LOG.warn(StringUtils.stringifyException(e));
       }
     }
 
-    for (java.util.Map.Entry<Text, Parse> entry : parseResult) {
-      Parse parse = entry.getValue();
+    for (Map.Entry<Text, Parse> entry : parseResult) {
+      parse = entry.getValue();
       LOG.info("---------\nUrl\n---------------\n");
       System.out.print(entry.getKey());
       LOG.info("\n---------\nParseData\n---------\n");