You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2014/11/06 23:00:19 UTC
svn commit: r1637237 - in /nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
src/java/org/apache/nutch/parse/ParserChecker.java
Author: snagel
Date: Thu Nov 6 22:00:18 2014
New Revision: 1637237
URL: http://svn.apache.org/r1637237
Log:
NUTCH-1884 NullPointerException in parsechecker and indexchecker with symlinks in file URL
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1637237&r1=1637236&r2=1637237&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Nov 6 22:00:18 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1884 NullPointerException in parsechecker and indexchecker with symlinks in file URL (Mengying Wang, snagel)
+
* NUTCH-1825 protocol-http may hang for certain web pages (Phu Kieu via snagel)
* NUTCH-1483 Can't crawl filesystem with protocol-file plugin (Rogério Pereira Araújo, Mengying Wang, snagel)
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1637237&r1=1637236&r2=1637237&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Thu Nov 6 22:00:18 2014
@@ -17,7 +17,9 @@
package org.apache.nutch.indexer;
+import java.util.Iterator;
import java.util.List;
+import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
@@ -133,6 +135,16 @@ public class IndexingFiltersChecker exte
Inlinks inlinks = null;
Parse parse = parseResult.get(urlText);
+ if (parse == null) {
+ LOG.error("Failed to get parse from parse result");
+ LOG.error("Available parses in parse result (by URL key):");
+ for (Map.Entry<Text, Parse> entry : parseResult) {
+ LOG.error(" " + entry.getKey());
+ }
+ LOG.error("Parse result does not contain a parse for URL to be checked:");
+ LOG.error(" " + urlText);
+ return -1;
+ }
byte[] signature = SignatureFactory.getSignature(conf).calculate(content,
parse);
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1637237&r1=1637236&r2=1637237&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Thu Nov 6 22:00:18 2014
@@ -19,11 +19,13 @@ package org.apache.nutch.parse;
import java.util.HashMap;
import java.util.Iterator;
+import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
@@ -164,14 +166,15 @@ public class ParserChecker implements To
scfilters.passScoreBeforeParsing(turl, cd, content);
} catch (Exception e) {
if (LOG.isWarnEnabled()) {
- LOG.warn("Couldn't pass score, url " + turl.toString() + " (" + e + ")");
+ LOG.warn("Couldn't pass score before parsing, url " + turl + " (" + e + ")");
+ LOG.warn(StringUtils.stringifyException(e));
}
}
ParseResult parseResult = new ParseUtil(conf).parse(content);
if (parseResult == null) {
- LOG.error("Problem with parse - check log");
+ LOG.error("Parsing content failed!");
return (-1);
}
@@ -184,17 +187,30 @@ public class ParserChecker implements To
LOG.info("signature: " + StringUtil.toHexString(signature));
}
+ Parse parse = parseResult.get(turl);
+ if (parse == null) {
+ LOG.error("Failed to get parse from parse result");
+ LOG.error("Available parses in parse result (by URL key):");
+ for (Map.Entry<Text, Parse> entry : parseResult) {
+ LOG.error(" " + entry.getKey());
+ }
+ LOG.error("Parse result does not contain a parse for URL to be checked:");
+ LOG.error(" " + turl);
+ return -1;
+ }
+
// call the scoring filters
try {
- scfilters.passScoreAfterParsing(turl, content, parseResult.get(turl));
+ scfilters.passScoreAfterParsing(turl, content, parse);
} catch (Exception e) {
if (LOG.isWarnEnabled()) {
- LOG.warn("Couldn't pass score, url " + turl + " (" + e + ")");
+ LOG.warn("Couldn't pass score after parsing, url " + turl + " (" + e + ")");
+ LOG.warn(StringUtils.stringifyException(e));
}
}
- for (java.util.Map.Entry<Text, Parse> entry : parseResult) {
- Parse parse = entry.getValue();
+ for (Map.Entry<Text, Parse> entry : parseResult) {
+ parse = entry.getValue();
LOG.info("---------\nUrl\n---------------\n");
System.out.print(entry.getKey());
LOG.info("\n---------\nParseData\n---------\n");