You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by fe...@apache.org on 2012/02/22 09:41:58 UTC
svn commit: r1292185 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml
src/java/org/apache/nutch/fetcher/Fetcher.java
src/java/org/apache/nutch/parse/ParseSegment.java
Author: ferdy
Date: Wed Feb 22 08:41:58 2012
New Revision: 1292185
URL: http://svn.apache.org/viewvc?rev=1292185&view=rev
Log:
NUTCH-965 Skip parsing for truncated documents
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1292185&r1=1292184&r2=1292185&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Feb 22 08:41:58 2012
@@ -1,5 +1,7 @@
Nutch Change Log
+* NUTCH-965 Skip parsing for truncated documents (alexis, lewismc, ferdy)
+
* NUTCH-1193 Incorrect url transform to lowercase: parameter solr (Eduardo dos Santos Leggiero via lewismc)
* NUTCH-1272 Wrong property name for index-static in nutch-default.xml (Daniel Baur via jnioche)
Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1292185&r1=1292184&r2=1292185&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Wed Feb 22 08:41:58 2012
@@ -1062,6 +1062,14 @@
<description>Whether the parser will normalize URLs (with the configured URL normalizers).</description>
</property>
+<property>
+ <name>parser.skip.truncated</name>
+ <value>true</value>
+ <description>Boolean value for whether we should skip parsing for truncated documents. By default this
+ property is activated because parsing truncated documents can sometimes consume extremely high levels of CPU.
+ </description>
+</property>
+
<!-- urlfilter plugin properties -->
<property>
Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=1292185&r1=1292184&r2=1292185&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Wed Feb 22 08:41:58 2012
@@ -589,6 +589,7 @@ public class Fetcher extends Configured
private int maxOutlinkDepth;
private int maxOutlinkDepthNumLinks;
private int outlinksDepthDivisor;
+ private boolean checkTruncated;
public FetcherThread(Configuration conf) {
this.setDaemon(true); // don't hang JVM on exit
@@ -597,6 +598,7 @@ public class Fetcher extends Configured
this.urlFilters = new URLFilters(conf);
this.scfilters = new ScoringFilters(conf);
this.parseUtil = new ParseUtil(conf);
+ this.checkTruncated = conf.getBoolean(ParseSegment.SKIP_TRUNCATED, true);
this.protocolFactory = new ProtocolFactory(conf);
this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000;
@@ -945,17 +947,19 @@ public class Fetcher extends Configured
/* Note: Fetcher will only follow meta-redirects coming from the
* original URL. */
if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
- try {
- parseResult = this.parseUtil.parse(content);
- } catch (Exception e) {
- LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
- }
-
- if (parseResult == null) {
- byte[] signature =
- SignatureFactory.getSignature(getConf()).calculate(content,
- new ParseStatus().getEmptyParse(conf));
- datum.setSignature(signature);
+ // Parse unless truncation checking is enabled AND the content is truncated.
+ // isTruncated() returns true for truncated content, so it must be negated here;
+ // otherwise only truncated documents would be parsed.
+ if (!checkTruncated || !ParseSegment.isTruncated(content)) {
+   try {
+     parseResult = this.parseUtil.parse(content);
+   } catch (Exception e) {
+     LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
+   }
+
+   // No parse result: fall back to a signature computed from an empty parse.
+   if (parseResult == null) {
+     byte[] signature =
+       SignatureFactory.getSignature(getConf()).calculate(content,
+         new ParseStatus().getEmptyParse(conf));
+     datum.setSignature(signature);
+   }
}
}
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=1292185&r1=1292184&r2=1292185&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Wed Feb 22 08:41:58 2012
@@ -17,26 +17,44 @@
package org.apache.nutch.parse;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.Iterator;
+import java.util.Map.Entry;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.SignatureFactory;
-import org.apache.hadoop.io.*;
-import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.*;
-import org.apache.hadoop.conf.*;
+import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
-import org.apache.nutch.protocol.*;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Content;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
-import org.apache.nutch.util.*;
-import org.apache.hadoop.fs.Path;
-
-import java.io.*;
-import java.text.SimpleDateFormat;
-import java.util.*;
-import java.util.Map.Entry;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.StringUtil;
+import org.apache.nutch.util.TimingUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/* Parse content in a segment. */
public class ParseSegment extends Configured implements Tool,
@@ -45,8 +63,12 @@ public class ParseSegment extends Config
public static final Logger LOG = LoggerFactory.getLogger(ParseSegment.class);
+ public static final String SKIP_TRUNCATED = "parser.skip.truncated";
+
private ScoringFilters scfilters;
+ private boolean skipTruncated;
+
public ParseSegment() {
this(null);
}
@@ -58,6 +80,7 @@ public class ParseSegment extends Config
public void configure(JobConf job) {
setConf(job);
this.scfilters = new ScoringFilters(job);
+ skipTruncated=job.getBoolean(SKIP_TRUNCATED, true);
}
public void close() {}
@@ -80,6 +103,10 @@ public class ParseSegment extends Config
LOG.debug("Skipping " + key + " as content is not fetched successfully");
return;
}
+
+ // Honour parser.skip.truncated: truncated content is only skipped when the
+ // option (read into skipTruncated by configure()) is enabled.
+ if (skipTruncated && isTruncated(content)) {
+   return;
+ }
ParseResult parseResult = null;
try {
@@ -128,6 +155,38 @@ public class ParseSegment extends Config
parse.getData(), parse.isCanonical()));
}
}
+
+  /**
+   * Checks whether the page's content is truncated, i.e. whether fewer bytes
+   * are stored than the <code>Content-Length</code> metadata value announces.
+   *
+   * @param content the fetched content to inspect
+   * @return <code>true</code> if the announced length is larger than the
+   *         number of stored bytes. <code>false</code> when it is not, or when
+   *         truncation could not be determined (no content bytes, no metadata,
+   *         or a missing or malformed <code>Content-Length</code> value).
+   */
+  public static boolean isTruncated(Content content) {
+    byte[] contentBytes = content.getContent();
+    if (contentBytes == null) return false;
+    Metadata metadata = content.getMetadata();
+    if (metadata == null) return false;
+
+    String lengthStr = metadata.get(Response.CONTENT_LENGTH);
+    if (lengthStr != null) lengthStr = lengthStr.trim();
+    if (StringUtil.isEmpty(lengthStr)) {
+      return false;
+    }
+    // Parse as long: announced lengths above Integer.MAX_VALUE are legitimate and
+    // must not end up in the NumberFormatException branch ("not truncated").
+    long contentLength;
+    try {
+      contentLength = Long.parseLong(lengthStr);
+    } catch (NumberFormatException e) {
+      LOG.warn("Wrong contentlength format for " + content.getUrl(), e);
+      return false;
+    }
+    if (contentLength > contentBytes.length) {
+      LOG.info(content.getUrl() + " skipped. Content of size " + contentLength
+          + " was truncated to " + contentBytes.length);
+      return true;
+    }
+    return false;
+  }
public void reduce(Text key, Iterator<Writable> values,
OutputCollector<Text, Writable> output, Reporter reporter)