You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by fe...@apache.org on 2012/02/23 08:53:52 UTC
svn commit: r1292686 - in /nutch/trunk/src/java/org/apache/nutch: fetcher/Fetcher.java parse/ParseSegment.java
Author: ferdy
Date: Thu Feb 23 07:53:52 2012
New Revision: 1292686
URL: http://svn.apache.org/viewvc?rev=1292686&view=rev
Log:
integrate NUTCH-965 Skip parsing for truncated documents (commit 3)
Modified:
nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=1292686&r1=1292685&r2=1292686&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Thu Feb 23 07:53:52 2012
@@ -589,7 +589,7 @@ public class Fetcher extends Configured
private int maxOutlinkDepth;
private int maxOutlinkDepthNumLinks;
private int outlinksDepthDivisor;
- private boolean checkTruncated;
+ private boolean skipTruncated;
public FetcherThread(Configuration conf) {
this.setDaemon(true); // don't hang JVM on exit
@@ -598,7 +598,7 @@ public class Fetcher extends Configured
this.urlFilters = new URLFilters(conf);
this.scfilters = new ScoringFilters(conf);
this.parseUtil = new ParseUtil(conf);
- this.checkTruncated = conf.getBoolean(ParseSegment.SKIP_TRUNCATED, true);
+ this.skipTruncated = conf.getBoolean(ParseSegment.SKIP_TRUNCATED, true);
this.protocolFactory = new ProtocolFactory(conf);
this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000;
@@ -947,7 +947,7 @@ public class Fetcher extends Configured
/* Note: Fetcher will only follow meta-redirects coming from the
* original URL. */
if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
- if (!checkTruncated || (checkTruncated && !ParseSegment.isTruncated(content))) {
+ if (!skipTruncated || (skipTruncated && !ParseSegment.isTruncated(content))) {
try {
parseResult = this.parseUtil.parse(content);
} catch (Exception e) {
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=1292686&r1=1292685&r2=1292686&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Thu Feb 23 07:53:52 2012
@@ -104,7 +104,7 @@ public class ParseSegment extends Config
return;
}
- if (isTruncated(content)) {
+ if (skipTruncated && isTruncated(content)) {
return;
}
@@ -173,18 +173,23 @@ public class ParseSegment extends Config
if (StringUtil.isEmpty(lengthStr)) {
return false;
}
- int contentLength;
+ int inHeaderSize;
+ String url = content.getUrl();
try {
- contentLength = Integer.parseInt(lengthStr);
+ inHeaderSize = Integer.parseInt(lengthStr);
} catch (NumberFormatException e) {
- LOG.warn("Wrong contentlength format for " + content.getUrl(), e);
+ LOG.warn("Wrong contentlength format for " + url, e);
return false;
}
- if (contentLength > contentBytes.length) {
- LOG.info(content.getUrl() + " skipped. Content of size " + contentLength
- + " was truncated to " + contentBytes.length);
+ int actualSize = contentBytes.length;
+ if (inHeaderSize > actualSize) {
+ LOG.info(url + " skipped. Content of size " + inHeaderSize
+ + " was truncated to " + actualSize);
return true;
}
+ if (LOG.isDebugEnabled()) {
+ LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize=" + inHeaderSize);
+ }
return false;
}