You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by fe...@apache.org on 2012/02/23 08:53:52 UTC

svn commit: r1292686 - in /nutch/trunk/src/java/org/apache/nutch: fetcher/Fetcher.java parse/ParseSegment.java

Author: ferdy
Date: Thu Feb 23 07:53:52 2012
New Revision: 1292686

URL: http://svn.apache.org/viewvc?rev=1292686&view=rev
Log:
Integrate NUTCH-965: skip parsing for truncated documents (commit 3)

Modified:
    nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
    nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java

Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=1292686&r1=1292685&r2=1292686&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Thu Feb 23 07:53:52 2012
@@ -589,7 +589,7 @@ public class Fetcher extends Configured 
     private int maxOutlinkDepth;
     private int maxOutlinkDepthNumLinks;
     private int outlinksDepthDivisor;
-    private boolean checkTruncated;
+    private boolean skipTruncated;
 
     public FetcherThread(Configuration conf) {
       this.setDaemon(true);                       // don't hang JVM on exit
@@ -598,7 +598,7 @@ public class Fetcher extends Configured 
       this.urlFilters = new URLFilters(conf);
       this.scfilters = new ScoringFilters(conf);
       this.parseUtil = new ParseUtil(conf);
-      this.checkTruncated = conf.getBoolean(ParseSegment.SKIP_TRUNCATED, true);
+      this.skipTruncated = conf.getBoolean(ParseSegment.SKIP_TRUNCATED, true);
       this.protocolFactory = new ProtocolFactory(conf);
       this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
       this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000;
@@ -947,7 +947,7 @@ public class Fetcher extends Configured 
         /* Note: Fetcher will only follow meta-redirects coming from the
          * original URL. */
         if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
-          if (!checkTruncated || (checkTruncated && !ParseSegment.isTruncated(content))) {
+          if (!skipTruncated || (skipTruncated && !ParseSegment.isTruncated(content))) {
             try {
               parseResult = this.parseUtil.parse(content);
             } catch (Exception e) {

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=1292686&r1=1292685&r2=1292686&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Thu Feb 23 07:53:52 2012
@@ -104,7 +104,7 @@ public class ParseSegment extends Config
       return;
     }
     
-    if (isTruncated(content)) {
+    if (skipTruncated && isTruncated(content)) {
       return;
     }
 
@@ -173,18 +173,23 @@ public class ParseSegment extends Config
     if (StringUtil.isEmpty(lengthStr)) {
       return false;
     }
-    int contentLength;
+    int inHeaderSize;
+    String url = content.getUrl();
     try {
-      contentLength = Integer.parseInt(lengthStr);
+      inHeaderSize = Integer.parseInt(lengthStr);
     } catch (NumberFormatException e) {
-      LOG.warn("Wrong contentlength format for " + content.getUrl(), e);
+      LOG.warn("Wrong contentlength format for " + url, e);
       return false;
     }
-    if (contentLength > contentBytes.length) {
-      LOG.info(content.getUrl() + " skipped. Content of size " + contentLength
-          + " was truncated to " + contentBytes.length);
+    int actualSize = contentBytes.length;
+    if (inHeaderSize > actualSize) {
+      LOG.info(url + " skipped. Content of size " + inHeaderSize
+          + " was truncated to " + actualSize);
       return true;
     }
+    if (LOG.isDebugEnabled()) {
+      LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize=" + inHeaderSize);
+    }
     return false;
   }