You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by fe...@apache.org on 2012/02/23 08:53:03 UTC

svn commit: r1292684 - /nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java

Author: ferdy
Date: Thu Feb 23 07:53:03 2012
New Revision: 1292684

URL: http://svn.apache.org/viewvc?rev=1292684&view=rev
Log:
integrate NUTCH-965 Skip parsing for truncated documents (commit 3)

Modified:
    nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java

Modified: nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java?rev=1292684&r1=1292683&r2=1292684&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java (original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java Thu Feb 23 07:53:03 2012
@@ -115,10 +115,8 @@ public class ParserJob extends NutchTool
         LOG.info("Parsing " + unreverseKey);
       }
 
-      if (skipTruncated) {
-        if (isTruncated(unreverseKey, page)) {
-          return;
-        }
+      if (skipTruncated && isTruncated(unreverseKey, page)) {
+        return;
       }
       
 
@@ -165,19 +163,22 @@ public class ParserJob extends NutchTool
     if (StringUtil.isEmpty(lengthStr)) {
       return false;
     }
-    int contentLength;
+    int inHeaderSize;
     try {
-      contentLength = Integer.parseInt(lengthStr);
+      inHeaderSize = Integer.parseInt(lengthStr);
     } catch (NumberFormatException e) {
       LOG.warn("Wrong contentlength format for " + url, e);
       return false;
     }
-    if (contentLength > content.limit()) {
-      LOG.info(url + " skipped. Content of size " + contentLength
-          + " was truncated to " + content.limit());
+    int actualSize = content.limit();
+    if (inHeaderSize > actualSize) {
+      LOG.warn(url + " skipped. Content of size " + inHeaderSize
+          + " was truncated to " + actualSize);
       return true;
     }
-    LOG.info(url + " actual=" + content.limit() + " inHeader=" + contentLength);
+    if (LOG.isDebugEnabled()) {
+      LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize=" + inHeaderSize);
+    }
     return false;
   }