You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by fe...@apache.org on 2012/02/23 08:53:03 UTC
svn commit: r1292684 - /nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java
Author: ferdy
Date: Thu Feb 23 07:53:03 2012
New Revision: 1292684
URL: http://svn.apache.org/viewvc?rev=1292684&view=rev
Log:
integrate NUTCH-965 Skip parsing for truncated documents (commit 3)
Modified:
nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java
Modified: nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java?rev=1292684&r1=1292683&r2=1292684&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java (original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java Thu Feb 23 07:53:03 2012
@@ -115,10 +115,8 @@ public class ParserJob extends NutchTool
LOG.info("Parsing " + unreverseKey);
}
- if (skipTruncated) {
- if (isTruncated(unreverseKey, page)) {
- return;
- }
+ if (skipTruncated && isTruncated(unreverseKey, page)) {
+ return;
}
@@ -165,19 +163,22 @@ public class ParserJob extends NutchTool
if (StringUtil.isEmpty(lengthStr)) {
return false;
}
- int contentLength;
+ int inHeaderSize;
try {
- contentLength = Integer.parseInt(lengthStr);
+ inHeaderSize = Integer.parseInt(lengthStr);
} catch (NumberFormatException e) {
LOG.warn("Wrong contentlength format for " + url, e);
return false;
}
- if (contentLength > content.limit()) {
- LOG.info(url + " skipped. Content of size " + contentLength
- + " was truncated to " + content.limit());
+ int actualSize = content.limit();
+ if (inHeaderSize > actualSize) {
+ LOG.warn(url + " skipped. Content of size " + inHeaderSize
+ + " was truncated to " + actualSize);
return true;
}
- LOG.info(url + " actual=" + content.limit() + " inHeader=" + contentLength);
+ if (LOG.isDebugEnabled()) {
+ LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize=" + inHeaderSize);
+ }
return false;
}