You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by fe...@apache.org on 2012/02/24 13:44:52 UTC
svn commit: r1293228 - in /nutch/branches/nutchgora: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/fetcher/FetcherReducer.java src/java/org/apache/nutch/parse/ParserJob.java

Author: ferdy
Date: Fri Feb 24 12:44:52 2012
New Revision: 1293228

URL: http://svn.apache.org/viewvc?rev=1293228&view=rev
Log:
REVERT NUTCH-965 Skip parsing for truncated documents

Modified:
    nutch/branches/nutchgora/CHANGES.txt
    nutch/branches/nutchgora/conf/nutch-default.xml
    nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherReducer.java
    nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java

Modified: nutch/branches/nutchgora/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/CHANGES.txt?rev=1293228&r1=1293227&r2=1293228&view=diff
==============================================================================
--- nutch/branches/nutchgora/CHANGES.txt (original)
+++ nutch/branches/nutchgora/CHANGES.txt Fri Feb 24 12:44:52 2012
@@ -2,8 +2,6 @@ Nutch Change Log
 
 Release nutchgora - Current Development
 
-* NUTCH-965 Skip parsing for truncated documents (alexis, lewismc, ferdy)
-
 * NUTCH-1287 Upgrade to hsqldb 2.2.8 (ferdy)
 
 * NUTCH-1280 language-identifier should have option to use detected value by Tika even when uncertain (ferdy)

Modified: nutch/branches/nutchgora/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/conf/nutch-default.xml?rev=1293228&r1=1293227&r2=1293228&view=diff
==============================================================================
--- nutch/branches/nutchgora/conf/nutch-default.xml (original)
+++ nutch/branches/nutchgora/conf/nutch-default.xml Fri Feb 24 12:44:52 2012
@@ -909,14 +909,6 @@
   </description>
 </property>
 
-<property>
-  <name>parser.skip.truncated</name>
-  <value>true</value>
-  <description>Boolean value for whether we should skip parsing for truncated documents. By default this 
-  property is activated due to extremely high levels of CPU which parsing can sometimes take.  
-  </description>
-</property>
-
 <!-- urlfilter plugin properties -->
 
 <property>

Modified: nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherReducer.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherReducer.java?rev=1293228&r1=1293227&r2=1293228&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherReducer.java (original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherReducer.java Fri Feb 24 12:44:52 2012
@@ -43,7 +43,6 @@ import org.apache.nutch.net.URLFilterExc
 import org.apache.nutch.net.URLFilters;
 import org.apache.nutch.net.URLNormalizers;
 import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.ParserJob;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolFactory;
@@ -82,7 +81,6 @@ extends GoraReducer<IntWritable, FetchEn
   private boolean parse;
 
   private ParseUtil parseUtil;
-  private boolean skipTruncated;
 
   /**
    * This class described the item to be fetched.
@@ -606,12 +604,10 @@ extends GoraReducer<IntWritable, FetchEn
       String key = TableUtil.reverseUrl(fit.url);
 
       if (parse) {
-        if (!skipTruncated || (skipTruncated && !ParserJob.isTruncated(fit.url, fit.page))) {
-          URLWebPage redirectedPage = parseUtil.process(key, fit.page);
-          if (redirectedPage != null) {
-            context.write(TableUtil.reverseUrl(redirectedPage.getUrl()),
-                redirectedPage.getDatum());
-          }
+        URLWebPage redirectedPage = parseUtil.process(key, fit.page);
+        if (redirectedPage != null) {
+          context.write(TableUtil.reverseUrl(redirectedPage.getUrl()),
+                        redirectedPage.getDatum());
         }
       }
       context.write(key, fit.page);
@@ -727,7 +723,6 @@ extends GoraReducer<IntWritable, FetchEn
     int threadCount = conf.getInt("fetcher.threads.fetch", 10);
     parse = conf.getBoolean(FetcherJob.PARSE_KEY, false);
     if (parse) {
-      skipTruncated=conf.getBoolean(ParserJob.SKIP_TRUNCATED, true);
       parseUtil = new ParseUtil(conf);
     }
     LOG.info("Fetcher: threads: " + threadCount);

Modified: nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java?rev=1293228&r1=1293227&r2=1293228&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java (original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java Fri Feb 24 12:44:52 2012
@@ -17,7 +17,6 @@
 package org.apache.nutch.parse;
 
 import java.io.IOException;
-import java.nio.ByteBuffer;
 import java.util.Collection;
 import java.util.HashSet;
 import java.util.Map;
@@ -32,7 +31,6 @@ import org.apache.hadoop.util.ToolRunner
 import org.apache.nutch.crawl.GeneratorJob;
 import org.apache.nutch.crawl.SignatureFactory;
 import org.apache.nutch.crawl.URLWebPage;
-import org.apache.nutch.metadata.HttpHeaders;
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.storage.Mark;
 import org.apache.nutch.storage.ParseStatus;
@@ -42,7 +40,6 @@ import org.apache.nutch.util.IdentityPag
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
 import org.apache.nutch.util.NutchTool;
-import org.apache.nutch.util.StringUtil;
 import org.apache.nutch.util.TableUtil;
 import org.apache.nutch.util.ToolUtil;
 import org.apache.gora.mapreduce.GoraMapper;
@@ -53,8 +50,6 @@ public class ParserJob extends NutchTool
 
   private static final String RESUME_KEY = "parse.job.resume";
   private static final String FORCE_KEY = "parse.job.force";
-  
-  public static final String SKIP_TRUNCATED = "parser.skip.truncated";
 
   private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
 
@@ -69,7 +64,6 @@ public class ParserJob extends NutchTool
     FIELDS.add(WebPage.Field.PARSE_STATUS);
     FIELDS.add(WebPage.Field.OUTLINKS);
     FIELDS.add(WebPage.Field.METADATA);
-    FIELDS.add(WebPage.Field.HEADERS);
   }
 
 
@@ -82,8 +76,6 @@ public class ParserJob extends NutchTool
     private boolean force;
 
     private Utf8 batchId;
-
-    private boolean skipTruncated;
     
     @Override
     public void setup(Context context) throws IOException {
@@ -92,7 +84,6 @@ public class ParserJob extends NutchTool
       shouldResume = conf.getBoolean(RESUME_KEY, false);
       force = conf.getBoolean(FORCE_KEY, false);
       batchId = new Utf8(conf.get(GeneratorJob.BATCH_ID, Nutch.ALL_BATCH_ID_STR));
-      skipTruncated=conf.getBoolean(SKIP_TRUNCATED, true);
     }
 
     @Override
@@ -115,11 +106,6 @@ public class ParserJob extends NutchTool
         LOG.info("Parsing " + unreverseKey);
       }
 
-      if (skipTruncated && isTruncated(unreverseKey, page)) {
-        return;
-      }
-      
-
       URLWebPage redirectedPage = parseUtil.process(key, page);
       ParseStatus pstatus = page.getParseStatus();
       if (pstatus != null) {
@@ -142,45 +128,6 @@ public class ParserJob extends NutchTool
   public ParserJob(Configuration conf) {
     setConf(conf);
   }
-  
-  /**
-   * Checks if the page's content is truncated.
-   * @param url 
-   * @param page
-   * @return If the page is truncated <code>true</code>. When it is not,
-   * or when it could be determined, <code>false</code>. 
-   */
-  public static boolean isTruncated(String url, WebPage page) {
-    ByteBuffer content = page.getContent();
-    if (content == null) {
-      return false;
-    }
-    Utf8 lengthUtf8 = page.getFromHeaders(new Utf8(HttpHeaders.CONTENT_LENGTH));
-    if (lengthUtf8 == null) {
-      return false;
-    }
-    String lengthStr = lengthUtf8.toString().trim();
-    if (StringUtil.isEmpty(lengthStr)) {
-      return false;
-    }
-    int inHeaderSize;
-    try {
-      inHeaderSize = Integer.parseInt(lengthStr);
-    } catch (NumberFormatException e) {
-      LOG.warn("Wrong contentlength format for " + url, e);
-      return false;
-    }
-    int actualSize = content.limit();
-    if (inHeaderSize > actualSize) {
-      LOG.warn(url + " skipped. Content of size " + inHeaderSize
-          + " was truncated to " + actualSize);
-      return true;
-    }
-    if (LOG.isDebugEnabled()) {
-      LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize=" + inHeaderSize);
-    }
-    return false;
-  }
 
   public Collection<WebPage.Field> getFields(Job job) {
     Configuration conf = job.getConfiguration();