You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by fe...@apache.org on 2012/02/22 09:41:47 UTC

svn commit: r1292184 - in /nutch/branches/nutchgora: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/fetcher/FetcherReducer.java src/java/org/apache/nutch/parse/ParserJob.java

Author: ferdy
Date: Wed Feb 22 08:41:46 2012
New Revision: 1292184

URL: http://svn.apache.org/viewvc?rev=1292184&view=rev
Log:
NUTCH-965 Skip parsing for truncated documents

Modified:
    nutch/branches/nutchgora/CHANGES.txt
    nutch/branches/nutchgora/conf/nutch-default.xml
    nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherReducer.java
    nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java

Modified: nutch/branches/nutchgora/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/CHANGES.txt?rev=1292184&r1=1292183&r2=1292184&view=diff
==============================================================================
--- nutch/branches/nutchgora/CHANGES.txt (original)
+++ nutch/branches/nutchgora/CHANGES.txt Wed Feb 22 08:41:46 2012
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release nutchgora - Current Development
 
+* NUTCH-965 Skip parsing for truncated documents (alexis, lewismc, ferdy)
+
 * NUTCH-1287 Upgrade to hsqldb 2.2.8 (ferdy)
 
 * NUTCH-1280 language-identifier should have option to use detected value by Tika even when uncertain (ferdy)

Modified: nutch/branches/nutchgora/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/conf/nutch-default.xml?rev=1292184&r1=1292183&r2=1292184&view=diff
==============================================================================
--- nutch/branches/nutchgora/conf/nutch-default.xml (original)
+++ nutch/branches/nutchgora/conf/nutch-default.xml Wed Feb 22 08:41:46 2012
@@ -909,6 +909,14 @@
   </description>
 </property>
 
+<property>
+  <name>parser.skip.truncated</name>
+  <value>true</value>
+  <description>Boolean value for whether we should skip parsing for truncated documents. By default this
+  property is activated because parsing truncated documents can sometimes consume extremely high levels of CPU.
+  </description>
+</property>
+
 <!-- urlfilter plugin properties -->
 
 <property>

Modified: nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherReducer.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherReducer.java?rev=1292184&r1=1292183&r2=1292184&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherReducer.java (original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherReducer.java Wed Feb 22 08:41:46 2012
@@ -43,6 +43,7 @@ import org.apache.nutch.net.URLFilterExc
 import org.apache.nutch.net.URLFilters;
 import org.apache.nutch.net.URLNormalizers;
 import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParserJob;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolFactory;
@@ -81,6 +82,7 @@ extends GoraReducer<IntWritable, FetchEn
   private boolean parse;
 
   private ParseUtil parseUtil;
+  private boolean skipTruncated;
 
   /**
    * This class described the item to be fetched.
@@ -604,10 +606,14 @@ extends GoraReducer<IntWritable, FetchEn
       String key = TableUtil.reverseUrl(fit.url);
 
       if (parse) {
-        URLWebPage redirectedPage = parseUtil.process(key, fit.page);
-        if (redirectedPage != null) {
-          context.write(TableUtil.reverseUrl(redirectedPage.getUrl()),
-                        redirectedPage.getDatum());
+        // Always parse unless truncated-skipping is on and the page is truncated.
+        boolean skip = skipTruncated && ParserJob.isTruncated(fit.url, fit.page);
+        if (!skip) {
+          URLWebPage redirectedPage = parseUtil.process(key, fit.page);
+          if (redirectedPage != null) {
+            context.write(TableUtil.reverseUrl(redirectedPage.getUrl()),
+                          redirectedPage.getDatum());
+          }
         }
       }
       context.write(key, fit.page);
@@ -723,6 +729,7 @@ extends GoraReducer<IntWritable, FetchEn
     int threadCount = conf.getInt("fetcher.threads.fetch", 10);
     parse = conf.getBoolean(FetcherJob.PARSE_KEY, false);
     if (parse) {
+      skipTruncated=conf.getBoolean(ParserJob.SKIP_TRUNCATED, true);
       parseUtil = new ParseUtil(conf);
     }
     LOG.info("Fetcher: threads: " + threadCount);

Modified: nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java?rev=1292184&r1=1292183&r2=1292184&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java (original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java Wed Feb 22 08:41:46 2012
@@ -17,6 +17,7 @@
 package org.apache.nutch.parse;
 
 import java.io.IOException;
+import java.nio.ByteBuffer;
 import java.util.Collection;
 import java.util.HashSet;
 import java.util.Map;
@@ -31,6 +32,7 @@ import org.apache.hadoop.util.ToolRunner
 import org.apache.nutch.crawl.GeneratorJob;
 import org.apache.nutch.crawl.SignatureFactory;
 import org.apache.nutch.crawl.URLWebPage;
+import org.apache.nutch.metadata.HttpHeaders;
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.storage.Mark;
 import org.apache.nutch.storage.ParseStatus;
@@ -40,6 +42,7 @@ import org.apache.nutch.util.IdentityPag
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
 import org.apache.nutch.util.NutchTool;
+import org.apache.nutch.util.StringUtil;
 import org.apache.nutch.util.TableUtil;
 import org.apache.nutch.util.ToolUtil;
 import org.apache.gora.mapreduce.GoraMapper;
@@ -50,6 +53,8 @@ public class ParserJob extends NutchTool
 
   private static final String RESUME_KEY = "parse.job.resume";
   private static final String FORCE_KEY = "parse.job.force";
+  
+  public static final String SKIP_TRUNCATED = "parser.skip.truncated";
 
   private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
 
@@ -64,6 +69,7 @@ public class ParserJob extends NutchTool
     FIELDS.add(WebPage.Field.PARSE_STATUS);
     FIELDS.add(WebPage.Field.OUTLINKS);
     FIELDS.add(WebPage.Field.METADATA);
+    FIELDS.add(WebPage.Field.HEADERS);
   }
 
 
@@ -76,6 +82,8 @@ public class ParserJob extends NutchTool
     private boolean force;
 
     private Utf8 batchId;
+
+    private boolean skipTruncated;
     
     @Override
     public void setup(Context context) throws IOException {
@@ -84,6 +92,7 @@ public class ParserJob extends NutchTool
       shouldResume = conf.getBoolean(RESUME_KEY, false);
       force = conf.getBoolean(FORCE_KEY, false);
       batchId = new Utf8(conf.get(GeneratorJob.BATCH_ID, Nutch.ALL_BATCH_ID_STR));
+      skipTruncated=conf.getBoolean(SKIP_TRUNCATED, true);
     }
 
     @Override
@@ -106,6 +115,13 @@ public class ParserJob extends NutchTool
         LOG.info("Parsing " + unreverseKey);
       }
 
+      if (skipTruncated) {
+        if (isTruncated(unreverseKey, page)) {
+          return;
+        }
+      }
+      
+
       URLWebPage redirectedPage = parseUtil.process(key, page);
       ParseStatus pstatus = page.getParseStatus();
       if (pstatus != null) {
@@ -128,6 +144,42 @@ public class ParserJob extends NutchTool
   public ParserJob(Configuration conf) {
     setConf(conf);
   }
+  
+  /**
+   * Checks if the page's content is shorter than its Content-Length header claims.
+   * @param url the page's URL, used only in log messages
+   * @param page the page whose content and headers are inspected
+   * @return If the page is truncated <code>true</code>. When it is not,
+   * or when it could not be determined, <code>false</code>.
+   */
+  public static boolean isTruncated(String url, WebPage page) {
+    ByteBuffer content = page.getContent();
+    if (content == null) {
+      return false;
+    }
+    Utf8 lengthUtf8 = page.getFromHeaders(new Utf8(HttpHeaders.CONTENT_LENGTH));
+    if (lengthUtf8 == null) {
+      return false;
+    }
+    String lengthStr = lengthUtf8.toString().trim();
+    if (StringUtil.isEmpty(lengthStr)) {
+      return false;
+    }
+    long contentLength; // long: bodies of 2 GB and more must not fail to parse
+    try {
+      contentLength = Long.parseLong(lengthStr);
+    } catch (NumberFormatException e) {
+      LOG.warn("Wrong contentlength format for " + url, e);
+      return false;
+    }
+    if (contentLength > content.limit()) {
+      LOG.info(url + " skipped. Content of size " + contentLength
+          + " was truncated to " + content.limit());
+      return true;
+    }
+    LOG.info(url + " actual=" + content.limit() + " inHeader=" + contentLength);
+    return false;
+  }
 
   public Collection<WebPage.Field> getFields(Job job) {
     Configuration conf = job.getConfiguration();