You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by fe...@apache.org on 2012/02/24 13:36:10 UTC

svn commit: r1293225 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/fetcher/Fetcher.java src/java/org/apache/nutch/parse/ParseSegment.java

Author: ferdy
Date: Fri Feb 24 12:36:09 2012
New Revision: 1293225

URL: http://svn.apache.org/viewvc?rev=1293225&view=rev
Log:
REVERT NUTCH-965 Skip parsing for truncated documents

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
    nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1293225&r1=1293224&r2=1293225&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Feb 24 12:36:09 2012
@@ -2,8 +2,6 @@ Nutch Change Log
 
 * NUTCH-1210 DomainBlacklistFilter (markus)
 
-* NUTCH-965 Skip parsing for truncated documents (alexis, lewismc, ferdy)
-
 * NUTCH-1193 Incorrect url transform to lowercase: parameter solr (Eduardo dos Santos Leggiero via lewismc)
 
 * NUTCH-1272 Wrong property name for index-static in nutch-default.xml (Daniel Baur via jnioche)

Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1293225&r1=1293224&r2=1293225&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Fri Feb 24 12:36:09 2012
@@ -1062,14 +1062,6 @@
   <description>Whether the parser will normalize URLs (with the configured URL normalizers).</description>
 </property>
 
-<property>
-  <name>parser.skip.truncated</name>
-  <value>true</value>
-  <description>Boolean value for whether we should skip parsing for truncated documents. By default this 
-  property is activated due to extremely high levels of CPU which parsing can sometimes take.  
-  </description>
-</property>
-
 <!-- urlfilter plugin properties -->
 
 <property>

Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=1293225&r1=1293224&r2=1293225&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Fri Feb 24 12:36:09 2012
@@ -589,7 +589,6 @@ public class Fetcher extends Configured 
     private int maxOutlinkDepth;
     private int maxOutlinkDepthNumLinks;
     private int outlinksDepthDivisor;
-    private boolean skipTruncated;
 
     public FetcherThread(Configuration conf) {
       this.setDaemon(true);                       // don't hang JVM on exit
@@ -598,7 +597,6 @@ public class Fetcher extends Configured 
       this.urlFilters = new URLFilters(conf);
       this.scfilters = new ScoringFilters(conf);
       this.parseUtil = new ParseUtil(conf);
-      this.skipTruncated = conf.getBoolean(ParseSegment.SKIP_TRUNCATED, true);
       this.protocolFactory = new ProtocolFactory(conf);
       this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
       this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000;
@@ -947,19 +945,17 @@ public class Fetcher extends Configured 
         /* Note: Fetcher will only follow meta-redirects coming from the
          * original URL. */
         if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
-          if (!skipTruncated || (skipTruncated && !ParseSegment.isTruncated(content))) {
-            try {
-              parseResult = this.parseUtil.parse(content);
-            } catch (Exception e) {
-              LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
-            }
-  
-            if (parseResult == null) {
-              byte[] signature =
-                SignatureFactory.getSignature(getConf()).calculate(content,
-                    new ParseStatus().getEmptyParse(conf));
-              datum.setSignature(signature);
-            }
+          try {
+            parseResult = this.parseUtil.parse(content);
+          } catch (Exception e) {
+            LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
+          }
+
+          if (parseResult == null) {
+            byte[] signature =
+              SignatureFactory.getSignature(getConf()).calculate(content,
+                  new ParseStatus().getEmptyParse(conf));
+            datum.setSignature(signature);
           }
         }
 

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=1293225&r1=1293224&r2=1293225&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Fri Feb 24 12:36:09 2012
@@ -17,44 +17,26 @@
 
 package org.apache.nutch.parse;
 
-import java.io.IOException;
-import java.text.SimpleDateFormat;
-import java.util.Iterator;
-import java.util.Map.Entry;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.UTF8;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.io.WritableComparable;
-import org.apache.hadoop.mapred.FileInputFormat;
-import org.apache.hadoop.mapred.FileOutputFormat;
-import org.apache.hadoop.mapred.JobClient;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.Mapper;
-import org.apache.hadoop.mapred.OutputCollector;
-import org.apache.hadoop.mapred.Reducer;
-import org.apache.hadoop.mapred.Reporter;
-import org.apache.hadoop.mapred.SequenceFileInputFormat;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.Tool;
-import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.SignatureFactory;
-import org.apache.nutch.metadata.Metadata;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.util.*;
+import org.apache.hadoop.conf.*;
 import org.apache.nutch.metadata.Nutch;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.*;
 import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.scoring.ScoringFilters;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.StringUtil;
-import org.apache.nutch.util.TimingUtil;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import org.apache.nutch.util.*;
+import org.apache.hadoop.fs.Path;
+
+import java.io.*;
+import java.text.SimpleDateFormat;
+import java.util.*;
+import java.util.Map.Entry;
 
 /* Parse content in a segment. */
 public class ParseSegment extends Configured implements Tool,
@@ -63,12 +45,8 @@ public class ParseSegment extends Config
 
   public static final Logger LOG = LoggerFactory.getLogger(ParseSegment.class);
   
-  public static final String SKIP_TRUNCATED = "parser.skip.truncated";
-  
   private ScoringFilters scfilters;
   
-  private boolean skipTruncated;
-  
   public ParseSegment() {
     this(null);
   }
@@ -80,7 +58,6 @@ public class ParseSegment extends Config
   public void configure(JobConf job) {
     setConf(job);
     this.scfilters = new ScoringFilters(job);
-    skipTruncated=job.getBoolean(SKIP_TRUNCATED, true);
   }
 
   public void close() {}
@@ -103,10 +80,6 @@ public class ParseSegment extends Config
       LOG.debug("Skipping " + key + " as content is not fetched successfully");
       return;
     }
-    
-    if (skipTruncated && isTruncated(content)) {
-      return;
-    }
 
     ParseResult parseResult = null;
     try {
@@ -155,43 +128,6 @@ public class ParseSegment extends Config
                                         parse.getData(), parse.isCanonical()));
     }
   }
-  
-  /**
-   * Checks if the page's content is truncated.
-   * @param content
-   * @return If the page is truncated <code>true</code>. When it is not,
-   * or when it could be determined, <code>false</code>. 
-   */
-  public static boolean isTruncated(Content content) {
-    byte[] contentBytes = content.getContent();
-    if (contentBytes == null) return false;
-    Metadata metadata = content.getMetadata();
-    if (metadata == null) return false;
-    
-    String lengthStr = metadata.get(Response.CONTENT_LENGTH);
-    if (lengthStr != null) lengthStr=lengthStr.trim();
-    if (StringUtil.isEmpty(lengthStr)) {
-      return false;
-    }
-    int inHeaderSize;
-    String url = content.getUrl();
-    try {
-      inHeaderSize = Integer.parseInt(lengthStr);
-    } catch (NumberFormatException e) {
-      LOG.warn("Wrong contentlength format for " + url, e);
-      return false;
-    }
-    int actualSize = contentBytes.length;
-    if (inHeaderSize > actualSize) {
-      LOG.info(url + " skipped. Content of size " + inHeaderSize
-          + " was truncated to " + actualSize);
-      return true;
-    }
-    if (LOG.isDebugEnabled()) {
-      LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize=" + inHeaderSize);
-    }
-    return false;
-  }
 
   public void reduce(Text key, Iterator<Writable> values,
                      OutputCollector<Text, Writable> output, Reporter reporter)