You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by fe...@apache.org on 2012/02/22 09:41:47 UTC
svn commit: r1292184 - in /nutch/branches/nutchgora: CHANGES.txt
conf/nutch-default.xml src/java/org/apache/nutch/fetcher/FetcherReducer.java
src/java/org/apache/nutch/parse/ParserJob.java
Author: ferdy
Date: Wed Feb 22 08:41:46 2012
New Revision: 1292184
URL: http://svn.apache.org/viewvc?rev=1292184&view=rev
Log:
NUTCH-965 Skip parsing for truncated documents
Modified:
nutch/branches/nutchgora/CHANGES.txt
nutch/branches/nutchgora/conf/nutch-default.xml
nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherReducer.java
nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java
Modified: nutch/branches/nutchgora/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/CHANGES.txt?rev=1292184&r1=1292183&r2=1292184&view=diff
==============================================================================
--- nutch/branches/nutchgora/CHANGES.txt (original)
+++ nutch/branches/nutchgora/CHANGES.txt Wed Feb 22 08:41:46 2012
@@ -2,6 +2,8 @@ Nutch Change Log
Release nutchgora - Current Development
+* NUTCH-965 Skip parsing for truncated documents (alexis, lewismc, ferdy)
+
* NUTCH-1287 Upgrade to hsqldb 2.2.8 (ferdy)
* NUTCH-1280 language-identifier should have option to use detected value by Tika even when uncertain (ferdy)
Modified: nutch/branches/nutchgora/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/conf/nutch-default.xml?rev=1292184&r1=1292183&r2=1292184&view=diff
==============================================================================
--- nutch/branches/nutchgora/conf/nutch-default.xml (original)
+++ nutch/branches/nutchgora/conf/nutch-default.xml Wed Feb 22 08:41:46 2012
@@ -909,6 +909,14 @@
</description>
</property>
+<property>
+ <name>parser.skip.truncated</name>
+ <value>true</value>
+ <description>Boolean value for whether we should skip parsing for truncated documents. By default this
+ property is activated because parsing truncated documents can sometimes consume extremely high levels of CPU.
+ </description>
+</property>
+
<!-- urlfilter plugin properties -->
<property>
Modified: nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherReducer.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherReducer.java?rev=1292184&r1=1292183&r2=1292184&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherReducer.java (original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherReducer.java Wed Feb 22 08:41:46 2012
@@ -43,6 +43,7 @@ import org.apache.nutch.net.URLFilterExc
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParserJob;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
@@ -81,6 +82,7 @@ extends GoraReducer<IntWritable, FetchEn
private boolean parse;
private ParseUtil parseUtil;
+ private boolean skipTruncated;
/**
* This class described the item to be fetched.
@@ -604,10 +606,14 @@ extends GoraReducer<IntWritable, FetchEn
String key = TableUtil.reverseUrl(fit.url);
if (parse) {
- URLWebPage redirectedPage = parseUtil.process(key, fit.page);
- if (redirectedPage != null) {
- context.write(TableUtil.reverseUrl(redirectedPage.getUrl()),
- redirectedPage.getDatum());
+ // Parse unless truncated-skipping is enabled and the page is truncated;
+ // with skipping disabled every fetched page must still be parsed.
+ if (!skipTruncated || !ParserJob.isTruncated(fit.url, fit.page)) {
+ URLWebPage redirectedPage = parseUtil.process(key, fit.page);
+ if (redirectedPage != null) {
+ context.write(TableUtil.reverseUrl(redirectedPage.getUrl()),
+ redirectedPage.getDatum());
+ }
+ }
}
context.write(key, fit.page);
@@ -723,6 +729,7 @@ extends GoraReducer<IntWritable, FetchEn
int threadCount = conf.getInt("fetcher.threads.fetch", 10);
parse = conf.getBoolean(FetcherJob.PARSE_KEY, false);
if (parse) {
+ skipTruncated=conf.getBoolean(ParserJob.SKIP_TRUNCATED, true);
parseUtil = new ParseUtil(conf);
}
LOG.info("Fetcher: threads: " + threadCount);
Modified: nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java?rev=1292184&r1=1292183&r2=1292184&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java (original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java Wed Feb 22 08:41:46 2012
@@ -17,6 +17,7 @@
package org.apache.nutch.parse;
import java.io.IOException;
+import java.nio.ByteBuffer;
import java.util.Collection;
import java.util.HashSet;
import java.util.Map;
@@ -31,6 +32,7 @@ import org.apache.hadoop.util.ToolRunner
import org.apache.nutch.crawl.GeneratorJob;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.crawl.URLWebPage;
+import org.apache.nutch.metadata.HttpHeaders;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.storage.Mark;
import org.apache.nutch.storage.ParseStatus;
@@ -40,6 +42,7 @@ import org.apache.nutch.util.IdentityPag
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
+import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.TableUtil;
import org.apache.nutch.util.ToolUtil;
import org.apache.gora.mapreduce.GoraMapper;
@@ -50,6 +53,8 @@ public class ParserJob extends NutchTool
private static final String RESUME_KEY = "parse.job.resume";
private static final String FORCE_KEY = "parse.job.force";
+
+ public static final String SKIP_TRUNCATED = "parser.skip.truncated";
private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
@@ -64,6 +69,7 @@ public class ParserJob extends NutchTool
FIELDS.add(WebPage.Field.PARSE_STATUS);
FIELDS.add(WebPage.Field.OUTLINKS);
FIELDS.add(WebPage.Field.METADATA);
+ FIELDS.add(WebPage.Field.HEADERS);
}
@@ -76,6 +82,8 @@ public class ParserJob extends NutchTool
private boolean force;
private Utf8 batchId;
+
+ private boolean skipTruncated;
@Override
public void setup(Context context) throws IOException {
@@ -84,6 +92,7 @@ public class ParserJob extends NutchTool
shouldResume = conf.getBoolean(RESUME_KEY, false);
force = conf.getBoolean(FORCE_KEY, false);
batchId = new Utf8(conf.get(GeneratorJob.BATCH_ID, Nutch.ALL_BATCH_ID_STR));
+ skipTruncated=conf.getBoolean(SKIP_TRUNCATED, true);
}
@Override
@@ -106,6 +115,13 @@ public class ParserJob extends NutchTool
LOG.info("Parsing " + unreverseKey);
}
+ if (skipTruncated) {
+ if (isTruncated(unreverseKey, page)) {
+ return;
+ }
+ }
+
+
URLWebPage redirectedPage = parseUtil.process(key, page);
ParseStatus pstatus = page.getParseStatus();
if (pstatus != null) {
@@ -128,6 +144,42 @@ public class ParserJob extends NutchTool
public ParserJob(Configuration conf) {
setConf(conf);
}
+
+ /**
+ * Checks if the page's content is shorter than its Content-Length header.
+ * @param url the page url, used for log messages only
+ * @param page the page whose content and headers are inspected
+ * @return If the page is truncated <code>true</code>. When it is not,
+ * or when it could not be determined, <code>false</code>.
+ */
+ public static boolean isTruncated(String url, WebPage page) {
+ ByteBuffer content = page.getContent();
+ if (content == null) {
+ return false;
+ }
+ Utf8 lengthUtf8 = page.getFromHeaders(new Utf8(HttpHeaders.CONTENT_LENGTH));
+ if (lengthUtf8 == null) {
+ return false;
+ }
+ String lengthStr = lengthUtf8.toString().trim();
+ if (StringUtil.isEmpty(lengthStr)) {
+ return false;
+ }
+ long contentLength; // long: Content-Length may exceed Integer.MAX_VALUE
+ try {
+ contentLength = Long.parseLong(lengthStr);
+ } catch (NumberFormatException e) {
+ LOG.warn("Wrong contentlength format for " + url, e);
+ return false;
+ }
+ if (contentLength > content.limit()) {
+ LOG.info(url + " skipped. Content of size " + contentLength
+ + " was truncated to " + content.limit());
+ return true;
+ }
+ LOG.info(url + " actual=" + content.limit() + " inHeader=" + contentLength);
+ return false;
+ }
public Collection<WebPage.Field> getFields(Job job) {
Configuration conf = job.getConfiguration();