You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by fe...@apache.org on 2012/02/24 13:44:52 UTC
svn commit: r1293228 - in /nutch/branches/nutchgora: CHANGES.txt
conf/nutch-default.xml src/java/org/apache/nutch/fetcher/FetcherReducer.java
src/java/org/apache/nutch/parse/ParserJob.java
Author: ferdy
Date: Fri Feb 24 12:44:52 2012
New Revision: 1293228
URL: http://svn.apache.org/viewvc?rev=1293228&view=rev
Log:
REVERT NUTCH-965 Skip parsing for truncated document
Modified:
nutch/branches/nutchgora/CHANGES.txt
nutch/branches/nutchgora/conf/nutch-default.xml
nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherReducer.java
nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java
Modified: nutch/branches/nutchgora/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/CHANGES.txt?rev=1293228&r1=1293227&r2=1293228&view=diff
==============================================================================
--- nutch/branches/nutchgora/CHANGES.txt (original)
+++ nutch/branches/nutchgora/CHANGES.txt Fri Feb 24 12:44:52 2012
@@ -2,8 +2,6 @@ Nutch Change Log
Release nutchgora - Current Development
-* NUTCH-965 Skip parsing for truncated documents (alexis, lewismc, ferdy)
-
* NUTCH-1287 Upgrade to hsqldb 2.2.8 (ferdy)
* NUTCH-1280 language-identifier should have option to use detected value by Tika even when uncertain (ferdy)
Modified: nutch/branches/nutchgora/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/conf/nutch-default.xml?rev=1293228&r1=1293227&r2=1293228&view=diff
==============================================================================
--- nutch/branches/nutchgora/conf/nutch-default.xml (original)
+++ nutch/branches/nutchgora/conf/nutch-default.xml Fri Feb 24 12:44:52 2012
@@ -909,14 +909,6 @@
</description>
</property>
-<property>
- <name>parser.skip.truncated</name>
- <value>true</value>
- <description>Boolean value for whether we should skip parsing for truncated documents. By default this
- property is activated due to extremely high levels of CPU which parsing can sometimes take.
- </description>
-</property>
-
<!-- urlfilter plugin properties -->
<property>
Modified: nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherReducer.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherReducer.java?rev=1293228&r1=1293227&r2=1293228&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherReducer.java (original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/fetcher/FetcherReducer.java Fri Feb 24 12:44:52 2012
@@ -43,7 +43,6 @@ import org.apache.nutch.net.URLFilterExc
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.ParserJob;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
@@ -82,7 +81,6 @@ extends GoraReducer<IntWritable, FetchEn
private boolean parse;
private ParseUtil parseUtil;
- private boolean skipTruncated;
/**
* This class described the item to be fetched.
@@ -606,12 +604,10 @@ extends GoraReducer<IntWritable, FetchEn
String key = TableUtil.reverseUrl(fit.url);
if (parse) {
- if (!skipTruncated || (skipTruncated && !ParserJob.isTruncated(fit.url, fit.page))) {
- URLWebPage redirectedPage = parseUtil.process(key, fit.page);
- if (redirectedPage != null) {
- context.write(TableUtil.reverseUrl(redirectedPage.getUrl()),
- redirectedPage.getDatum());
- }
+ URLWebPage redirectedPage = parseUtil.process(key, fit.page);
+ if (redirectedPage != null) {
+ context.write(TableUtil.reverseUrl(redirectedPage.getUrl()),
+ redirectedPage.getDatum());
}
}
context.write(key, fit.page);
@@ -727,7 +723,6 @@ extends GoraReducer<IntWritable, FetchEn
int threadCount = conf.getInt("fetcher.threads.fetch", 10);
parse = conf.getBoolean(FetcherJob.PARSE_KEY, false);
if (parse) {
- skipTruncated=conf.getBoolean(ParserJob.SKIP_TRUNCATED, true);
parseUtil = new ParseUtil(conf);
}
LOG.info("Fetcher: threads: " + threadCount);
Modified: nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java?rev=1293228&r1=1293227&r2=1293228&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java (original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserJob.java Fri Feb 24 12:44:52 2012
@@ -17,7 +17,6 @@
package org.apache.nutch.parse;
import java.io.IOException;
-import java.nio.ByteBuffer;
import java.util.Collection;
import java.util.HashSet;
import java.util.Map;
@@ -32,7 +31,6 @@ import org.apache.hadoop.util.ToolRunner
import org.apache.nutch.crawl.GeneratorJob;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.crawl.URLWebPage;
-import org.apache.nutch.metadata.HttpHeaders;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.storage.Mark;
import org.apache.nutch.storage.ParseStatus;
@@ -42,7 +40,6 @@ import org.apache.nutch.util.IdentityPag
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
-import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.TableUtil;
import org.apache.nutch.util.ToolUtil;
import org.apache.gora.mapreduce.GoraMapper;
@@ -53,8 +50,6 @@ public class ParserJob extends NutchTool
private static final String RESUME_KEY = "parse.job.resume";
private static final String FORCE_KEY = "parse.job.force";
-
- public static final String SKIP_TRUNCATED = "parser.skip.truncated";
private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
@@ -69,7 +64,6 @@ public class ParserJob extends NutchTool
FIELDS.add(WebPage.Field.PARSE_STATUS);
FIELDS.add(WebPage.Field.OUTLINKS);
FIELDS.add(WebPage.Field.METADATA);
- FIELDS.add(WebPage.Field.HEADERS);
}
@@ -82,8 +76,6 @@ public class ParserJob extends NutchTool
private boolean force;
private Utf8 batchId;
-
- private boolean skipTruncated;
@Override
public void setup(Context context) throws IOException {
@@ -92,7 +84,6 @@ public class ParserJob extends NutchTool
shouldResume = conf.getBoolean(RESUME_KEY, false);
force = conf.getBoolean(FORCE_KEY, false);
batchId = new Utf8(conf.get(GeneratorJob.BATCH_ID, Nutch.ALL_BATCH_ID_STR));
- skipTruncated=conf.getBoolean(SKIP_TRUNCATED, true);
}
@Override
@@ -115,11 +106,6 @@ public class ParserJob extends NutchTool
LOG.info("Parsing " + unreverseKey);
}
- if (skipTruncated && isTruncated(unreverseKey, page)) {
- return;
- }
-
-
URLWebPage redirectedPage = parseUtil.process(key, page);
ParseStatus pstatus = page.getParseStatus();
if (pstatus != null) {
@@ -142,45 +128,6 @@ public class ParserJob extends NutchTool
public ParserJob(Configuration conf) {
setConf(conf);
}
-
- /**
- * Checks if the page's content is truncated.
- * @param url
- * @param page
- * @return If the page is truncated <code>true</code>. When it is not,
- * or when it could be determined, <code>false</code>.
- */
- public static boolean isTruncated(String url, WebPage page) {
- ByteBuffer content = page.getContent();
- if (content == null) {
- return false;
- }
- Utf8 lengthUtf8 = page.getFromHeaders(new Utf8(HttpHeaders.CONTENT_LENGTH));
- if (lengthUtf8 == null) {
- return false;
- }
- String lengthStr = lengthUtf8.toString().trim();
- if (StringUtil.isEmpty(lengthStr)) {
- return false;
- }
- int inHeaderSize;
- try {
- inHeaderSize = Integer.parseInt(lengthStr);
- } catch (NumberFormatException e) {
- LOG.warn("Wrong contentlength format for " + url, e);
- return false;
- }
- int actualSize = content.limit();
- if (inHeaderSize > actualSize) {
- LOG.warn(url + " skipped. Content of size " + inHeaderSize
- + " was truncated to " + actualSize);
- return true;
- }
- if (LOG.isDebugEnabled()) {
- LOG.debug(url + " actualSize=" + actualSize + " inHeaderSize=" + inHeaderSize);
- }
- return false;
- }
public Collection<WebPage.Field> getFields(Job job) {
Configuration conf = job.getConfiguration();