You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by do...@apache.org on 2007/06/24 12:04:33 UTC
svn commit: r550196 - in /lucene/nutch/trunk: ./
src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/
src/test/ src/test/org/apache/nutch/fetcher/
src/testresources/fetch-test-site/
Author: dogacan
Date: Sun Jun 24 03:04:30 2007
New Revision: 550196
URL: http://svn.apache.org/viewvc?view=rev&rev=550196
Log:
NUTCH-504 - Parsing during fetching is broken.
Added:
lucene/nutch/trunk/src/testresources/fetch-test-site/exception.html
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
lucene/nutch/trunk/src/test/crawl-tests.xml
lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=550196&r1=550195&r2=550196
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sun Jun 24 03:04:30 2007
@@ -62,6 +62,8 @@
19. NUTCH-468 - Scoring filter should distribute score to all outlinks at
once. (dogacan)
+20. NUTCH-504 - NUTCH-443 broke parsing during fetching. (dogacan)
+
Release 0.9 - 2007-04-02
1. Changed log4j confiquration to log to stdout on commandline
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diff&rev=550196&r1=550195&r2=550196
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Sun Jun 24 03:04:30 2007
@@ -311,41 +311,7 @@
LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
}
- if (parseResult != null) {
- for (Entry<Text, Parse> entry : parseResult) {
- Text url = entry.getKey();
- Parse parse = entry.getValue();
- ParseStatus parseStatus = parse.getData().getStatus();
-
- if (!parseStatus.isSuccess()) {
- LOG.warn("Error parsing: " + key + ": " + parseStatus);
- parse = parseStatus.getEmptyParse(getConf());
- }
-
- // Calculate page signature. For non-parsing fetchers this will
- // be done in ParseSegment
- byte[] signature =
- SignatureFactory.getSignature(getConf()).calculate(content, parse);
- // Ensure segment name and score are in parseData metadata
- parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
- segmentName);
- parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY,
- StringUtil.toHexString(signature));
- // Pass fetch time to content meta
- parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
- Long.toString(datum.getFetchTime()));
- if (url.equals(key))
- datum.setSignature(signature);
- try {
- scfilters.passScoreAfterParsing(url, content, parse);
- } catch (Exception e) {
- if (LOG.isWarnEnabled()) {
- e.printStackTrace(LogUtil.getWarnStream(LOG));
- LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
- }
- }
- }
- } else {
+ if (parseResult == null) {
byte[] signature =
SignatureFactory.getSignature(getConf()).calculate(content,
new ParseStatus().getEmptyParse(conf));
@@ -360,8 +326,40 @@
output.collect(key, new ObjectWritable(content));
if (parseResult != null) {
for (Entry<Text, Parse> entry : parseResult) {
- output.collect(entry.getKey(),
- new ObjectWritable(new ParseImpl(entry.getValue())));
+ Text url = entry.getKey();
+ Parse parse = entry.getValue();
+ ParseStatus parseStatus = parse.getData().getStatus();
+
+ if (!parseStatus.isSuccess()) {
+ LOG.warn("Error parsing: " + key + ": " + parseStatus);
+ parse = parseStatus.getEmptyParse(getConf());
+ }
+
+ // Calculate page signature. For non-parsing fetchers this will
+ // be done in ParseSegment
+ byte[] signature =
+ SignatureFactory.getSignature(getConf()).calculate(content, parse);
+ // Ensure segment name and score are in parseData metadata
+ parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
+ segmentName);
+ parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY,
+ StringUtil.toHexString(signature));
+ // Pass fetch time to content meta
+ parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
+ Long.toString(datum.getFetchTime()));
+ if (url.equals(key))
+ datum.setSignature(signature);
+ try {
+ scfilters.passScoreAfterParsing(url, content, parse);
+ } catch (Exception e) {
+ if (LOG.isWarnEnabled()) {
+ e.printStackTrace(LogUtil.getWarnStream(LOG));
+ LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
+ }
+ }
+ output.collect(url, new ObjectWritable(
+ new ParseImpl(new ParseText(parse.getText()),
+ parse.getData(), parse.isCanonical())));
}
}
} catch (IOException e) {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?view=diff&rev=550196&r1=550195&r2=550196
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Sun Jun 24 03:04:30 2007
@@ -685,41 +685,7 @@
LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
}
- if (parseResult != null) {
- for (Entry<Text, Parse> entry : parseResult) {
- Text url = entry.getKey();
- Parse parse = entry.getValue();
- ParseStatus parseStatus = parse.getData().getStatus();
-
- if (!parseStatus.isSuccess()) {
- LOG.warn("Error parsing: " + key + ": " + parseStatus);
- parse = parseStatus.getEmptyParse(getConf());
- }
-
- // Calculate page signature. For non-parsing fetchers this will
- // be done in ParseSegment
- byte[] signature =
- SignatureFactory.getSignature(getConf()).calculate(content, parse);
- // Ensure segment name and score are in parseData metadata
- parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
- segmentName);
- parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY,
- StringUtil.toHexString(signature));
- // Pass fetch time to content meta
- parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
- Long.toString(datum.getFetchTime()));
- if (url.equals(key))
- datum.setSignature(signature);
- try {
- scfilters.passScoreAfterParsing(url, content, parse);
- } catch (Exception e) {
- if (LOG.isWarnEnabled()) {
- e.printStackTrace(LogUtil.getWarnStream(LOG));
- LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
- }
- }
- }
- } else {
+ if (parseResult == null) {
byte[] signature =
SignatureFactory.getSignature(getConf()).calculate(content,
new ParseStatus().getEmptyParse(conf));
@@ -730,12 +696,44 @@
try {
output.collect(key, new ObjectWritable(datum));
- if (storingContent)
+ if (content != null && storingContent)
output.collect(key, new ObjectWritable(content));
if (parseResult != null) {
for (Entry<Text, Parse> entry : parseResult) {
- output.collect(entry.getKey(),
- new ObjectWritable(new ParseImpl(entry.getValue())));
+ Text url = entry.getKey();
+ Parse parse = entry.getValue();
+ ParseStatus parseStatus = parse.getData().getStatus();
+
+ if (!parseStatus.isSuccess()) {
+ LOG.warn("Error parsing: " + key + ": " + parseStatus);
+ parse = parseStatus.getEmptyParse(getConf());
+ }
+
+ // Calculate page signature. For non-parsing fetchers this will
+ // be done in ParseSegment
+ byte[] signature =
+ SignatureFactory.getSignature(getConf()).calculate(content, parse);
+ // Ensure segment name and score are in parseData metadata
+ parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
+ segmentName);
+ parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY,
+ StringUtil.toHexString(signature));
+ // Pass fetch time to content meta
+ parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
+ Long.toString(datum.getFetchTime()));
+ if (url.equals(key))
+ datum.setSignature(signature);
+ try {
+ scfilters.passScoreAfterParsing(url, content, parse);
+ } catch (Exception e) {
+ if (LOG.isWarnEnabled()) {
+ e.printStackTrace(LogUtil.getWarnStream(LOG));
+ LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
+ }
+ }
+ output.collect(url, new ObjectWritable(
+ new ParseImpl(new ParseText(parse.getText()),
+ parse.getData(), parse.isCanonical())));
}
}
} catch (IOException e) {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?view=diff&rev=550196&r1=550195&r2=550196
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Sun Jun 24 03:04:30 2007
@@ -186,6 +186,10 @@
|| parseText == null || parseData == null) {
return; // only have inlinks
}
+
+ if (!parseData.getStatus().isSuccess()) {
+ return;
+ }
Document doc = new Document();
Metadata metadata = parseData.getContentMeta();
Modified: lucene/nutch/trunk/src/test/crawl-tests.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/crawl-tests.xml?view=diff&rev=550196&r1=550195&r2=550196
==============================================================================
--- lucene/nutch/trunk/src/test/crawl-tests.xml (original)
+++ lucene/nutch/trunk/src/test/crawl-tests.xml Sun Jun 24 03:04:30 2007
@@ -28,4 +28,10 @@
<value>test-nutch</value>
</property>
-</configuration>
\ No newline at end of file
+<property>
+ <name>http.robots.agents</name>
+ <value>test-nutch,*</value>
+</property>
+
+</configuration>
+
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java?view=diff&rev=550196&r1=550195&r2=550196
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java Sun Jun 24 03:04:30 2007
@@ -28,6 +28,9 @@
import org.apache.nutch.crawl.CrawlDBTestUtil;
import org.apache.nutch.crawl.Generator;
import org.apache.nutch.crawl.Injector;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.ParseData;
import org.apache.nutch.protocol.Content;
import org.mortbay.jetty.Server;
@@ -78,6 +81,7 @@
addUrl(urls,"pagea.html");
addUrl(urls,"pageb.html");
addUrl(urls,"dup_of_pagea.html");
+ addUrl(urls,"exception.html");
CrawlDBTestUtil.generateSeedList(fs, urlPath, urls);
@@ -102,17 +106,17 @@
int minimumTime=(int) ((urls.size()+1)*1000*conf.getFloat("fetcher.server.delay",5));
assertTrue(time > minimumTime);
- //verify results
+ //verify content
Path content=new Path(new Path(generatedSegment, Content.DIR_NAME),"part-00000/data");
SequenceFile.Reader reader=new SequenceFile.Reader(fs, content, conf);
ArrayList<String> handledurls=new ArrayList<String>();
- READ:
+ READ_CONTENT:
do {
Text key=new Text();
Content value=new Content();
- if(!reader.next(key, value)) break READ;
+ if(!reader.next(key, value)) break READ_CONTENT;
String contentString=new String(value.getContent());
if(contentString.indexOf("Nutch fetcher test page")!=-1) {
handledurls.add(key.toString());
@@ -130,7 +134,33 @@
//verify that correct pages were handled
assertTrue(handledurls.containsAll(urls));
assertTrue(urls.containsAll(handledurls));
+
+ handledurls.clear();
+ //verify parse data
+ Path parseData = new Path(new Path(generatedSegment, ParseData.DIR_NAME),"part-00000/data");
+ reader = new SequenceFile.Reader(fs, parseData, conf);
+
+ READ_PARSE_DATA:
+ do {
+ Text key = new Text();
+ ParseData value = new ParseData();
+ if(!reader.next(key, value)) break READ_PARSE_DATA;
+ // make sure they all contain "nutch.segment.name" and "nutch.content.digest"
+ // keys in parse metadata
+ Metadata contentMeta = value.getContentMeta();
+ if (contentMeta.get(Nutch.SEGMENT_NAME_KEY) != null
+ && contentMeta.get(Nutch.SIGNATURE_KEY) != null) {
+ handledurls.add(key.toString());
+ }
+ } while(true);
+
+ Collections.sort(handledurls);
+
+ assertEquals(urls.size(), handledurls.size());
+
+ assertTrue(handledurls.containsAll(urls));
+ assertTrue(urls.containsAll(handledurls));
}
private void addUrl(ArrayList<String> urls, String page) {
Added: lucene/nutch/trunk/src/testresources/fetch-test-site/exception.html
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/testresources/fetch-test-site/exception.html?view=auto&rev=550196
==============================================================================
--- lucene/nutch/trunk/src/testresources/fetch-test-site/exception.html (added)
+++ lucene/nutch/trunk/src/testresources/fetch-test-site/exception.html Sun Jun 24 03:04:30 2007
@@ -0,0 +1,13 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
+<HTML>
+<HEAD>
+<TITLE>Exception</TITLE>
+<META http-equiv="Content-Type" content="text/html; charset=unicode">
+</HEAD>
+<BODY>
+!!Trying to parse this one will fail with a MalformedInputException!!
+
+Nutch fetcher test page.
+</BODY>
+</HTML>
+