You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2007/06/17 19:19:15 UTC
svn commit: r548076 - in /lucene/nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/fetcher/Fetcher.java
src/java/org/apache/nutch/fetcher/Fetcher2.java
src/java/org/apache/nutch/indexer/Indexer.java
Author: mattmann
Date: Sun Jun 17 10:19:14 2007
New Revision: 548076
URL: http://svn.apache.org/viewvc?view=rev&rev=548076
Log:
- fix for NUTCH-443 (contributed by Dogacan)
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=548076&r1=548075&r2=548076
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sun Jun 17 10:19:14 2007
@@ -32,6 +32,13 @@
11. NUTCH-495 - Unnecessary delays in Fetcher2 (dogacan)
+12. NUTCH-443 - allow parsers to return multiple Parse objects; this will speed
+ up the RSS parser (dogacan via mattmann). This update is a fix and semantics
+ change from the original patch for NUTCH-443. The original patch did not tell
+ the Indexer to read crawl_parse too so that it can pick up sub-urls' fetch
+ datums. This patch addresses that issue. Now, if Fetcher gets a null content,
+ instead of pushing an empty content, it filters out the null content.
+
Release 0.9 - 2007-04-02
1. Changed log4j confiquration to log to stdout on commandline
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diff&rev=548076&r1=548075&r2=548076
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Sun Jun 17 10:19:14 2007
@@ -288,78 +288,75 @@
datum.setFetchTime(System.currentTimeMillis());
if (pstatus != null) datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);
- if (content == null) {
- String url = key.toString();
- content = new Content(url, url, new byte[0], "", new Metadata(), this.conf);
- }
- Metadata metadata = content.getMetadata();
- // add segment to metadata
- metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
- // add score to content metadata so that ParseSegment can pick it up.
- try {
- scfilters.passScoreBeforeParsing(key, datum, content);
- } catch (Exception e) {
- if (LOG.isWarnEnabled()) {
- e.printStackTrace(LogUtil.getWarnStream(LOG));
- LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
- }
- }
-
- /* Note: Fetcher will only follow meta-redirects coming from the
- * original URL. */
ParseResult parseResult = null;
- if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
+ if (content != null) {
+ Metadata metadata = content.getMetadata();
+ // add segment to metadata
+ metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
+ // add score to content metadata so that ParseSegment can pick it up.
try {
- parseResult = this.parseUtil.parse(content);
+ scfilters.passScoreBeforeParsing(key, datum, content);
} catch (Exception e) {
- LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
+ if (LOG.isWarnEnabled()) {
+ e.printStackTrace(LogUtil.getWarnStream(LOG));
+ LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
+ }
}
+ /* Note: Fetcher will only follow meta-redirects coming from the
+ * original URL. */
+ if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
+ try {
+ parseResult = this.parseUtil.parse(content);
+ } catch (Exception e) {
+ LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
+ }
- if (parseResult != null) {
- for (Entry<Text, Parse> entry : parseResult) {
- Text url = entry.getKey();
- Parse parse = entry.getValue();
- ParseStatus parseStatus = parse.getData().getStatus();
-
- if (!parseStatus.isSuccess()) {
- LOG.warn("Error parsing: " + key + ": " + parseStatus);
- parse = parseStatus.getEmptyParse(getConf());
- }
+ if (parseResult != null) {
+ for (Entry<Text, Parse> entry : parseResult) {
+ Text url = entry.getKey();
+ Parse parse = entry.getValue();
+ ParseStatus parseStatus = parse.getData().getStatus();
+
+ if (!parseStatus.isSuccess()) {
+ LOG.warn("Error parsing: " + key + ": " + parseStatus);
+ parse = parseStatus.getEmptyParse(getConf());
+ }
- // Calculate page signature. For non-parsing fetchers this will
- // be done in ParseSegment
- byte[] signature =
- SignatureFactory.getSignature(getConf()).calculate(content, parse);
- // Ensure segment name and score are in parseData metadata
- parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
- segmentName);
- parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY,
- StringUtil.toHexString(signature));
- // Pass fetch time to content meta
- parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
- Long.toString(datum.getFetchTime()));
- if (url.equals(key))
- datum.setSignature(signature);
- try {
- scfilters.passScoreAfterParsing(url, content, parse);
- } catch (Exception e) {
- if (LOG.isWarnEnabled()) {
- e.printStackTrace(LogUtil.getWarnStream(LOG));
- LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
+ // Calculate page signature. For non-parsing fetchers this will
+ // be done in ParseSegment
+ byte[] signature =
+ SignatureFactory.getSignature(getConf()).calculate(content, parse);
+ // Ensure segment name and score are in parseData metadata
+ parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
+ segmentName);
+ parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY,
+ StringUtil.toHexString(signature));
+ // Pass fetch time to content meta
+ parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
+ Long.toString(datum.getFetchTime()));
+ if (url.equals(key))
+ datum.setSignature(signature);
+ try {
+ scfilters.passScoreAfterParsing(url, content, parse);
+ } catch (Exception e) {
+ if (LOG.isWarnEnabled()) {
+ e.printStackTrace(LogUtil.getWarnStream(LOG));
+ LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
+ }
}
}
+ } else {
+ byte[] signature =
+ SignatureFactory.getSignature(getConf()).calculate(content,
+ new ParseStatus().getEmptyParse(conf));
+ datum.setSignature(signature);
}
- } else {
- byte[] signature =
- SignatureFactory.getSignature(getConf()).calculate(content,
- new ParseStatus().getEmptyParse(conf));
- datum.setSignature(signature);
}
}
try {
output.collect(key, new ObjectWritable(datum));
- if (storingContent)
+ if (content != null && storingContent)
output.collect(key, new ObjectWritable(content));
if (parseResult != null) {
for (Entry<Text, Parse> entry : parseResult) {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?view=diff&rev=548076&r1=548075&r2=548076
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Sun Jun 17 10:19:14 2007
@@ -662,72 +662,69 @@
datum.setFetchTime(System.currentTimeMillis());
if (pstatus != null) datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);
- if (content == null) {
- String url = key.toString();
- content = new Content(url, url, new byte[0], "", new Metadata(), this.conf);
- }
- Metadata metadata = content.getMetadata();
- // add segment to metadata
- metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
- // add score to content metadata so that ParseSegment can pick it up.
- try {
- scfilters.passScoreBeforeParsing(key, datum, content);
- } catch (Exception e) {
- if (LOG.isWarnEnabled()) {
- e.printStackTrace(LogUtil.getWarnStream(LOG));
- LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
- }
- }
-
- /* Note: Fetcher will only follow meta-redirects coming from the
- * original URL. */
ParseResult parseResult = null;
- if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
+ if (content != null) {
+ Metadata metadata = content.getMetadata();
+ // add segment to metadata
+ metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
+ // add score to content metadata so that ParseSegment can pick it up.
try {
- parseResult = this.parseUtil.parse(content);
+ scfilters.passScoreBeforeParsing(key, datum, content);
} catch (Exception e) {
- LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
+ if (LOG.isWarnEnabled()) {
+ e.printStackTrace(LogUtil.getWarnStream(LOG));
+ LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
+ }
}
+ /* Note: Fetcher will only follow meta-redirects coming from the
+ * original URL. */
+ if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
+ try {
+ parseResult = this.parseUtil.parse(content);
+ } catch (Exception e) {
+ LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
+ }
- if (parseResult != null) {
- for (Entry<Text, Parse> entry : parseResult) {
- Text url = entry.getKey();
- Parse parse = entry.getValue();
- ParseStatus parseStatus = parse.getData().getStatus();
+ if (parseResult != null) {
+ for (Entry<Text, Parse> entry : parseResult) {
+ Text url = entry.getKey();
+ Parse parse = entry.getValue();
+ ParseStatus parseStatus = parse.getData().getStatus();
- if (!parseStatus.isSuccess()) {
- LOG.warn("Error parsing: " + key + ": " + parseStatus);
- parse = parseStatus.getEmptyParse(getConf());
- }
+ if (!parseStatus.isSuccess()) {
+ LOG.warn("Error parsing: " + key + ": " + parseStatus);
+ parse = parseStatus.getEmptyParse(getConf());
+ }
- // Calculate page signature. For non-parsing fetchers this will
- // be done in ParseSegment
- byte[] signature =
- SignatureFactory.getSignature(getConf()).calculate(content, parse);
- // Ensure segment name and score are in parseData metadata
- parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
- segmentName);
- parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY,
- StringUtil.toHexString(signature));
- // Pass fetch time to content meta
- parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
- Long.toString(datum.getFetchTime()));
- if (url.equals(key))
- datum.setSignature(signature);
- try {
- scfilters.passScoreAfterParsing(url, content, parse);
- } catch (Exception e) {
- if (LOG.isWarnEnabled()) {
- e.printStackTrace(LogUtil.getWarnStream(LOG));
- LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
+ // Calculate page signature. For non-parsing fetchers this will
+ // be done in ParseSegment
+ byte[] signature =
+ SignatureFactory.getSignature(getConf()).calculate(content, parse);
+ // Ensure segment name and score are in parseData metadata
+ parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
+ segmentName);
+ parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY,
+ StringUtil.toHexString(signature));
+ // Pass fetch time to content meta
+ parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
+ Long.toString(datum.getFetchTime()));
+ if (url.equals(key))
+ datum.setSignature(signature);
+ try {
+ scfilters.passScoreAfterParsing(url, content, parse);
+ } catch (Exception e) {
+ if (LOG.isWarnEnabled()) {
+ e.printStackTrace(LogUtil.getWarnStream(LOG));
+ LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
+ }
}
}
+ } else {
+ byte[] signature =
+ SignatureFactory.getSignature(getConf()).calculate(content,
+ new ParseStatus().getEmptyParse(conf));
+ datum.setSignature(signature);
}
- } else {
- byte[] signature =
- SignatureFactory.getSignature(getConf()).calculate(content,
- new ParseStatus().getEmptyParse(conf));
- datum.setSignature(signature);
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?view=diff&rev=548076&r1=548075&r2=548076
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Sun Jun 17 10:19:14 2007
@@ -153,7 +153,6 @@
Inlinks inlinks = null;
CrawlDatum dbDatum = null;
CrawlDatum fetchDatum = null;
- CrawlDatum redir = null;
ParseData parseData = null;
ParseText parseText = null;
while (values.hasNext()) {
@@ -168,11 +167,12 @@
// don't index unmodified (empty) pages
if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED)
fetchDatum = datum;
- } else if (CrawlDatum.STATUS_LINKED == datum.getStatus())
- // redirected page
- redir = datum;
- else
+ } else if (CrawlDatum.STATUS_LINKED == datum.getStatus() ||
+ CrawlDatum.STATUS_SIGNATURE == datum.getStatus()) {
+ continue;
+ } else {
throw new RuntimeException("Unexpected status: "+datum.getStatus());
+ }
} else if (value instanceof ParseData) {
parseData = (ParseData)value;
} else if (value instanceof ParseText) {
@@ -181,11 +181,6 @@
LOG.warn("Unrecognized type: "+value.getClass());
}
}
- if (redir != null) {
- // XXX page was redirected - what should we do?
- // XXX discard it for now
- return;
- }
if (fetchDatum == null || dbDatum == null
|| parseText == null || parseData == null) {
@@ -260,6 +255,7 @@
LOG.info("Indexer: adding segment: " + segments[i]);
}
job.addInputPath(new Path(segments[i], CrawlDatum.FETCH_DIR_NAME));
+ job.addInputPath(new Path(segments[i], CrawlDatum.PARSE_DIR_NAME));
job.addInputPath(new Path(segments[i], ParseData.DIR_NAME));
job.addInputPath(new Path(segments[i], ParseText.DIR_NAME));
}