You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2007/05/09 20:01:00 UTC
svn commit: r536606 - in /lucene/nutch/trunk: ./
src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/metadata/
src/java/org/apache/nutch/parse/ src/java/org/apache/nutch/util/
src/plugin/creativecommons/src/test/org/creativecommons/nutch/ src/...
Author: ab
Date: Wed May 9 11:00:56 2007
New Revision: 536606
URL: http://svn.apache.org/viewvc?view=rev&rev=536606
Log:
NUTCH-443 - Allow parsers to return multiple Parse objects.
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/Parse.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/Parser.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java
lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java
lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java
lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
lucene/nutch/trunk/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java
lucene/nutch/trunk/src/plugin/parse-oo/src/test/org/apache/nutch/parse/oo/TestOOParser.java
lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java
lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java
lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java
lucene/nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java
lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed May 9 11:00:56 2007
@@ -4,6 +4,9 @@
1. NUTCH-474 - Fetcher2 crawlDelay and blocking fix (Dogacan Guney via ab)
+ 2. NUTCH-443 - Allow parsers to return multiple Parse objects.
+ (Dogacan Guney et al, via ab)
+
Release 0.9 - 2007-04-02
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Wed May 9 11:00:56 2007
@@ -18,6 +18,7 @@
package org.apache.nutch.fetcher;
import java.io.IOException;
+import java.util.Map.Entry;
// Commons Logging imports
import org.apache.commons.logging.Log;
@@ -301,54 +302,83 @@
}
}
- Parse parse = null;
+ /* Note: Fetcher will only follow meta-redirects coming from the
+ * original URL. */
+ ParseResult parseResult = null;
if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
- ParseStatus parseStatus;
try {
- parse = this.parseUtil.parse(content);
- parseStatus = parse.getData().getStatus();
+ parseResult = this.parseUtil.parse(content);
} catch (Exception e) {
- parseStatus = new ParseStatus(e);
+ LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
}
- if (!parseStatus.isSuccess()) {
- if (LOG.isWarnEnabled()) {
- LOG.warn("Error parsing: " + key + ": " + parseStatus);
- }
- parse = parseStatus.getEmptyParse(getConf());
- }
- // Calculate page signature. For non-parsing fetchers this will
- // be done in ParseSegment
- byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse);
- metadata.set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
- datum.setSignature(signature);
- // Ensure segment name and score are in parseData metadata
- parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
- parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
- try {
- scfilters.passScoreAfterParsing(key, content, parse);
- } catch (Exception e) {
- if (LOG.isWarnEnabled()) {
- e.printStackTrace(LogUtil.getWarnStream(LOG));
- LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
+
+ if (parseResult != null) {
+ for (Entry<Text, Parse> entry : parseResult) {
+ Text url = entry.getKey();
+ Parse parse = entry.getValue();
+ ParseStatus parseStatus = parse.getData().getStatus();
+
+ if (!parseStatus.isSuccess()) {
+ LOG.warn("Error parsing: " + key + ": " + parseStatus);
+ parse = parseStatus.getEmptyParse(getConf());
+ }
+
+ // Calculate page signature. For non-parsing fetchers this will
+ // be done in ParseSegment
+ byte[] signature =
+ SignatureFactory.getSignature(getConf()).calculate(content, parse);
+ // Ensure segment name and score are in parseData metadata
+ parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
+ segmentName);
+ parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY,
+ StringUtil.toHexString(signature));
+ // Pass fetch time to content meta
+ parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
+ Long.toString(datum.getFetchTime()));
+ if (url.equals(key))
+ datum.setSignature(signature);
+ try {
+ scfilters.passScoreAfterParsing(url, content, parse);
+ } catch (Exception e) {
+ if (LOG.isWarnEnabled()) {
+ e.printStackTrace(LogUtil.getWarnStream(LOG));
+ LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
+ }
+ }
}
+ } else {
+ byte[] signature =
+ SignatureFactory.getSignature(getConf()).calculate(content,
+ new ParseStatus().getEmptyParse(conf));
+ datum.setSignature(signature);
}
-
}
try {
- output.collect
- (key,
- new FetcherOutput(datum,
- storingContent ? content : null,
- parse != null ? new ParseImpl(parse) : null));
+ output.collect(key, new ObjectWritable(datum));
+ if (storingContent)
+ output.collect(key, new ObjectWritable(content));
+ if (parseResult != null) {
+ for (Entry<Text, Parse> entry : parseResult) {
+ output.collect(entry.getKey(),
+ new ObjectWritable(new ParseImpl(entry.getValue())));
+ }
+ }
} catch (IOException e) {
if (LOG.isFatalEnabled()) {
e.printStackTrace(LogUtil.getFatalStream(LOG));
LOG.fatal("fetcher caught:"+e.toString());
}
}
- if (parse != null) return parse.getData().getStatus();
- else return null;
+
+ // return parse status if it exists
+ if (parseResult != null && !parseResult.isEmpty()) {
+ Parse p = parseResult.get(content.getUrl());
+ if (p != null) {
+ return p.getData().getStatus();
+ }
+ }
+ return null;
}
}
@@ -465,7 +495,7 @@
job.setOutputPath(segment);
job.setOutputFormat(FetcherOutputFormat.class);
job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(FetcherOutput.class);
+ job.setOutputValueClass(ObjectWritable.class);
JobClient.runJob(job);
if (LOG.isInfoEnabled()) { LOG.info("Fetcher: done"); }
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Wed May 9 11:00:56 2007
@@ -21,6 +21,7 @@
import java.net.URL;
import java.net.UnknownHostException;
import java.util.*;
+import java.util.Map.Entry;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
@@ -32,6 +33,7 @@
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.SignatureFactory;
@@ -662,54 +664,83 @@
}
}
- Parse parse = null;
+ /* Note: Fetcher will only follow meta-redirects coming from the
+ * original URL. */
+ ParseResult parseResult = null;
if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
- ParseStatus parseStatus;
try {
- parse = this.parseUtil.parse(content);
- parseStatus = parse.getData().getStatus();
+ parseResult = this.parseUtil.parse(content);
} catch (Exception e) {
- parseStatus = new ParseStatus(e);
+ LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
}
- if (!parseStatus.isSuccess()) {
- if (LOG.isWarnEnabled()) {
- LOG.warn("Error parsing: " + key + ": " + parseStatus);
- }
- parse = parseStatus.getEmptyParse(getConf());
- }
- // Calculate page signature. For non-parsing fetchers this will
- // be done in ParseSegment
- byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse);
- metadata.set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
- datum.setSignature(signature);
- // Ensure segment name and score are in parseData metadata
- parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
- parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
- try {
- scfilters.passScoreAfterParsing(key, content, parse);
- } catch (Exception e) {
- if (LOG.isWarnEnabled()) {
- e.printStackTrace(LogUtil.getWarnStream(LOG));
- LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
+
+ if (parseResult != null) {
+ for (Entry<Text, Parse> entry : parseResult) {
+ Text url = entry.getKey();
+ Parse parse = entry.getValue();
+ ParseStatus parseStatus = parse.getData().getStatus();
+
+ if (!parseStatus.isSuccess()) {
+ LOG.warn("Error parsing: " + key + ": " + parseStatus);
+ parse = parseStatus.getEmptyParse(getConf());
+ }
+
+ // Calculate page signature. For non-parsing fetchers this will
+ // be done in ParseSegment
+ byte[] signature =
+ SignatureFactory.getSignature(getConf()).calculate(content, parse);
+ // Ensure segment name and score are in parseData metadata
+ parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
+ segmentName);
+ parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY,
+ StringUtil.toHexString(signature));
+ // Pass fetch time to content meta
+ parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
+ Long.toString(datum.getFetchTime()));
+ if (url.equals(key))
+ datum.setSignature(signature);
+ try {
+ scfilters.passScoreAfterParsing(url, content, parse);
+ } catch (Exception e) {
+ if (LOG.isWarnEnabled()) {
+ e.printStackTrace(LogUtil.getWarnStream(LOG));
+ LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
+ }
+ }
}
+ } else {
+ byte[] signature =
+ SignatureFactory.getSignature(getConf()).calculate(content,
+ new ParseStatus().getEmptyParse(conf));
+ datum.setSignature(signature);
}
-
}
try {
- output.collect
- (key,
- new FetcherOutput(datum,
- storingContent ? content : null,
- parse != null ? new ParseImpl(parse) : null));
+ output.collect(key, new ObjectWritable(datum));
+ if (storingContent)
+ output.collect(key, new ObjectWritable(content));
+ if (parseResult != null) {
+ for (Entry<Text, Parse> entry : parseResult) {
+ output.collect(entry.getKey(),
+ new ObjectWritable(new ParseImpl(entry.getValue())));
+ }
+ }
} catch (IOException e) {
if (LOG.isFatalEnabled()) {
e.printStackTrace(LogUtil.getFatalStream(LOG));
LOG.fatal("fetcher caught:"+e.toString());
}
}
- if (parse != null) return parse.getData().getStatus();
- else return null;
+
+ // return parse status if it exists
+ if (parseResult != null && !parseResult.isEmpty()) {
+ Parse p = parseResult.get(content.getUrl());
+ if (p != null) {
+ return p.getData().getStatus();
+ }
+ }
+ return null;
}
}
@@ -832,7 +863,7 @@
job.setOutputPath(segment);
job.setOutputFormat(FetcherOutputFormat.class);
job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(FetcherOutput.class);
+ job.setOutputValueClass(ObjectWritable.class);
JobClient.runJob(job);
if (LOG.isInfoEnabled()) { LOG.info("Fetcher: done"); }
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java Wed May 9 11:00:56 2007
@@ -24,6 +24,7 @@
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.Text;
@@ -34,6 +35,7 @@
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Progressable;
+import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseOutputFormat;
import org.apache.nutch.protocol.Content;
@@ -76,18 +78,14 @@
public void write(WritableComparable key, Writable value)
throws IOException {
- FetcherOutput fo = (FetcherOutput)value;
+ Writable w = (Writable)((ObjectWritable)value).get();
- fetchOut.append(key, fo.getCrawlDatum());
-
- if (fo.getContent() != null) {
- contentOut.append(key, fo.getContent());
- }
-
- if (fo.getParse() != null) {
- parseOut.write(key, fo.getParse());
- }
-
+ if (w instanceof CrawlDatum)
+ fetchOut.append(key, w);
+ else if (w instanceof Content)
+ contentOut.append(key, w);
+ else if (w instanceof Parse)
+ parseOut.write(key, w);
}
public void close(Reporter reporter) throws IOException {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java Wed May 9 11:00:56 2007
@@ -46,6 +46,8 @@
public static final String PROTO_STATUS_KEY = "_pst_";
public static final Text WRITABLE_PROTO_STATUS_KEY = new Text(PROTO_STATUS_KEY);
+
+ public static final String FETCH_TIME_KEY = "_ftk_";
/** Sites may request that search engines don't provide access to cached documents. */
public static final String CACHING_FORBIDDEN_KEY = "caching.forbidden";
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java Wed May 9 11:00:56 2007
@@ -22,6 +22,7 @@
import org.apache.nutch.protocol.Content;
import org.apache.nutch.plugin.*;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
import org.w3c.dom.DocumentFragment;
@@ -56,13 +57,20 @@
}
/** Run all defined filters. */
- public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) {
+ public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
- for (int i = 0 ; i < this.htmlParseFilters.length; i++) {
- parse = this.htmlParseFilters[i].filter(content, parse, metaTags, doc);
- if (!parse.getData().getStatus().isSuccess()) break;
+ ParseResult filteredParseResult = new ParseResult(content.getUrl());
+
+ for (java.util.Map.Entry<Text, Parse> entry : parseResult) {
+ Parse parse = entry.getValue();
+ for (int i = 0 ; i < this.htmlParseFilters.length; i++) {
+ parse = this.htmlParseFilters[i].filter(content, parse, metaTags, doc);
+ if (!parse.getData().getStatus().isSuccess()) break;
+ }
+ filteredParseResult.put(entry.getKey(),
+ new ParseText(parse.getText()), parse.getData());
}
- return parse;
+ return filteredParseResult;
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/Parse.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/Parse.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/Parse.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/Parse.java Wed May 9 11:00:56 2007
@@ -28,4 +28,7 @@
/** Other data extracted from the page. */
ParseData getData();
+
+ /** Indicates if the parse is coming from a url or a sub-url */
+ boolean isCanonical();
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java Wed May 9 11:00:56 2007
@@ -29,33 +29,43 @@
public class ParseImpl implements Parse, Writable, Configurable {
private ParseText text;
private ParseData data;
+ private boolean isCanonical;
private Configuration conf;
public ParseImpl() {}
public ParseImpl(Parse parse) {
- this(parse.getText(), parse.getData());
+ this(new ParseText(parse.getText()), parse.getData(), true);
}
public ParseImpl(String text, ParseData data) {
- this(new ParseText(text), data);
+ this(new ParseText(text), data, true);
}
-
+
public ParseImpl(ParseText text, ParseData data) {
+ this(text, data, true);
+ }
+
+ public ParseImpl(ParseText text, ParseData data, boolean isCanonical) {
this.text = text;
this.data = data;
+ this.isCanonical = isCanonical;
}
public String getText() { return text.getText(); }
public ParseData getData() { return data; }
+
+ public boolean isCanonical() { return isCanonical; }
public final void write(DataOutput out) throws IOException {
+ out.writeBoolean(isCanonical);
text.write(out);
data.write(out);
}
public void readFields(DataInput in) throws IOException {
+ isCanonical = in.readBoolean();
text = new ParseText();
text.readFields(in);
@@ -79,5 +89,6 @@
public Configuration getConf() {
return this.conf;
}
+
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Wed May 9 11:00:56 2007
@@ -159,6 +159,18 @@
if (adjust != null) crawlOut.append(key, adjust);
}
dataOut.append(key, parseData);
+ if (!parse.isCanonical()) {
+ CrawlDatum datum = new CrawlDatum();
+ datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
+ String timeString = parse.getData().getContentMeta().get(Nutch.FETCH_TIME_KEY);
+ try {
+ datum.setFetchTime(Long.parseLong(timeString));
+ } catch (Exception e) {
+ LOG.warn("Can't read fetch time for: " + key);
+ datum.setFetchTime(System.currentTimeMillis());
+ }
+ crawlOut.append(key, datum);
+ }
}
public void close(Reporter reporter) throws IOException {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Wed May 9 11:00:56 2007
@@ -23,6 +23,7 @@
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.conf.*;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.protocol.*;
@@ -33,6 +34,7 @@
import java.io.*;
import java.util.*;
+import java.util.Map.Entry;
/* Parse content in a segment. */
public class ParseSegment extends Configured implements Mapper, Reducer {
@@ -69,35 +71,44 @@
Content content = (Content) value;
content.forceInflate();
- Parse parse = null;
- ParseStatus status;
+ ParseResult parseResult = null;
try {
- parse = new ParseUtil(getConf()).parse(content);
- status = parse.getData().getStatus();
+ parseResult = new ParseUtil(getConf()).parse(content);
} catch (Exception e) {
- status = new ParseStatus(e);
+ LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
+ return;
}
- // compute the new signature
- byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse);
- if (parse != null) {
- parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
- parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, getConf().get(Nutch.SEGMENT_NAME_KEY));
- }
-
- if (status.isSuccess()) {
+ for (Entry<Text, Parse> entry : parseResult) {
+ Text url = entry.getKey();
+ Parse parse = entry.getValue();
+ ParseStatus parseStatus = parse.getData().getStatus();
+
+ if (!parseStatus.isSuccess()) {
+ LOG.warn("Error parsing: " + key + ": " + parseStatus);
+ parse = parseStatus.getEmptyParse(getConf());
+ }
+
+ // pass segment name to parse data
+ parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
+ getConf().get(Nutch.SEGMENT_NAME_KEY));
+
+ // compute the new signature
+ byte[] signature =
+ SignatureFactory.getSignature(getConf()).calculate(content, parse);
+ parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY,
+ StringUtil.toHexString(signature));
+
try {
- scfilters.passScoreAfterParsing((Text)key, content, parse);
+ scfilters.passScoreAfterParsing(url, content, parse);
} catch (ScoringFilterException e) {
if (LOG.isWarnEnabled()) {
e.printStackTrace(LogUtil.getWarnStream(LOG));
- LOG.warn("Error passing score: "+key+": "+e.getMessage());
+ LOG.warn("Error passing score: "+ url +": "+e.getMessage());
}
- return;
}
- output.collect(key, new ParseImpl(parse.getText(), parse.getData()));
- } else if (LOG.isWarnEnabled()) {
- LOG.warn("Error parsing: " + key + ": "+status.toString());
+ output.collect(url, new ParseImpl(new ParseText(parse.getText()),
+ parse.getData(), parse.isCanonical()));
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java Wed May 9 11:00:56 2007
@@ -185,6 +185,13 @@
return new EmptyParseImpl(this, conf);
}
+ /** A convenience method. Creates an empty ParseResult,
+ * which contains this status.
+ */
+ public ParseResult getEmptyParseResult(String url, Configuration conf) {
+ return ParseResult.createParseResult(url, getEmptyParse(conf));
+ }
+
public String toString() {
StringBuffer res = new StringBuffer();
String name = null;
@@ -260,6 +267,10 @@
public String getText() {
return "";
+ }
+
+ public boolean isCanonical() {
+ return true;
}
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java Wed May 9 11:00:56 2007
@@ -59,10 +59,10 @@
* <code>WARNING</code> level, and an empty parse is returned.
*
* @param content The content to try and parse.
- * @return A {@link Parse} object containing the parsed data.
+ * @return <key, {@link Parse}> pairs.
* @throws ParseException If no suitable parser is found to perform the parse.
*/
- public Parse parse(Content content) throws ParseException {
+ public ParseResult parse(Content content) throws ParseException {
Parser[] parsers = null;
try {
@@ -76,25 +76,21 @@
throw new ParseException(e.getMessage());
}
- Parse parse = null;
+ ParseResult parseResult = null;
for (int i=0; i<parsers.length; i++) {
if (LOG.isDebugEnabled()) {
LOG.debug("Parsing [" + content.getUrl() + "] with [" + parsers[i] + "]");
}
- parse = parsers[i].getParse(content);
- if ((parse != null) && (parse.getData().getStatus().isSuccess())) {
- return parse;
- }
+ parseResult = parsers[i].getParse(content);
+ if (parseResult != null && !parseResult.isEmpty())
+ return parseResult;
}
if (LOG.isWarnEnabled()) {
LOG.warn("Unable to successfully parse content " + content.getUrl() +
" of type " + content.getContentType());
}
-
- ParseStatus ps = (parse.getData() != null) ? parse.getData().getStatus() : null;
- return (ps == null) ? new ParseStatus().getEmptyParse(this.conf)
- : ps.getEmptyParse(this.conf);
+ return null;
}
/**
@@ -110,15 +106,14 @@
* to parse the specified content.
* @param content The content to parse.
*
- * @return A {@link Parse} object if the parse is successful, otherwise,
- * a <code>ParseStatus.getEmptyParse()</code>.
+ * @return <key, {@link Parse}> pairs if the parse is successful, otherwise,
+ * a single <key, <code>ParseStatus.getEmptyParse()</code>> pair.
*
* @throws ParseException If there is no suitable {@link Parser} found
* to perform the parse.
*/
- public Parse parseByExtensionId(String extId, Content content)
+ public ParseResult parseByExtensionId(String extId, Content content)
throws ParseException {
- Parse parse = null;
Parser p = null;
try {
@@ -131,16 +126,15 @@
throw new ParseException(e.getMessage());
}
- parse = p.getParse(content);
-
- if (parse != null && parse.getData().getStatus().isSuccess()) {
- return parse;
+ ParseResult parseResult = p.getParse(content);
+ if (parseResult != null && !parseResult.isEmpty()) {
+ return parseResult;
} else {
if (LOG.isWarnEnabled()) {
LOG.warn("Unable to successfully parse content " + content.getUrl() +
- " of type " + content.getContentType());
- }
- return new ParseStatus().getEmptyParse(this.conf);
+ " of type " + content.getContentType());
+ }
+ return null;
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/Parser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/Parser.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/Parser.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/Parser.java Wed May 9 11:00:56 2007
@@ -32,6 +32,25 @@
/** The name of the extension point. */
public final static String X_POINT_ID = Parser.class.getName();
- /** Creates the parse for some content. */
- Parse getParse(Content c);
+ /**
+ * <p>
+ * This method parses the given content and returns a map of
+ * <key, parse> pairs. {@link Parse} instances will be persisted
+ * under the given key.
+ * </p>
+ * <p>
+ * Note: Meta-redirects should be followed only when they are coming from
+ * the original URL. That is: <br>
+ * Assume fetcher is in parsing mode and is currently processing
+ * foo.bar.com/redirect.html. If this url contains a meta redirect
+ * to another url, fetcher should only follow the redirect if the map
+ * contains an entry of the form <"foo.bar.com/redirect.html",
+ * {@link Parse} with a {@link ParseStatus} indicating the redirect>.
+ * </p>
+ *
+ * @param c Content to be parsed
+ * @return a map containing <key, parse> pairs
+ * @since NUTCH-443
+ */
+ ParseResult getParse(Content c);
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Wed May 9 11:00:56 2007
@@ -94,13 +94,18 @@
LOG.info("contentType: "+contentType);
}
- Parse parse = new ParseUtil(conf).parse(content);
+ ParseResult parseResult = new ParseUtil(conf).parse(content);
- System.out.print("---------\nParseData\n---------\n");
- System.out.print(parse.getData().toString());
- if (dumpText) {
- System.out.print("---------\nParseText\n---------\n");
- System.out.print(parse.getText());
+ for (java.util.Map.Entry<Text, Parse> entry : parseResult) {
+ Parse parse = entry.getValue();
+ System.out.print("---------\nUrl\n---------------\n");
+ System.out.print(entry.getKey());
+ System.out.print("---------\nParseData\n---------\n");
+ System.out.print(parse.getData().toString());
+ if (dumpText) {
+ System.out.print("---------\nParseText\n---------\n");
+ System.out.print(parse.getText());
+ }
}
System.exit(0);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java Wed May 9 11:00:56 2007
@@ -39,8 +39,6 @@
// for back-compatibility, add old aliases for these Writable classes
// this may be removed after the 0.8 release
static {
- WritableName.addName(org.apache.nutch.fetcher.FetcherOutput.class,
- "FetcherOutput");
WritableName.addName(org.apache.nutch.parse.ParseData.class, "ParseData");
WritableName.addName(org.apache.nutch.parse.ParseText.class, "ParseText");
WritableName.addName(org.apache.nutch.protocol.Content.class, "Content");
Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java Wed May 9 11:00:56 2007
@@ -62,7 +62,7 @@
Content content =
new Content(url, url, bytes, contentType, new Metadata(), conf);
- Parse parse = new ParseUtil(conf).parseByExtensionId("parse-html",content);
+ Parse parse = new ParseUtil(conf).parseByExtensionId("parse-html", content).get(content.getUrl());
Metadata metadata = parse.getData().getParseMeta();
assertEquals(license, metadata.get("License-Url"));
Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java Wed May 9 11:00:56 2007
@@ -346,7 +346,7 @@
try {
protocol = new ProtocolFactory(conf).getProtocol(url);
Content content = protocol.getProtocolOutput(new Text(url), new CrawlDatum()).getContent();
- Parse parse = new ParseUtil(conf).parse(content);
+ Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
System.out.println("text:" + parse.getText());
return parse.getText();
Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Wed May 9 11:00:56 2007
@@ -24,7 +24,6 @@
// Nutch imports
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParserFactory;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;
@@ -53,7 +52,7 @@
/* loop through the test documents and validate result */
for (int t = 0; t < docs.length; t++) {
Content content = getContent(docs[t]);
- Parse parse = parser.parse(content);
+ Parse parse = parser.parse(content).get(content.getUrl());
assertEquals(metalanguages[t], (String) parse.getData().getParseMeta().get(Metadata.LANGUAGE));
}
} catch (Exception e) {
Modified: lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java Wed May 9 11:00:56 2007
@@ -35,9 +35,9 @@
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.OutlinkExtractor;
-import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.protocol.Content;
@@ -61,7 +61,7 @@
* Parses a Content with a specific {@link MSExtractor Microsoft document
* extractor}.
*/
- protected Parse getParse(MSExtractor extractor, Content content) {
+ protected ParseResult getParse(MSExtractor extractor, Content content) {
String text = null;
String title = null;
@@ -77,7 +77,7 @@
ParseStatus.FAILED_TRUNCATED,
"Content truncated at " + raw.length +" bytes. " +
"Parser can't handle incomplete file.")
- .getEmptyParse(this.conf);
+ .getEmptyParseResult(content.getUrl(), this.conf);
}
extractor.extract(new ByteArrayInputStream(raw));
text = extractor.getText();
@@ -87,7 +87,7 @@
} catch (Exception e) {
return new ParseStatus(ParseStatus.FAILED,
"Can't be handled as Microsoft document. " + e)
- .getEmptyParse(this.conf);
+ .getEmptyParseResult(content.getUrl(), this.conf);
}
// collect meta data
@@ -105,7 +105,8 @@
outlinks, content.getMetadata(),
metadata);
parseData.setConf(this.conf);
- return new ParseImpl(text, parseData);
+ return ParseResult.createParseResult(content.getUrl(),
+ new ParseImpl(text, parseData));
}
@@ -127,7 +128,7 @@
Content content = new Content(file, file, raw, mime, meta,
NutchConfiguration.create());
- System.out.println(parser.getParse(content).getText());
+ System.out.println(parser.getParse(content).get(file).getText());
}
private final static byte[] getRawBytes(File f) {
Modified: lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java Wed May 9 11:00:56 2007
@@ -18,6 +18,7 @@
package org.apache.nutch.parse.ext;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.Parse;
@@ -65,14 +66,14 @@
public ExtParser () { }
- public Parse getParse(Content content) {
+ public ParseResult getParse(Content content) {
String contentType = content.getContentType();
String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType);
if (params == null)
return new ParseStatus(ParseStatus.FAILED,
- "No external command defined for contentType: " + contentType).getEmptyParse(getConf());
+ "No external command defined for contentType: " + contentType).getEmptyParseResult(content.getUrl(), getConf());
String command = params[0];
int timeout = Integer.parseInt(params[1]);
@@ -94,7 +95,7 @@
return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
"Content truncated at " + raw.length
+" bytes. Parser can't handle incomplete "
- + contentType + " file.").getEmptyParse(getConf());
+ + contentType + " file.").getEmptyParseResult(content.getUrl(), getConf());
}
ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
@@ -114,12 +115,12 @@
if (cr.getExitValue() != 0)
return new ParseStatus(ParseStatus.FAILED,
"External command " + command
- + " failed with error: " + es.toString()).getEmptyParse(getConf());
+ + " failed with error: " + es.toString()).getEmptyParseResult(content.getUrl(), getConf());
text = os.toString();
} catch (Exception e) { // run time exception
- return new ParseStatus(e).getEmptyParse(getConf());
+ return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
}
if (text == null)
@@ -134,7 +135,8 @@
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
outlinks, content.getMetadata());
parseData.setConf(this.conf);
- return new ParseImpl(text, parseData);
+ return ParseResult.createParseResult(content.getUrl(),
+ new ParseImpl(text, parseData));
}
public void setConf(Configuration conf) {
Modified: lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java Wed May 9 11:00:56 2007
@@ -23,6 +23,7 @@
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParseException;
import org.apache.hadoop.conf.Configuration;
@@ -112,13 +113,13 @@
// check external parser that does 'cat'
contentType = "application/vnd.nutch.example.cat";
content.setContentType(contentType);
- parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content);
+ parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(content.getUrl());
assertEquals(expectedText,parse.getText());
// check external parser that does 'md5sum'
contentType = "application/vnd.nutch.example.md5sum";
content.setContentType(contentType);
- parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content);
+ parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(content.getUrl());
assertTrue(parse.getText().startsWith(expectedMD5sum));
}
}
Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Wed May 9 11:00:56 2007
@@ -18,6 +18,7 @@
package org.apache.nutch.parse.html;
import java.util.ArrayList;
+import java.util.Map;
import java.net.URL;
import java.net.MalformedURLException;
import java.io.*;
@@ -104,14 +105,14 @@
private String cachingPolicy;
- public Parse getParse(Content content) {
+ public ParseResult getParse(Content content) {
HTMLMetaTags metaTags = new HTMLMetaTags();
URL base;
try {
base = new URL(content.getBaseUrl());
} catch (MalformedURLException e) {
- return new ParseStatus(e).getEmptyParse(getConf());
+ return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
}
String text = "";
@@ -167,14 +168,14 @@
if (LOG.isTraceEnabled()) { LOG.trace("Parsing..."); }
root = parse(input);
} catch (IOException e) {
- return new ParseStatus(e).getEmptyParse(getConf());
+ return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
} catch (DOMException e) {
- return new ParseStatus(e).getEmptyParse(getConf());
+ return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
} catch (SAXException e) {
- return new ParseStatus(e).getEmptyParse(getConf());
+ return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
} catch (Exception e) {
e.printStackTrace(LogUtil.getWarnStream(LOG));
- return new ParseStatus(e).getEmptyParse(getConf());
+ return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
}
// get meta directives
@@ -213,14 +214,18 @@
ParseData parseData = new ParseData(status, title, outlinks,
content.getMetadata(), metadata);
parseData.setConf(this.conf);
- Parse parse = new ParseImpl(text, parseData);
+ ParseResult parseResult = ParseResult.createParseResult(content.getUrl(),
+ new ParseImpl(text, parseData));
// run filters on parse
- parse = this.htmlParseFilters.filter(content, parse, metaTags, root);
+ ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult,
+ metaTags, root);
if (metaTags.getNoCache()) { // not okay to cache
- parse.getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
+ for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse)
+ entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY,
+ cachingPolicy);
}
- return parse;
+ return filteredParse;
}
private DocumentFragment parse(InputSource input) throws Exception {
@@ -291,7 +296,7 @@
HtmlParser parser = new HtmlParser();
parser.setConf(conf);
Parse parse = parser.getParse(
- new Content(url, url, bytes, "text/html", new Metadata(), conf));
+ new Content(url, url, bytes, "text/html", new Metadata(), conf)).get(url);
System.out.println("data: "+parse.getData());
System.out.println("text: "+parse.getText());
Modified: lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Wed May 9 11:00:56 2007
@@ -35,6 +35,7 @@
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.protocol.Content;
@@ -141,11 +142,11 @@
}
}
- public Parse getParse(Content c) {
+ public ParseResult getParse(Content c) {
String type = c.getContentType();
if (type != null && !type.trim().equals("") && !type.toLowerCase().startsWith("application/x-javascript"))
return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
- "Content not JavaScript: '" + type + "'").getEmptyParse(getConf());
+ "Content not JavaScript: '" + type + "'").getEmptyParseResult(c.getUrl(), getConf());
String script = new String(c.getContent());
Outlink[] outlinks = getJSLinks(script, "", c.getUrl());
if (outlinks == null) outlinks = new Outlink[0];
@@ -162,8 +163,7 @@
ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks,
c.getMetadata());
pd.setConf(this.conf);
- Parse parse = new ParseImpl(script, pd);
- return parse;
+ return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd));
}
private static final String STRING_PATTERN = "(\\\\*(?:\"|\'))([^\\s\"\']+?)(?:\\1)";
Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/src/java/org/apache/nutch/parse/msexcel/MSExcelParser.java Wed May 9 11:00:56 2007
@@ -17,7 +17,7 @@
package org.apache.nutch.parse.msexcel;
// Nutch imports
-import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ms.MSBaseParser;
import org.apache.nutch.protocol.Content;
@@ -37,7 +37,7 @@
public static final String MIME_TYPE = "application/vnd.ms-excel";
- public Parse getParse(Content content) {
+ public ParseResult getParse(Content content) {
return getParse(new ExcelExtractor(), content);
}
Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java Wed May 9 11:00:56 2007
@@ -79,7 +79,7 @@
protocol = factory.getProtocol(urlString);
content = protocol.getProtocolOutput(new Text(urlString),
new CrawlDatum()).getContent();
- parse = parser.parseByExtensionId("parse-msexcel", content);
+ parse = parser.parseByExtensionId("parse-msexcel", content).get(content.getUrl());
assertTrue(parse.getText().equals(expectedText));
}
Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java Wed May 9 11:00:56 2007
@@ -17,7 +17,7 @@
package org.apache.nutch.parse.mspowerpoint;
// Nutch imports
-import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ms.MSBaseParser;
import org.apache.nutch.protocol.Content;
@@ -41,7 +41,7 @@
public static final String MIME_TYPE = "application/vnd.ms-powerpoint";
- public Parse getParse(final Content content) {
+ public ParseResult getParse(final Content content) {
return getParse(new PPTExtractor(), content);
}
Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java Wed May 9 11:00:56 2007
@@ -128,7 +128,8 @@
public void testContent() throws Exception {
Parse parse = new ParseUtil(NutchConfiguration.create())
- .parseByExtensionId("parse-mspowerpoint", this.content);
+ .parseByExtensionId("parse-mspowerpoint", this.content)
+ .get(this.content.getUrl());
ParseData data = parse.getData();
String text = parse.getText();
@@ -166,7 +167,8 @@
public void testMeta() throws Exception {
Parse parse = new ParseUtil(NutchConfiguration.create())
- .parseByExtensionId("parse-mspowerpoint", content);
+ .parseByExtensionId("parse-mspowerpoint", content)
+ .get(content.getUrl());
ParseData data = parse.getData();
Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java Wed May 9 11:00:56 2007
@@ -18,7 +18,7 @@
// Nutch imports
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ms.MSBaseParser;
@@ -40,7 +40,7 @@
public static final String MIME_TYPE = "application/msword";
- public Parse getParse(Content content) {
+ public ParseResult getParse(Content content) {
return getParse(new WordExtractor(), content);
}
Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java Wed May 9 11:00:56 2007
@@ -23,6 +23,7 @@
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParseException;
import org.apache.hadoop.conf.Configuration;
@@ -70,7 +71,7 @@
protocol = new ProtocolFactory(conf).getProtocol(urlString);
content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
- parse = new ParseUtil(conf).parseByExtensionId("parse-msword", content);
+ parse = new ParseUtil(conf).parseByExtensionId("parse-msword", content).get(content.getUrl());
assertTrue(parse.getText().startsWith(expectedText));
}
Modified: lucene/nutch/trunk/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java Wed May 9 11:00:56 2007
@@ -60,7 +60,7 @@
return conf;
}
- public Parse getParse(Content content) {
+ public ParseResult getParse(Content content) {
String text = null;
String title = null;
Metadata metadata = new Metadata();
@@ -73,7 +73,7 @@
&& raw.length != Integer.parseInt(contentLength)) {
return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
"Content truncated at "+raw.length
- +" bytes. Parser can't handle incomplete files.").getEmptyParse(conf);
+ +" bytes. Parser can't handle incomplete files.").getEmptyParseResult(content.getUrl(), conf);
}
ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(raw));
ZipEntry ze = null;
@@ -88,7 +88,7 @@
} catch (Exception e) { // run time exception
e.printStackTrace(LogUtil.getWarnStream(LOG));
return new ParseStatus(ParseStatus.FAILED,
- "Can't be handled as OO document. " + e).getEmptyParse(conf);
+ "Can't be handled as OO document. " + e).getEmptyParseResult(content.getUrl(), conf);
}
title = metadata.get(Metadata.TITLE);
@@ -100,7 +100,7 @@
Outlink[] links = (Outlink[])outlinks.toArray(new Outlink[outlinks.size()]);
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, links, metadata);
- return new ParseImpl(text, parseData);
+ return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
}
// extract as much plain text as possible.
@@ -206,7 +206,7 @@
fis.read(bytes);
fis.close();
Content c = new Content("local", "local", bytes, "application/vnd.oasis.opendocument.text", new Metadata(), conf);
- Parse p = oo.getParse(c);
+ Parse p = oo.getParse(c).get(c.getUrl());
System.out.println(p.getData());
System.out.println("Text: '" + p.getText() + "'");
/*
Modified: lucene/nutch/trunk/src/plugin/parse-oo/src/test/org/apache/nutch/parse/oo/TestOOParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-oo/src/test/org/apache/nutch/parse/oo/TestOOParser.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-oo/src/test/org/apache/nutch/parse/oo/TestOOParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-oo/src/test/org/apache/nutch/parse/oo/TestOOParser.java Wed May 9 11:00:56 2007
@@ -90,7 +90,7 @@
protocol = factory.getProtocol(urlString);
content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
- parse = parser.getParse(content);
+ parse = parser.getParse(content).get(content.getUrl());
String text = parse.getText().replaceAll("[ \t\r\n]+", " ");
assertTrue(expectedText.equals(text));
Modified: lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java Wed May 9 11:00:56 2007
@@ -34,9 +34,9 @@
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.Response;
import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
-import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.Outlink;
@@ -66,7 +66,7 @@
public static final Log LOG = LogFactory.getLog("org.apache.nutch.parse.pdf");
private Configuration conf;
- public Parse getParse(Content content) {
+ public ParseResult getParse(Content content) {
// in memory representation of pdf file
PDDocument pdf = null;
@@ -84,7 +84,7 @@
&& raw.length != Integer.parseInt(contentLength)) {
return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
"Content truncated at "+raw.length
- +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse(getConf());
+ +" bytes. Parser can't handle incomplete pdf file.").getEmptyParseResult(content.getUrl(), getConf());
}
PDFParser parser = new PDFParser(new ByteArrayInputStream(raw));
@@ -121,17 +121,17 @@
} catch (CryptographyException e) {
return new ParseStatus(ParseStatus.FAILED,
- "Error decrypting document. " + e).getEmptyParse(getConf());
+ "Error decrypting document. " + e).getEmptyParseResult(content.getUrl(), getConf());
} catch (InvalidPasswordException e) {
return new ParseStatus(ParseStatus.FAILED,
- "Can't decrypt document - invalid password. " + e).getEmptyParse(getConf());
+ "Can't decrypt document - invalid password. " + e).getEmptyParseResult(content.getUrl(), getConf());
} catch (Exception e) { // run time exception
if (LOG.isWarnEnabled()) {
LOG.warn("General exception in PDF parser: "+e.getMessage());
e.printStackTrace(LogUtil.getWarnStream(LOG));
}
return new ParseStatus(ParseStatus.FAILED,
- "Can't be handled as pdf document. " + e).getEmptyParse(getConf());
+ "Can't be handled as pdf document. " + e).getEmptyParseResult(content.getUrl(), getConf());
} finally {
try {
if (pdf != null)
@@ -154,7 +154,7 @@
outlinks, content.getMetadata(),
metadata);
parseData.setConf(this.conf);
- return new ParseImpl(text, parseData);
+ return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
// any filter?
//return HtmlParseFilters.filter(content, parse, root);
}
Modified: lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java Wed May 9 11:00:56 2007
@@ -70,7 +70,7 @@
Configuration conf = NutchConfiguration.create();
protocol = new ProtocolFactory(conf).getProtocol(urlString);
content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
- parse = new ParseUtil(conf).parseByExtensionId("parse-pdf", content);
+ parse = new ParseUtil(conf).parseByExtensionId("parse-pdf", content).get(content.getUrl());
int index = parse.getText().indexOf(expectedText);
assertTrue(index > 0);
Modified: lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java Wed May 9 11:00:56 2007
@@ -33,6 +33,7 @@
// Nutch imports
import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseStatus;
@@ -76,7 +77,7 @@
* The content to parse (hopefully an RSS content stream)
* @return A {@link ParseImpl}which implements the {@link Parse}interface.
*/
- public Parse getParse(Content content) {
+ public ParseResult getParse(Content content) {
List theRSSChannels = null;
@@ -101,7 +102,7 @@
LOG.warn("nutch:parse-rss:RSSParser Exception: " + e.getMessage());
}
return new ParseStatus(ParseStatus.FAILED,
- "Can't be handled as rss document. " + e).getEmptyParse(getConf());
+ "Can't be handled as rss document. " + e).getEmptyParseResult(content.getUrl(), getConf());
}
StringBuffer contentTitle = new StringBuffer(), indexText = new StringBuffer();
@@ -199,7 +200,7 @@
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
contentTitle.toString(), outlinks, content.getMetadata());
parseData.setConf(this.conf);
- return new ParseImpl(indexText.toString(), parseData);
+ return ParseResult.createParseResult(content.getUrl(), new ParseImpl(indexText.toString(), parseData));
}
public void setConf(Configuration conf) {
@@ -218,7 +219,7 @@
parser.setConf(conf);
Protocol protocol = new ProtocolFactory(conf).getProtocol(url);
Content content = protocol.getProtocolOutput(new Text(url), new CrawlDatum()).getContent();
- Parse parse = parser.getParse(content);
+ Parse parse = parser.getParse(content).get(content.getUrl());
System.out.println("data: "+ parse.getData());
System.out.println("text: "+parse.getText());
}
Modified: lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java Wed May 9 11:00:56 2007
@@ -88,7 +88,7 @@
protocol = new ProtocolFactory(conf).getProtocol(urlString);
content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
- parse = new ParseUtil(conf).parseByExtensionId("parse-rss", content);
+ parse = new ParseUtil(conf).parseByExtensionId("parse-rss", content).get(content.getUrl());
//check that there are 3 outlinks:
//http://test.channel.com
Modified: lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java Wed May 9 11:00:56 2007
@@ -61,7 +61,7 @@
return conf;
}
- public Parse getParse(Content content) {
+ public ParseResult getParse(Content content) {
String text = null;
Vector outlinks = new Vector();
@@ -74,7 +74,7 @@
if (contentLength != null && raw.length != Integer.parseInt(contentLength)) {
return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
"Content truncated at " + raw.length +
- " bytes. Parser can't handle incomplete files.").getEmptyParse(conf);
+ " bytes. Parser can't handle incomplete files.").getEmptyParseResult(content.getUrl(), getConf());
}
ExtractText extractor = new ExtractText();
@@ -103,14 +103,14 @@
}
} catch (Exception e) { // run time exception
e.printStackTrace(LogUtil.getErrorStream(LOG));
- return new ParseStatus(ParseStatus.FAILED, "Can't be handled as SWF document. " + e).getEmptyParse(conf);
- } finally {}
+ return new ParseStatus(ParseStatus.FAILED, "Can't be handled as SWF document. " + e).getEmptyParseResult(content.getUrl(), getConf());
+ }
if (text == null) text = "";
Outlink[] links = (Outlink[]) outlinks.toArray(new Outlink[outlinks.size()]);
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", links,
content.getMetadata());
- return new ParseImpl(text, parseData);
+ return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
}
/**
@@ -122,10 +122,11 @@
byte[] buf = new byte[in.available()];
in.read(buf);
SWFParser parser = new SWFParser();
- Parse p = parser.getParse(new Content("file:" + args[0], "file:" + args[0],
+ ParseResult parseResult = parser.getParse(new Content("file:" + args[0], "file:" + args[0],
buf, "application/x-shockwave-flash",
new Metadata(),
NutchConfiguration.create()));
+ Parse p = parseResult.get("file:" + args[0]);
System.out.println("Parse Text:");
System.out.println(p.getText());
System.out.println("Parse Data:");
Modified: lucene/nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java Wed May 9 11:00:56 2007
@@ -88,7 +88,7 @@
protocol = new ProtocolFactory(conf).getProtocol(urlString);
content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
- parse = new ParseUtil(conf).parse(content);
+ parse = new ParseUtil(conf).parse(content).get(content.getUrl());
String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
assertTrue(sampleTexts[i].equals(text));
Modified: lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Wed May 9 11:00:56 2007
@@ -26,7 +26,7 @@
public class TextParser implements Parser {
private Configuration conf;
- public Parse getParse(Content content) {
+ public ParseResult getParse(Content content) {
// ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", new
// Outlink[0], metadata);
@@ -38,7 +38,7 @@
try { // try to use named encoding
text = new String(content.getContent(), encoding);
} catch (java.io.UnsupportedEncodingException e) {
- return new ParseStatus(e).getEmptyParse(getConf());
+ return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
}
} else {
// FIXME: implement charset detector. This code causes problem when
@@ -48,7 +48,7 @@
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "",
OutlinkExtractor.getOutlinks(text, getConf()), content.getMetadata());
parseData.setConf(this.conf);
- return new ParseImpl(text, parseData);
+ return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
}
Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java Wed May 9 11:00:56 2007
@@ -29,9 +29,9 @@
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.protocol.Content;
@@ -52,7 +52,7 @@
public ZipParser() {
}
- public Parse getParse(final Content content) {
+ public ParseResult getParse(final Content content) {
String resultText = null;
String resultTitle = null;
@@ -74,7 +74,7 @@
ParseStatus.FAILED_TRUNCATED, "Content truncated at "
+ contentInBytes.length
+ " bytes. Parser can't handle incomplete pdf file.")
- .getEmptyParse(getConf());
+ .getEmptyParseResult(content.getUrl(), getConf());
}
ZipTextExtractor extractor = new ZipTextExtractor(getConf());
@@ -85,7 +85,7 @@
} catch (Exception e) {
return new ParseStatus(ParseStatus.FAILED,
- "Can't be handled as Zip document. " + e).getEmptyParse(getConf());
+ "Can't be handled as Zip document. " + e).getEmptyParseResult(content.getUrl(), getConf());
}
if (resultText == null) {
@@ -103,7 +103,7 @@
parseData.setConf(this.conf);
if (LOG.isTraceEnabled()) { LOG.trace("Zip file parsed sucessfully !!"); }
- return new ParseImpl(resultText, parseData);
+ return ParseResult.createParseResult(content.getUrl(), new ParseImpl(resultText, parseData));
}
public void setConf(Configuration conf) {
Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java Wed May 9 11:00:56 2007
@@ -38,6 +38,7 @@
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseException;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.protocol.Content;
@@ -97,7 +98,7 @@
metadata.set(Response.CONTENT_LENGTH, Long.toString(entry.getSize()));
metadata.set(Response.CONTENT_TYPE, contentType);
Content content = new Content(newurl, base, b, contentType, metadata, this.conf);
- Parse parse = new ParseUtil(this.conf).parse(content);
+ Parse parse = new ParseUtil(this.conf).parse(content).get(content.getUrl());
ParseData theParseData = parse.getData();
Outlink[] theOutlinks = theParseData.getOutlinks();
Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java?view=diff&rev=536606&r1=536605&r2=536606
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java Wed May 9 11:00:56 2007
@@ -23,6 +23,7 @@
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParseException;
import org.apache.hadoop.conf.Configuration;
@@ -70,7 +71,7 @@
protocol = new ProtocolFactory(conf).getProtocol(urlString);
content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
- parse = new ParseUtil(conf).parseByExtensionId("parse-zip",content);
+ parse = new ParseUtil(conf).parseByExtensionId("parse-zip",content).get(content.getUrl());
assertTrue(parse.getText().equals(expectedText));
}
}
Re: svn commit: r536606 - in /lucene/nutch/trunk: ./ src/java/org/apache/nutch/fetcher/
src/java/org/apache/nutch/metadata/ src/java/org/apache/nutch/parse/ src/java/org/apache/nutch/util/
src/plugin/creativecommons/src/test/org/creativecommons/nutch/ src/...
Posted by Andrzej Bialecki <ab...@getopt.org>.
Sami Siren wrote:
> ab@apache.org wrote:
>> Author: ab
>> Date: Wed May 9 11:00:56 2007
>> New Revision: 536606
>>
>> URL: http://svn.apache.org/viewvc?view=rev&rev=536606
>> Log:
>> NUTCH-443 - Allow parsers to return multiple Parse objects.
>
> did you forget to add something (ParseResult) or is it just me?
Indeed. Thanks for spotting this - it's fixed.
--
Best regards,
Andrzej Bialecki <><
___. ___ ___ ___ _ _ __________________________________
[__ || __|__/|__||\/| Information Retrieval, Semantic Web
___|||__|| \| || | Embedded Unix, System Integration
http://www.sigram.com Contact: info at sigram dot com
Re: svn commit: r536606 - in /lucene/nutch/trunk: ./ src/java/org/apache/nutch/fetcher/
src/java/org/apache/nutch/metadata/ src/java/org/apache/nutch/parse/ src/java/org/apache/nutch/util/
src/plugin/creativecommons/src/test/org/creativecommons/nutch/ src/...
Posted by Sami Siren <ss...@gmail.com>.
ab@apache.org wrote:
> Author: ab
> Date: Wed May 9 11:00:56 2007
> New Revision: 536606
>
> URL: http://svn.apache.org/viewvc?view=rev&rev=536606
> Log:
> NUTCH-443 - Allow parsers to return multiple Parse objects.
did you forget to add something (ParseResult) or is it just me?
--
Sami Siren