You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/10/18 23:00:19 UTC
svn commit: r326238 [2/2] - in /lucene/nutch/branches/mapred: ./ conf/ site/
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/db/
src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/parse/
src/java/org/apache/nutch/plugin/ src/java/o...
Modified: lucene/nutch/branches/mapred/src/plugin/index-basic/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/index-basic/plugin.xml?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/index-basic/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/index-basic/plugin.xml Tue Oct 18 13:59:40 2005
@@ -12,6 +12,10 @@
</library>
</runtime>
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
<extension id="org.apache.nutch.indexer.basic"
name="Nutch Basic Indexing Filter"
point="org.apache.nutch.indexer.IndexingFilter">
Modified: lucene/nutch/branches/mapred/src/plugin/index-more/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/index-more/plugin.xml?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/index-more/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/index-more/plugin.xml Tue Oct 18 13:59:40 2005
@@ -12,6 +12,10 @@
</library>
</runtime>
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
<extension id="org.apache.nutch.indexer.more"
name="Nutch More Indexing Filter"
point="org.apache.nutch.indexer.IndexingFilter">
Modified: lucene/nutch/branches/mapred/src/plugin/languageidentifier/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/languageidentifier/plugin.xml?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/languageidentifier/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/languageidentifier/plugin.xml Tue Oct 18 13:59:40 2005
@@ -5,13 +5,15 @@
version="1.0.0"
provider-name="nutch.org">
-
-
<runtime>
<library name="language-identifier.jar">
<export name="*"/>
</library>
</runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
<extension id="org.apache.nutch.analysis.lang.LanguageParser"
name="Nutch language Parser"
Modified: lucene/nutch/branches/mapred/src/plugin/ontology/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/ontology/plugin.xml?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/ontology/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/ontology/plugin.xml Tue Oct 18 13:59:40 2005
@@ -18,6 +18,10 @@
</runtime>
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
<!-- attribute "point" is the plugin interface class -->
<!-- seems kinda redundant to have to define the point here too -->
<extension id="org.apache.nutch.ontology.OntologyImpl"
Modified: lucene/nutch/branches/mapred/src/plugin/parse-ext/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-ext/plugin.xml?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-ext/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-ext/plugin.xml Tue Oct 18 13:59:40 2005
@@ -5,13 +5,15 @@
version="1.0.0"
provider-name="nutch.org">
-
-
<runtime>
<library name="parse-ext.jar">
<export name="*"/>
</library>
</runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
<extension id="org.apache.nutch.parse.ext"
name="ExtParse"
Modified: lucene/nutch/branches/mapred/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java Tue Oct 18 13:59:40 2005
@@ -21,9 +21,8 @@
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.ParserFactory;
-import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParseException;
import junit.framework.TestCase;
@@ -46,8 +45,7 @@
public class TestExtParser extends TestCase {
private File tempFile = null;
private String urlString = null;
- private Content content = null;;
- private Parser parser = null;;
+ private Content content = null;
private Parse parse = null;
private String expectedText = "nutch rocks nutch rocks nutch rocks";
@@ -107,15 +105,13 @@
// check external parser that does 'cat'
contentType = "application/vnd.nutch.example.cat";
content.setContentType(contentType);
- parser = ParserFactory.getParser(contentType, urlString);
- parse = parser.getParse(content);
+ parse = ParseUtil.parseByParserId("parse-ext", content);
assertEquals(expectedText,parse.getText());
// check external parser that does 'md5sum'
contentType = "application/vnd.nutch.example.md5sum";
content.setContentType(contentType);
- parser = ParserFactory.getParser(contentType, urlString);
- parse = parser.getParse(content);
+ parse = ParseUtil.parseByParserId("parse-ext", content);
assertTrue(parse.getText().startsWith(expectedMD5sum));
}
}
Modified: lucene/nutch/branches/mapred/src/plugin/parse-html/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/plugin.xml?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-html/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-html/plugin.xml Tue Oct 18 13:59:40 2005
@@ -5,8 +5,6 @@
version="1.0.0"
provider-name="nutch.org">
-
-
<runtime>
<library name="parse-html.jar">
<export name="*"/>
@@ -14,6 +12,10 @@
<library name="nekohtml-0.9.4.jar"/>
<library name="tagsoup-1.0rc3.jar"/>
</runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
<extension id="org.apache.nutch.parse.html"
name="HtmlParse"
Modified: lucene/nutch/branches/mapred/src/plugin/parse-js/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-js/plugin.xml?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-js/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-js/plugin.xml Tue Oct 18 13:59:40 2005
@@ -11,6 +11,10 @@
</library>
</runtime>
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
<extension id="org.apache.nutch.parse.js"
name="JS Parser"
point="org.apache.nutch.parse.Parser">
Modified: lucene/nutch/branches/mapred/src/plugin/parse-mp3/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-mp3/plugin.xml?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-mp3/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-mp3/plugin.xml Tue Oct 18 13:59:40 2005
@@ -12,6 +12,10 @@
<library name="jid3lib-0.5.1.jar"/>
</runtime>
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
<extension point="org.apache.nutch.parse.Parser"
id="org.apache.nutch.parse.mp3"
name="MP3Parse">
Modified: lucene/nutch/branches/mapred/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java Tue Oct 18 13:59:40 2005
@@ -19,7 +19,7 @@
import junit.framework.TestCase;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParserFactory;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
@@ -60,15 +60,13 @@
String urlString;
Protocol protocol;
Content content;
- Parser parser;
Parse parse;
urlString = "file:" + sampleDir + fileSeparator + id3v2;
protocol = ProtocolFactory.getProtocol(urlString);
content = protocol.getContent(urlString);
- parser = ParserFactory.getParser(content.getContentType(), urlString);
- parse = parser.getParse(content);
+ parse = ParseUtil.parseByParserId("parse-mp3",content);
Properties metadata = parse.getData().getMetadata();
assertEquals("postgresql comment id3v2", metadata.getProperty("COMM-Text"));
assertEquals("postgresql composer id3v2", metadata.getProperty("TCOM-Text"));
Modified: lucene/nutch/branches/mapred/src/plugin/parse-mspowerpoint/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-mspowerpoint/plugin.xml?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-mspowerpoint/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-mspowerpoint/plugin.xml Tue Oct 18 13:59:40 2005
@@ -13,6 +13,7 @@
<requires>
<import plugin="lib-jakarta-poi"/>
+ <import plugin="nutch-extensionpoints"/>
</requires>
<extension id="net.nutch.parse.mspowerpoint"
Modified: lucene/nutch/branches/mapred/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java Tue Oct 18 13:59:40 2005
@@ -29,8 +29,7 @@
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.Parser;
-import org.apache.nutch.parse.ParserFactory;
+import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
@@ -123,9 +122,7 @@
*/
public void testContent() throws Exception {
- Parser parser = ParserFactory.getParser(this.content.getContentType(),
- this.urlString);
- Parse parse = parser.getParse(this.content);
+ Parse parse = ParseUtil.parseByParserId("parse-mspowerpoint",this.content);
ParseData data = parse.getData();
String text = parse.getText();
@@ -162,10 +159,8 @@
*/
public void testMeta() throws Exception {
- Parser parser = ParserFactory.getParser(this.content.getContentType(),
- this.urlString);
- Parse parse = parser.getParse(this.content);
-
+ Parse parse = ParseUtil.parseByParserId("parse-mspowerpoint",content);
+
ParseData data = parse.getData();
final FileExtensionFilter titleFilter = new FileExtensionFilter(
Modified: lucene/nutch/branches/mapred/src/plugin/parse-msword/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-msword/plugin.xml?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-msword/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-msword/plugin.xml Tue Oct 18 13:59:40 2005
@@ -13,6 +13,10 @@
<library name="poi-scratchpad-2.1-20040508.jar"/>
</runtime>
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
<extension id="org.apache.nutch.parse.msword"
name="MSWordParse"
point="org.apache.nutch.parse.Parser">
Modified: lucene/nutch/branches/mapred/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java Tue Oct 18 13:59:40 2005
@@ -21,9 +21,8 @@
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.ParserFactory;
-import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParseException;
import junit.framework.TestCase;
@@ -57,7 +56,6 @@
String urlString;
Protocol protocol;
Content content;
- Parser parser;
Parse parse;
for (int i=0; i<sampleFiles.length; i++) {
@@ -65,9 +63,7 @@
protocol = ProtocolFactory.getProtocol(urlString);
content = protocol.getProtocolOutput(urlString).getContent();
-
- parser = ParserFactory.getParser(content.getContentType(), urlString);
- parse = parser.getParse(content);
+ parse = ParseUtil.parseByParserId("parse-msword",content);
assertTrue(parse.getText().startsWith(expectedText));
}
Modified: lucene/nutch/branches/mapred/src/plugin/parse-pdf/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-pdf/plugin.xml?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-pdf/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-pdf/plugin.xml Tue Oct 18 13:59:40 2005
@@ -10,9 +10,13 @@
<library name="parse-pdf.jar">
<export name="*"/>
</library>
- <library name="PDFBox-0.7.0.jar"/>
+ <library name="PDFBox-0.7.2-log4j.jar"/>
<library name="log4j-1.2.9.jar"/>
</runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
<extension id="org.apache.nutch.parse.pdf"
name="PdfParse"
Modified: lucene/nutch/branches/mapred/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java Tue Oct 18 13:59:40 2005
@@ -21,9 +21,8 @@
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.ParserFactory;
-import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParseException;
import junit.framework.TestCase;
@@ -57,7 +56,6 @@
String urlString;
Protocol protocol;
Content content;
- Parser parser;
Parse parse;
for (int i=0; i<sampleFiles.length; i++) {
@@ -65,9 +63,7 @@
protocol = ProtocolFactory.getProtocol(urlString);
content = protocol.getProtocolOutput(urlString).getContent();
-
- parser = ParserFactory.getParser(content.getContentType(), urlString);
- parse = parser.getParse(content);
+ parse = ParseUtil.parseByParserId("parse-pdf",content);
int index = parse.getText().indexOf(expectedText);
assertTrue(index > 0);
Modified: lucene/nutch/branches/mapred/src/plugin/parse-rss/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-rss/plugin.xml?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-rss/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-rss/plugin.xml Tue Oct 18 13:59:40 2005
@@ -20,8 +20,11 @@
<library name="xercesImpl.jar"/>
<library name="xml-apis.jar"/>
<library name="xml-rpc-1.2.jar"/>
-
</runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
<extension id="org.apache.nutch.parse.rss"
name="RssParse"
Modified: lucene/nutch/branches/mapred/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java Tue Oct 18 13:59:40 2005
@@ -157,11 +157,13 @@
if (r.getLink() != null) {
try {
// get the outlink
- theOutlinks.add(new Outlink(r.getLink(), r
- .getDescription()));
+ if (r.getDescription()!= null ) {
+ theOutlinks.add(new Outlink(r.getLink(), r.getDescription()));
+ } else {
+ theOutlinks.add(new Outlink(r.getLink(), ""));
+ }
} catch (MalformedURLException e) {
- LOG
- .info("nutch:parse-rss:RSSParser Exception: MalformedURL: "
+ LOG.info("nutch:parse-rss:RSSParser Exception: MalformedURL: "
+ r.getLink()
+ ": Attempting to continue processing outlinks");
e.printStackTrace();
@@ -185,12 +187,13 @@
if (whichLink != null) {
try {
- theOutlinks.add(new Outlink(whichLink, theRSSItem
- .getDescription()));
-
+ if (theRSSItem.getDescription()!=null) {
+ theOutlinks.add(new Outlink(whichLink, theRSSItem.getDescription()));
+ } else {
+ theOutlinks.add(new Outlink(whichLink, ""));
+ }
} catch (MalformedURLException e) {
- LOG
- .info("nutch:parse-rss:RSSParser Exception: MalformedURL: "
+ LOG.info("nutch:parse-rss:RSSParser Exception: MalformedURL: "
+ whichLink
+ ": Attempting to continue processing outlinks");
e.printStackTrace();
@@ -206,23 +209,18 @@
LOG.fine("nutch:parse-rss:getParse:contentTitle=" + contentTitle);
} else {
- LOG
- .fine("nutch:parse-rss:Error:getParse: No RSS Channels recorded!");
+ LOG.fine("nutch:parse-rss:Error:getParse: No RSS Channels recorded!");
}
// format the outlinks
+ Outlink[] outlinks = (Outlink[]) theOutlinks.toArray(new Outlink[theOutlinks.size()]);
- Outlink[] outlinks = (Outlink[]) theOutlinks
- .toArray(new Outlink[theOutlinks.size()]);
-
- LOG.fine("nutch:parse-rss:getParse:found " + outlinks.length
- + " outlinks");
+ LOG.fine("nutch:parse-rss:getParse:found " + outlinks.length + " outlinks");
// LOG.info("Outlinks: "+outlinks);
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
contentTitle.toString(), outlinks, content.getMetadata());
return new ParseImpl(indexText.toString(), parseData);
-
}
}
Modified: lucene/nutch/branches/mapred/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java Tue Oct 18 13:59:40 2005
@@ -21,9 +21,8 @@
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.ParserFactory;
-import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParseException;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.Outlink;
@@ -75,7 +74,6 @@
String urlString;
Protocol protocol;
Content content;
- Parser parser;
Parse parse;
for (int i = 0; i < sampleFiles.length; i++) {
@@ -83,10 +81,7 @@
protocol = ProtocolFactory.getProtocol(urlString);
content = protocol.getProtocolOutput(urlString).getContent();
-
- parser = ParserFactory.getParser(content.getContentType(),
- urlString);
- parse = parser.getParse(content);
+ parse = ParseUtil.parseByParserId("parse-rss",content);
//check that there are 3 outlinks:
//http://test.channel.com
Modified: lucene/nutch/branches/mapred/src/plugin/parse-rtf/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-rtf/plugin.xml?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-rtf/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-rtf/plugin.xml Tue Oct 18 13:59:40 2005
@@ -12,6 +12,10 @@
<library name="rtf-parser.jar"/>
</runtime>
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
<extension point="org.apache.nutch.parse.Parser"
id="org.apache.nutch.parse.rtf"
name="RTFParse">
Modified: lucene/nutch/branches/mapred/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java Tue Oct 18 13:59:40 2005
@@ -18,8 +18,8 @@
import junit.framework.TestCase;
import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.ParserFactory;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
@@ -58,15 +58,13 @@
String urlString;
Protocol protocol;
Content content;
- Parser parser;
Parse parse;
urlString = "file:" + sampleDir + fileSeparator + rtfFile;
protocol = ProtocolFactory.getProtocol(urlString);
content = protocol.getContent(urlString);
- parser = ParserFactory.getParser(content.getContentType(), urlString);
- parse = parser.getParse(content);
+ parse = ParseUtil.parseByParserId("parse-rtf",content);
String text = parse.getText();
assertEquals("The quick brown fox jumps over the lazy dog", text.trim());
Modified: lucene/nutch/branches/mapred/src/plugin/parse-text/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-text/plugin.xml?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-text/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-text/plugin.xml Tue Oct 18 13:59:40 2005
@@ -12,6 +12,10 @@
</library>
</runtime>
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
<extension id="org.apache.nutch.parse.text"
name="TextParse"
point="org.apache.nutch.parse.Parser">
Modified: lucene/nutch/branches/mapred/src/plugin/parse-zip/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-zip/plugin.xml?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-zip/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-zip/plugin.xml Tue Oct 18 13:59:40 2005
@@ -11,6 +11,10 @@
</library>
</runtime>
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
<extension id="org.apache.nutch.parse.zip"
name="ZipParser"
point="org.apache.nutch.parse.Parser">
Modified: lucene/nutch/branches/mapred/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java Tue Oct 18 13:59:40 2005
@@ -28,9 +28,8 @@
// Nutch imports
import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParserFactory;
import org.apache.nutch.parse.ParseException;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.protocol.Content;
@@ -89,8 +88,7 @@
metadata.setProperty("Content-Length", Long.toString(entry.getSize()));
metadata.setProperty("Content-Type", contentType);
Content content = new Content(newurl, base, b, contentType, metadata);
- Parser parser = ParserFactory.getParser(contentType, newurl);
- Parse parse = parser.getParse(content);
+ Parse parse = ParseUtil.parse(content);
ParseData theParseData = parse.getData();
Outlink[] theOutlinks = theParseData.getOutlinks();
Modified: lucene/nutch/branches/mapred/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java Tue Oct 18 13:59:40 2005
@@ -21,9 +21,8 @@
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.ParserFactory;
-import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParseException;
import junit.framework.TestCase;
@@ -57,7 +56,6 @@
String urlString;
Protocol protocol;
Content content;
- Parser parser;
Parse parse;
for (int i = 0; i < sampleFiles.length; i++) {
@@ -65,9 +63,7 @@
protocol = ProtocolFactory.getProtocol(urlString);
content = protocol.getProtocolOutput(urlString).getContent();
-
- parser = ParserFactory.getParser(content.getContentType(), urlString);
- parse = parser.getParse(content);
+ parse = ParseUtil.parseByParserId("parse-zip",content);
assertTrue(parse.getText().equals(expectedText));
}
}
Modified: lucene/nutch/branches/mapred/src/plugin/protocol-file/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-file/plugin.xml?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-file/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-file/plugin.xml Tue Oct 18 13:59:40 2005
@@ -12,6 +12,10 @@
</library>
</runtime>
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
<extension id="org.apache.nutch.protocol.file"
name="FileProtocol"
point="org.apache.nutch.protocol.Protocol">
Modified: lucene/nutch/branches/mapred/src/plugin/protocol-ftp/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-ftp/plugin.xml?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-ftp/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-ftp/plugin.xml Tue Oct 18 13:59:40 2005
@@ -5,14 +5,16 @@
version="1.0.0"
provider-name="nutch.org">
-
-
<runtime>
<library name="protocol-ftp.jar">
<export name="*"/>
</library>
<library name="commons-net-1.2.0-dev.jar"/>
</runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
<extension id="org.apache.nutch.protocol.ftp"
name="FtpProtocol"
Modified: lucene/nutch/branches/mapred/src/plugin/protocol-http/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-http/plugin.xml?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-http/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-http/plugin.xml Tue Oct 18 13:59:40 2005
@@ -11,6 +11,10 @@
</library>
</runtime>
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
<extension id="org.apache.nutch.protocol.http"
name="HttpProtocol"
point="org.apache.nutch.protocol.Protocol">
Modified: lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/plugin.xml?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/plugin.xml Tue Oct 18 13:59:40 2005
@@ -13,6 +13,10 @@
<library name="commons-httpclient-3.0-rc2.jar" />
</runtime>
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
<extension id="org.apache.nutch.protocol.httpclient"
name="HttpProtocol"
point="org.apache.nutch.protocol.Protocol">
Modified: lucene/nutch/branches/mapred/src/plugin/query-basic/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/query-basic/plugin.xml?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/query-basic/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/query-basic/plugin.xml Tue Oct 18 13:59:40 2005
@@ -5,13 +5,15 @@
version="1.0.0"
provider-name="nutch.org">
-
-
<runtime>
<library name="query-basic.jar">
<export name="*"/>
</library>
</runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
<extension id="org.apache.nutch.searcher.basic"
name="Nutch Basic Query Filter"
Modified: lucene/nutch/branches/mapred/src/plugin/query-more/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/query-more/plugin.xml?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/query-more/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/query-more/plugin.xml Tue Oct 18 13:59:40 2005
@@ -5,13 +5,15 @@
version="1.0.0"
provider-name="nutch.org">
-
-
<runtime>
<library name="query-more.jar">
<export name="*"/>
</library>
</runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
<extension id="org.apache.nutch.searcher.more"
name="Nutch More Query Filter"
Modified: lucene/nutch/branches/mapred/src/plugin/query-site/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/query-site/plugin.xml?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/query-site/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/query-site/plugin.xml Tue Oct 18 13:59:40 2005
@@ -11,6 +11,10 @@
</library>
</runtime>
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
<extension id="org.apache.nutch.searcher.site.SiteQueryFilter"
name="Nutch Site Query Filter"
point="org.apache.nutch.searcher.QueryFilter">
Modified: lucene/nutch/branches/mapred/src/plugin/query-url/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/query-url/plugin.xml?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/query-url/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/query-url/plugin.xml Tue Oct 18 13:59:40 2005
@@ -5,14 +5,15 @@
version="1.0.0"
provider-name="nutch.org">
-
-
<runtime>
<library name="query-url.jar">
<export name="*"/>
</library>
</runtime>
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
<extension id="org.apache.nutch.searcher.url.URLQueryFilter"
name="Nutch URL Query Filter"
Modified: lucene/nutch/branches/mapred/src/plugin/urlfilter-prefix/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/urlfilter-prefix/plugin.xml?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/urlfilter-prefix/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/urlfilter-prefix/plugin.xml Tue Oct 18 13:59:40 2005
@@ -5,15 +5,17 @@
version="1.0.0"
provider-name="nutch.org">
-
-
<runtime>
<library name="urlfilter-prefix.jar">
<export name="*"/>
</library>
</runtime>
- <extension id="org.apache.nutch.net.urlfiler"
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.net.urlfilter"
name="Nutch Prefix URL Filter"
point="org.apache.nutch.net.URLFilter">
<implementation id="PrefixURLFilter"
Modified: lucene/nutch/branches/mapred/src/plugin/urlfilter-regex/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/urlfilter-regex/plugin.xml?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/urlfilter-regex/plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/urlfilter-regex/plugin.xml Tue Oct 18 13:59:40 2005
@@ -11,7 +11,11 @@
</library>
</runtime>
- <extension id="org.apache.nutch.net.urlfiler"
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.net.urlfilter"
name="Nutch Regex URL Filter"
point="org.apache.nutch.net.URLFilter">
<implementation id="RegexURLFilter"
Modified: lucene/nutch/branches/mapred/src/site/src/documentation/content/xdocs/site.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/site/src/documentation/content/xdocs/site.xml?rev=326238&r1=326237&r2=326238&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/site/src/documentation/content/xdocs/site.xml (original)
+++ lucene/nutch/branches/mapred/src/site/src/documentation/content/xdocs/site.xml Tue Oct 18 13:59:40 2005
@@ -24,7 +24,7 @@
</project>
<docs label="Documentation">
- <faq label="FAQ" href="faq.html" />
+ <faq label="FAQ" href="ext:faq" />
<wiki label="Wiki" href="ext:wiki" />
<tutorial label="Tutorial" href="tutorial.html" />
<webmasters label="Robot " href="bot.html" />
@@ -46,6 +46,7 @@
<external-refs>
<lucene href="http://lucene.apache.org/java/" />
<wiki href="http://wiki.apache.org/nutch/" />
+ <faq href="http://wiki.apache.org/nutch/FAQ" />
<store href="http://www.cafepress.com/nutch/" />
</external-refs>