You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2005/10/15 00:11:19 UTC
svn commit: r321231 - in /lucene/nutch/trunk/src:
java/org/apache/nutch/fetcher/ java/org/apache/nutch/parse/
java/org/apache/nutch/tools/
plugin/creativecommons/src/test/org/creativecommons/nutch/
plugin/parse-ext/src/test/org/apache/nutch/parse/ext/ ...
Author: jerome
Date: Fri Oct 14 15:10:45 2005
New Revision: 321231
URL: http://svn.apache.org/viewcvs?rev=321231&view=rev
Log:
NUTCH-88, Final step implementation:
* Add a parse utility that loops over the ordered list of parsers defined for a content-type (until a parser returns a Parse object).
* Add a parse utility that returns a Parse object using a specified parser (mainly used in unit tests).
* Make use of this utility in classes that need to parse some content.
Added:
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java (with props)
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserNotFound.java
lucene/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java
lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java
lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java
lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Fri Oct 14 15:10:45 2005
@@ -246,12 +246,10 @@
return null;
}
String contentType = content.getContentType();
- Parser parser = null;
Parse parse = null;
ParseStatus status = null;
try {
- parser = ParserFactory.getParser(contentType, url);
- parse = parser.getParse(content);
+ parse = ParseUtil.parse(content);
status = parse.getData().getStatus();
} catch (Exception e) {
e.printStackTrace();
Added: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java?rev=321231&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java Fri Oct 14 15:10:45 2005
@@ -0,0 +1,120 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+// JDK imports
+import java.util.logging.Logger;
+
+// Nutch Imports
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.LogFormatter;
+
+
+/**
+ * A utility class containing methods that perform common parsing tasks, such
+ * as iterating through a preferred list of {@link Parser}s to obtain
+ * {@link Parse} objects.
+ *
+ * @author mattmann
+ * @author Jérôme Charron
+ * @author Sébastien Le Callonnec
+ */
+public class ParseUtil {
+
+ /* our log stream */
+ public static final Logger LOG = LogFormatter.getLogger(ParseUtil.class
+ .getName());
+
+ /** No public constructor */
+ private ParseUtil() { }
+
+ /**
+ * Performs a parse by iterating through an array of preferred {@link Parser}s
+ * until a successful parse is performed and a {@link Parse} object is
+ * returned. If the parse is unsuccessful, a message is logged to the
+ * <code>WARNING</code> level, and an empty parse is returned.
+ *
+ * @param content The content to try and parse.
+ * @return A {@link Parse} object containing the parsed data.
+ * @throws ParseException If no suitable parser is found to perform the parse.
+ */
+ public final static Parse parse(Content content) throws ParseException {
+ Parser[] parsers = null;
+
+ try { // NOTE(review): an empty URL is passed to getParsers(); confirm that is intended
+ parsers = ParserFactory.getParsers(content.getContentType(), "");
+ } catch (ParserNotFound e) {
+ LOG.warning("No suitable parser found when trying to parse content " +
+ content);
+ throw new ParseException(e.getMessage());
+ }
+
+ Parse parse = null;
+ for (int i=0; i<parsers.length; i++) {
+ parse = parsers[i].getParse(content);
+ if ((parse != null) && (parse.getData().getStatus().isSuccess())) {
+ return parse;
+ }
+ }
+
+ LOG.warning("Unable to successfully parse content " + content.getUrl() +
+ " of type " + content.getContentType());
+
+ return new ParseStatus().getEmptyParse();
+ }
+
+ /**
+ * Method parses a {@link Content} object using the {@link Parser} specified
+ * by the parameter <code>parserId</code>. If a suitable {@link Parser} is not
+ * found, then a <code>WARNING</code> level message is logged, and a
+ * ParseException is thrown.
+ * If the parse is unsuccessful for any other reason, then a <code>WARNING</code>
+ * level message is logged, and a <code>ParseStatus.getEmptyParse()</code> is
+ * returned.
+ *
+ * @param parserId The ID of the {@link Parser} to use to parse the specified
+ * content.
+ * @param content The content to parse.
+ * @return A {@link Parse} object if the parse is successful, otherwise,
+ * a <code>ParseStatus.getEmptyParse()</code>.
+ * @throws ParseException If there is no suitable {@link Parser} found
+ * to perform the parse.
+ */
+ public final static Parse parseByParserId(String parserId, Content content)
+ throws ParseException {
+ Parse parse = null;
+ Parser p = null;
+
+ try {
+ p = ParserFactory.getParserById(parserId);
+ } catch (ParserNotFound e) {
+ LOG.warning("No suitable parser found when trying to parse content " +
+ content);
+ throw new ParseException(e.getMessage());
+ }
+
+ parse = p.getParse(content);
+
+ if (parse != null && parse.getData().getStatus().isSuccess()) {
+ return parse;
+ } else {
+ LOG.warning("Unable to successfully parse content " + content.getUrl() +
+ " of type " + content.getContentType());
+ return new ParseStatus().getEmptyParse();
+ }
+ }
+
+}
Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Fri Oct 14 15:10:45 2005
@@ -18,6 +18,8 @@
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.parse.ParseUtil;
+
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.Content;
@@ -83,8 +85,7 @@
LOG.info("parsing: "+url);
LOG.info("contentType: "+contentType);
- Parser parser = ParserFactory.getParser(contentType, url);
- Parse parse = parser.getParse(content);
+ Parse parse = ParseUtil.parse(content);
System.out.print("---------\nParseData\n---------\n");
System.out.print(parse.getData().toString());
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java Fri Oct 14 15:10:45 2005
@@ -174,6 +174,54 @@
}
/**
+ * <p>Function returns a {@link Parser} instance with the specified <code>parserId</code>.
+ * If the Parser instance isn't found, then the function throws a <code>ParserNotFound</code>
+ * exception. If the function is able to find the {@link Parser} in the internal <code>PARSER_CACHE</code>
+ * then it will return the already instantiated Parser. Otherwise, if it has to instantiate the Parser
+ * itself, then this function will cache that Parser in the internal <code>PARSER_CACHE</code>.</p>
+ *
+ * @param parserId The string ID (e.g., "parse-text", "parse-msword") of the {@link Parser} implementation to return.
+ * @return A {@link Parser} implementation specified by the parameter <code>parserId</code>.
+ * @throws ParserNotFound If the Parser is not found (i.e., it is not registered with the extension point), or if there is a {@link PluginRuntimeException}
+ * instantiating the {@link Parser}.
+ */
+ public static Parser getParserById(String parserId) throws ParserNotFound{
+ //first check the cache
+
+ if(PARSER_CACHE.get(parserId) != null){
+ return (Parser)PARSER_CACHE.get(parserId);
+ }
+ else{
+ //get the list of registered parsing extensions
+ //then find the right one by Id
+
+ Extension[] extensions = X_POINT.getExtensions();
+ Extension parserExt = getExtensionById(extensions,parserId);
+
+ if (parserExt == null) {
+ throw new ParserNotFound("No Parser Found for parserId: "
+ + parserId + "!");
+ } else {
+ // instantiate the Parser
+ try {
+ Parser p = null;
+ p = (Parser) parserExt.getExtensionInstance();
+ PARSER_CACHE
+ .put(parserId, p);
+ return p;
+ } catch (PluginRuntimeException e) {
+ LOG.warning("ParserFactory:PluginRuntimeException when "
+ + "initializing parser plugin "
+ + parserExt.getDescriptor().getPluginId()
+ + " instance in getParserById");
+ throw new ParserNotFound("No Parser Found for parserId: "
+ + parserId + "!");
+ }
+ }
+ }
+ }
+
+ /**
* finds the best-suited parse plugin for a given contentType.
*
* @param contentType Content-Type for which we seek a parse plugin.
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserNotFound.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserNotFound.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserNotFound.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserNotFound.java Fri Oct 14 15:10:45 2005
@@ -13,15 +13,18 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
package org.apache.nutch.parse;
-import java.io.IOException;
-
public class ParserNotFound extends ParseException {
+
+ private static final long serialVersionUID=23993993939L;
private String url;
private String contentType;
+ public ParserNotFound(String message){
+ super(message);
+ }
+
public ParserNotFound(String url, String contentType) {
this(url, contentType,
"parser not found for contentType="+contentType+" url="+url);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java Fri Oct 14 15:10:45 2005
@@ -230,8 +230,7 @@
return;
}
- Parser parser = ParserFactory.getParser(contentType, url);
- Parse parse = parser.getParse(content);
+ Parse parse = ParseUtil.parse(content);
outputPage(new ParseText(parse.getText()), parse.getData());
} else {
@@ -585,7 +584,7 @@
parseSegment.setLogLevel
(Level.parse((new String(logLevel)).toUpperCase()));
-
+
if (threadCount != -1)
parseSegment.setThreadCount(threadCount);
if (showThreadID)
Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java Fri Oct 14 15:10:45 2005
@@ -16,7 +16,8 @@
package org.creativecommons.nutch;
-import org.apache.nutch.parse.*;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import java.util.Properties;
@@ -54,10 +55,9 @@
in.close();
byte[] bytes = out.toByteArray();
- Parser parser = ParserFactory.getParser(contentType, url);
Content content =
new Content(url, url, bytes, contentType, new Properties());
- Parse parse = parser.getParse(content);
+ Parse parse = ParseUtil.parseByParserId("parse-html",content);
Properties metadata = parse.getData().getMetadata();
assertEquals(license, metadata.get("License-Url"));
Modified: lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java Fri Oct 14 15:10:45 2005
@@ -21,8 +21,7 @@
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.ParserFactory;
-import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.util.ParseUtil;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseException;
@@ -46,8 +45,7 @@
public class TestExtParser extends TestCase {
private File tempFile = null;
private String urlString = null;
- private Content content = null;;
- private Parser parser = null;;
+ private Content content = null;
private Parse parse = null;
private String expectedText = "nutch rocks nutch rocks nutch rocks";
@@ -107,15 +105,13 @@
// check external parser that does 'cat'
contentType = "application/vnd.nutch.example.cat";
content.setContentType(contentType);
- parser = ParserFactory.getParser(contentType, urlString);
- parse = parser.getParse(content);
+ parse = ParseUtil.parse(content);
assertEquals(expectedText,parse.getText());
// check external parser that does 'md5sum'
contentType = "application/vnd.nutch.example.md5sum";
content.setContentType(contentType);
- parser = ParserFactory.getParser(contentType, urlString);
- parse = parser.getParse(content);
+ parse = ParseUtil.parse(content);
assertTrue(parse.getText().startsWith(expectedMD5sum));
}
}
Modified: lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java Fri Oct 14 15:10:45 2005
@@ -19,7 +19,7 @@
import junit.framework.TestCase;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParserFactory;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
@@ -60,15 +60,13 @@
String urlString;
Protocol protocol;
Content content;
- Parser parser;
Parse parse;
urlString = "file:" + sampleDir + fileSeparator + id3v2;
protocol = ProtocolFactory.getProtocol(urlString);
content = protocol.getContent(urlString);
- parser = ParserFactory.getParser(content.getContentType(), urlString);
- parse = parser.getParse(content);
+ parse = ParseUtil.parseByParserId("parse-mp3",content);
Properties metadata = parse.getData().getMetadata();
assertEquals("postgresql comment id3v2", metadata.getProperty("COMM-Text"));
assertEquals("postgresql composer id3v2", metadata.getProperty("TCOM-Text"));
Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java Fri Oct 14 15:10:45 2005
@@ -29,8 +29,7 @@
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.Parser;
-import org.apache.nutch.parse.ParserFactory;
+import org.apache.nutch.parse.util.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
@@ -123,9 +122,7 @@
*/
public void testContent() throws Exception {
- Parser parser = ParserFactory.getParser(this.content.getContentType(),
- this.urlString);
- Parse parse = parser.getParse(this.content);
+ Parse parse = ParseUtil.parseByParserId("parse-mspowerpoint",this.content);
ParseData data = parse.getData();
String text = parse.getText();
@@ -162,10 +159,8 @@
*/
public void testMeta() throws Exception {
- Parser parser = ParserFactory.getParser(this.content.getContentType(),
- this.urlString);
- Parse parse = parser.getParse(this.content);
-
+ Parse parse = ParseUtil.parseByParserId("parse-mspowerpoint",content);
+
ParseData data = parse.getData();
final FileExtensionFilter titleFilter = new FileExtensionFilter(
Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java Fri Oct 14 15:10:45 2005
@@ -21,8 +21,7 @@
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.ParserFactory;
-import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.util.ParseUtil;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseException;
@@ -65,9 +64,7 @@
protocol = ProtocolFactory.getProtocol(urlString);
content = protocol.getProtocolOutput(urlString).getContent();
-
- parser = ParserFactory.getParser(content.getContentType(), urlString);
- parse = parser.getParse(content);
+ parse = ParseUtil.parseByParserId("parse-msword",content);
assertTrue(parse.getText().startsWith(expectedText));
}
Modified: lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java Fri Oct 14 15:10:45 2005
@@ -21,8 +21,7 @@
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.ParserFactory;
-import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.util.ParseUtil;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseException;
@@ -65,9 +64,7 @@
protocol = ProtocolFactory.getProtocol(urlString);
content = protocol.getProtocolOutput(urlString).getContent();
-
- parser = ParserFactory.getParser(content.getContentType(), urlString);
- parse = parser.getParse(content);
+ parse = ParseUtil.parseByParserId("parse-pdf",content);
int index = parse.getText().indexOf(expectedText);
assertTrue(index > 0);
Modified: lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java Fri Oct 14 15:10:45 2005
@@ -21,8 +21,7 @@
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.ParserFactory;
-import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.util.ParseUtil;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseException;
import org.apache.nutch.parse.ParseData;
@@ -75,7 +74,6 @@
String urlString;
Protocol protocol;
Content content;
- Parser parser;
Parse parse;
for (int i = 0; i < sampleFiles.length; i++) {
@@ -83,10 +81,7 @@
protocol = ProtocolFactory.getProtocol(urlString);
content = protocol.getProtocolOutput(urlString).getContent();
-
- parser = ParserFactory.getParser(content.getContentType(),
- urlString);
- parse = parser.getParse(content);
+ parse = ParseUtil.parseByParserId("parse-rss",content);
//check that there are 3 outlinks:
//http://test.channel.com
Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java Fri Oct 14 15:10:45 2005
@@ -18,8 +18,8 @@
import junit.framework.TestCase;
import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.util.ParseUtil;
import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.ParserFactory;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
@@ -58,15 +58,13 @@
String urlString;
Protocol protocol;
Content content;
- Parser parser;
Parse parse;
urlString = "file:" + sampleDir + fileSeparator + rtfFile;
protocol = ProtocolFactory.getProtocol(urlString);
content = protocol.getContent(urlString);
- parser = ParserFactory.getParser(content.getContentType(), urlString);
- parse = parser.getParse(content);
+ parse = ParseUtil.parseByParserId("parse-rtf",content);
String text = parse.getText();
assertEquals("The quick brown fox jumps over the lazy dog", text.trim());
Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java Fri Oct 14 15:10:45 2005
@@ -28,9 +28,8 @@
// Nutch imports
import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParserFactory;
import org.apache.nutch.parse.ParseException;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.protocol.Content;
@@ -89,8 +88,7 @@
metadata.setProperty("Content-Length", Long.toString(entry.getSize()));
metadata.setProperty("Content-Type", contentType);
Content content = new Content(newurl, base, b, contentType, metadata);
- Parser parser = ParserFactory.getParser(contentType, newurl);
- Parse parse = parser.getParse(content);
+ Parse parse = ParseUtil.parse(content);
ParseData theParseData = parse.getData();
Outlink[] theOutlinks = theParseData.getOutlinks();
Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java Fri Oct 14 15:10:45 2005
@@ -21,8 +21,7 @@
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.ParserFactory;
-import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.util.ParseUtil;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseException;
@@ -65,9 +64,7 @@
protocol = ProtocolFactory.getProtocol(urlString);
content = protocol.getProtocolOutput(urlString).getContent();
-
- parser = ParserFactory.getParser(content.getContentType(), urlString);
- parse = parser.getParse(content);
+ parse = ParseUtil.parseByParserId("parse-zip",content);
assertTrue(parse.getText().equals(expectedText));
}
}