You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2005/10/15 00:11:19 UTC

svn commit: r321231 - in /lucene/nutch/trunk/src: java/org/apache/nutch/fetcher/ java/org/apache/nutch/parse/ java/org/apache/nutch/tools/ plugin/creativecommons/src/test/org/creativecommons/nutch/ plugin/parse-ext/src/test/org/apache/nutch/parse/ext/ ...

Author: jerome
Date: Fri Oct 14 15:10:45 2005
New Revision: 321231

URL: http://svn.apache.org/viewcvs?rev=321231&view=rev
Log:
NUTCH-88, Final step implementation:
* Add a parse utility that loops over the ordered list of parsers defined for a content-type (until a parser returns a Parse object).
* Add a parse utility that returns a Parse object using a specified parser (mainly used in unit tests).
* Make use of this utility in classes that need to parse some content.

Added:
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java   (with props)
Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserNotFound.java
    lucene/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java
    lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
    lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
    lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java
    lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
    lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
    lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java
    lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java
    lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java
    lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
    lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Fri Oct 14 15:10:45 2005
@@ -246,12 +246,10 @@
         return null;
       }
       String contentType = content.getContentType();
-      Parser parser = null;
       Parse parse = null;
       ParseStatus status = null;
       try {
-        parser = ParserFactory.getParser(contentType, url);
-        parse = parser.getParse(content);
+        parse = ParseUtil.parse(content);
         status = parse.getData().getStatus();
       } catch (Exception e) {
         e.printStackTrace();

Added: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java?rev=321231&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java Fri Oct 14 15:10:45 2005
@@ -0,0 +1,120 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+// JDK imports
+import java.util.logging.Logger;
+
+// Nutch Imports
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.LogFormatter;
+
+
+/**
+ * A Utility class containing methods to simply perform parsing utilities such
+ * as iterating through a preferred list of {@link Parser}s to obtain
+ * {@link Parse} objects.
+ *
+ * @author mattmann
+ * @author Jérôme Charron
+ * @author Sébastien Le Callonnec
+ */
+public class ParseUtil {
+  
+  /* Logger used by the static parse helpers of this class. */
+  public static final Logger LOG = LogFormatter.getLogger(ParseUtil.class
+          .getName());
+  
+  /** No public constructor: this class only exposes static helpers. */
+  private ParseUtil() { }
+  
+  /**
+   * Performs a parse by iterating through the preferred {@link Parser}s for
+   * the content's type until one returns a non-null {@link Parse} whose status
+   * is successful. If none succeeds, a message is logged at the
+   * <code>WARNING</code> level, and an empty parse is returned.
+   *
+   * @param content The content to try and parse.
+   * @return The first successful {@link Parse}, or an empty parse otherwise.
+   * @throws ParseException If no suitable parser is found to perform the parse.
+   */
+  public final static Parse parse(Content content) throws ParseException {
+    Parser[] parsers = null;
+    // NOTE(review): the 2nd argument is an empty string; presumably a URL - confirm.
+    try {
+      parsers = ParserFactory.getParsers(content.getContentType(), "");
+    } catch (ParserNotFound e) {
+      LOG.warning("No suitable parser found when trying to parse content " +
+                  content);
+      throw new ParseException(e.getMessage());
+    }
+    
+    Parse parse = null;
+    for (int i=0; i<parsers.length; i++) {
+      parse = parsers[i].getParse(content);
+      if ((parse != null) && (parse.getData().getStatus().isSuccess())) {
+        return parse;
+      }
+    }
+    // No parser produced a successful parse: log it and return an empty parse.
+    LOG.warning("Unable to successfully parse content " + content.getUrl() +
+                " of type " + content.getContentType());
+
+    return new ParseStatus().getEmptyParse();
+  }
+  
+  /**
+   * Method parses a {@link Content} object using the {@link Parser} specified
+   * by the parameter <code>parserId</code>. If a suitable {@link Parser} is not
+   * found, then a <code>WARNING</code> level message is logged, and a
+   * ParseException is thrown.
+   * If the parse is unsuccessful for any other reason, then a <code>WARNING</code>
+   * level message is logged, and a <code>ParseStatus.getEmptyParse()</code> is
+   * returned.
+   *
+   * @param parserId The ID of the {@link Parser} to use to parse the specified
+   *                 content.
+   * @param content The content to parse.
+   * @return A {@link Parse} object if the parse is successful, otherwise,
+   *         a <code>ParseStatus.getEmptyParse()</code>.
+   * @throws ParseException If there is no suitable {@link Parser} found
+   *                        to perform the parse.
+   */
+  public final static Parse parseByParserId(String parserId, Content content)
+  throws ParseException {
+    Parse parse = null;
+    Parser p = null;
+    
+    try {
+      p = ParserFactory.getParserById(parserId);
+    } catch (ParserNotFound e) {
+      LOG.warning("No suitable parser found when trying to parse content " +
+                  content);
+      throw new ParseException(e.getMessage());
+    }
+    
+    parse = p.getParse(content);
+    
+    if (parse != null && parse.getData().getStatus().isSuccess()) {
+      return parse;
+    } else {
+      LOG.warning("Unable to successfully parse content " + content.getUrl() +
+                  " of type " + content.getContentType());
+      return new ParseStatus().getEmptyParse();
+    }
+  }
+  
+}

Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Fri Oct 14 15:10:45 2005
@@ -18,6 +18,8 @@
 
 import org.apache.nutch.util.LogFormatter;
 
+import org.apache.nutch.parse.ParseUtil;
+
 import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.Content;
@@ -83,8 +85,7 @@
     LOG.info("parsing: "+url);
     LOG.info("contentType: "+contentType);
 
-    Parser parser = ParserFactory.getParser(contentType, url);
-    Parse parse = parser.getParse(content);
+    Parse parse = ParseUtil.parse(content);
 
     System.out.print("---------\nParseData\n---------\n");
     System.out.print(parse.getData().toString());

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java Fri Oct 14 15:10:45 2005
@@ -174,6 +174,54 @@
   }
   
   /**
+   * <p>Function returns a {@link Parser} instance with the specified <code>parserId</code>.
+   * If the Parser instance isn't found, then the function throws a <code>ParserNotFound</code>
+   * exception. If the function is able to find the {@link Parser} in the internal <code>PARSER_CACHE</code>
+   * then it will return the already instantiated Parser. Otherwise, if it has to instantiate the Parser
+   * itself, then this function will cache that Parser in the internal <code>PARSER_CACHE</code>.</p>
+   * 
+   * @param parserId The string ID (e.g., "parse-text", "parse-msword") of the {@link Parser} implementation to return.
+   * @return A {@link Parser} implementation specified by the parameter <code>parserId</code>.
+   * @throws ParserNotFound If the Parser is not found (i.e., not registered with the extension point), or if there is a {@link PluginRuntimeException}
+   * instantiating the {@link Parser}.
+   */
+  public static Parser getParserById(String parserId) throws ParserNotFound{
+      // First check the cache for an already instantiated Parser.
+      
+      if(PARSER_CACHE.get(parserId) != null){
+          return (Parser)PARSER_CACHE.get(parserId);
+      }
+      else{
+          // Cache miss: get the list of registered parsing extensions,
+          // then find the right one by Id.
+          
+          Extension[] extensions = X_POINT.getExtensions();
+          Extension parserExt = getExtensionById(extensions,parserId);
+          
+          if (parserExt == null) {
+                throw new ParserNotFound("No Parser Found for parserId: "
+                        + parserId + "!");
+            } else {
+                // Instantiate the Parser and cache it under its id.
+                try {
+                    Parser p = null;
+                    p = (Parser) parserExt.getExtensionInstance();
+                    PARSER_CACHE
+                            .put(parserId, p);
+                    return p;
+                } catch (PluginRuntimeException e) {
+                    LOG.warning("ParserFactory:PluginRuntimeException when "
+                            + "initializing parser plugin "
+                            + parserExt.getDescriptor().getPluginId()
+                            + " instance in getParserById");
+                    throw new ParserNotFound("No Parser Found for parserId: "
+                            + parserId + "!");
+                }
+            }
+      }
+  }
+  
+  /**
    * finds the best-suited parse plugin for a given contentType.
    *
    * @param contentType Content-Type for which we seek a parse plugin.

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserNotFound.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserNotFound.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserNotFound.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserNotFound.java Fri Oct 14 15:10:45 2005
@@ -13,15 +13,18 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse;
 
-import java.io.IOException;
-
 public class ParserNotFound extends ParseException {
+
+  private static final long serialVersionUID=23993993939L;
   private String url;
   private String contentType;
 
+  public ParserNotFound(String message){
+    super(message);    
+  }
+  
   public ParserNotFound(String url, String contentType) {
     this(url, contentType,
          "parser not found for contentType="+contentType+" url="+url);

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java Fri Oct 14 15:10:45 2005
@@ -230,8 +230,7 @@
           return;
         }
 
-        Parser parser = ParserFactory.getParser(contentType, url);
-        Parse parse = parser.getParse(content);
+        Parse parse = ParseUtil.parse(content);
         outputPage(new ParseText(parse.getText()), parse.getData());
         
       } else {
@@ -585,7 +584,7 @@
 
       parseSegment.setLogLevel
         (Level.parse((new String(logLevel)).toUpperCase()));
-
+      
       if (threadCount != -1)
         parseSegment.setThreadCount(threadCount);
       if (showThreadID)

Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java Fri Oct 14 15:10:45 2005
@@ -16,7 +16,8 @@
 
 package org.creativecommons.nutch;
 
-import org.apache.nutch.parse.*;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.protocol.Content;
 
 import java.util.Properties;
@@ -54,10 +55,9 @@
     in.close();
     byte[] bytes = out.toByteArray();
 
-    Parser parser = ParserFactory.getParser(contentType, url);
     Content content =
       new Content(url, url, bytes, contentType, new Properties());
-    Parse parse = parser.getParse(content);
+    Parse parse = ParseUtil.parseByParserId("parse-html",content);
 
     Properties metadata = parse.getData().getMetadata();
     assertEquals(license, metadata.get("License-Url"));

Modified: lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java Fri Oct 14 15:10:45 2005
@@ -21,8 +21,7 @@
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.ProtocolException;
 
-import org.apache.nutch.parse.ParserFactory;
-import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseException;
 
@@ -46,8 +45,7 @@
 public class TestExtParser extends TestCase {
   private File tempFile = null;
   private String urlString = null;
-  private Content content = null;;
-  private Parser parser = null;;
+  private Content content = null;
   private Parse parse = null;
 
   private String expectedText = "nutch rocks nutch rocks nutch rocks";
@@ -107,15 +105,13 @@
       // check external parser that does 'cat'
       contentType = "application/vnd.nutch.example.cat";
       content.setContentType(contentType);
-      parser = ParserFactory.getParser(contentType, urlString);
-      parse = parser.getParse(content);
+      parse = ParseUtil.parse(content);
       assertEquals(expectedText,parse.getText());
 
       // check external parser that does 'md5sum'
       contentType = "application/vnd.nutch.example.md5sum";
       content.setContentType(contentType);
-      parser = ParserFactory.getParser(contentType, urlString);
-      parse = parser.getParse(content);
+      parse = ParseUtil.parse(content);
       assertTrue(parse.getText().startsWith(expectedMD5sum));
     }
   }

Modified: lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java Fri Oct 14 15:10:45 2005
@@ -19,7 +19,7 @@
 import junit.framework.TestCase;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.parse.ParserFactory;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
@@ -60,15 +60,13 @@
     String urlString;
     Protocol protocol;
     Content content;
-    Parser parser;
     Parse parse;
 
     urlString = "file:" + sampleDir + fileSeparator + id3v2;
     protocol = ProtocolFactory.getProtocol(urlString);
     content = protocol.getContent(urlString);
 
-    parser = ParserFactory.getParser(content.getContentType(), urlString);
-    parse = parser.getParse(content);
+    parse = ParseUtil.parseByParserId("parse-mp3",content);
     Properties metadata = parse.getData().getMetadata();
     assertEquals("postgresql comment id3v2", metadata.getProperty("COMM-Text"));
     assertEquals("postgresql composer id3v2", metadata.getProperty("TCOM-Text"));

Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java Fri Oct 14 15:10:45 2005
@@ -29,8 +29,7 @@
 
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.Parser;
-import org.apache.nutch.parse.ParserFactory;
+import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolFactory;
@@ -123,9 +122,7 @@
    */
   public void testContent() throws Exception {
 
-    Parser parser = ParserFactory.getParser(this.content.getContentType(),
-        this.urlString);
-    Parse parse = parser.getParse(this.content);
+    Parse parse = ParseUtil.parseByParserId("parse-mspowerpoint",this.content);
 
     ParseData data = parse.getData();
     String text = parse.getText();
@@ -162,10 +159,8 @@
    */
   public void testMeta() throws Exception {
 
-    Parser parser = ParserFactory.getParser(this.content.getContentType(),
-        this.urlString);
-    Parse parse = parser.getParse(this.content);
-
+    Parse parse = ParseUtil.parseByParserId("parse-mspowerpoint",content);
+    
     ParseData data = parse.getData();
 
     final FileExtensionFilter titleFilter = new FileExtensionFilter(

Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java Fri Oct 14 15:10:45 2005
@@ -21,8 +21,7 @@
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.ProtocolException;
 
-import org.apache.nutch.parse.ParserFactory;
-import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseException;
 
@@ -65,9 +64,7 @@
 
       protocol = ProtocolFactory.getProtocol(urlString);
       content = protocol.getProtocolOutput(urlString).getContent();
-
-      parser = ParserFactory.getParser(content.getContentType(), urlString);
-      parse = parser.getParse(content);
+      parse = ParseUtil.parseByParserId("parse-msword",content);
 
       assertTrue(parse.getText().startsWith(expectedText));
     }

Modified: lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java Fri Oct 14 15:10:45 2005
@@ -21,8 +21,7 @@
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.ProtocolException;
 
-import org.apache.nutch.parse.ParserFactory;
-import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseException;
 
@@ -65,9 +64,7 @@
 
       protocol = ProtocolFactory.getProtocol(urlString);
       content = protocol.getProtocolOutput(urlString).getContent();
-
-      parser = ParserFactory.getParser(content.getContentType(), urlString);
-      parse = parser.getParse(content);
+      parse = ParseUtil.parseByParserId("parse-pdf",content);
 
       int index = parse.getText().indexOf(expectedText);
       assertTrue(index > 0);

Modified: lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java Fri Oct 14 15:10:45 2005
@@ -21,8 +21,7 @@
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.ProtocolException;
 
-import org.apache.nutch.parse.ParserFactory;
-import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseException;
 import org.apache.nutch.parse.ParseData;
@@ -75,7 +74,6 @@
         String urlString;
         Protocol protocol;
         Content content;
-        Parser parser;
         Parse parse;
 
         for (int i = 0; i < sampleFiles.length; i++) {
@@ -83,10 +81,7 @@
 
             protocol = ProtocolFactory.getProtocol(urlString);
             content = protocol.getProtocolOutput(urlString).getContent();
-
-            parser = ParserFactory.getParser(content.getContentType(),
-                    urlString);
-            parse = parser.getParse(content);
+            parse = ParseUtil.parseByParserId("parse-rss",content);
 
             //check that there are 3 outlinks:
             //http://test.channel.com

Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java Fri Oct 14 15:10:45 2005
@@ -18,8 +18,8 @@
 
 import junit.framework.TestCase;
 import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.parse.Parser;
 import org.apache.nutch.parse.ParserFactory;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
@@ -58,15 +58,13 @@
     String urlString;
     Protocol protocol;
     Content content;
-    Parser parser;
     Parse parse;
 
     urlString = "file:" + sampleDir + fileSeparator + rtfFile;
     protocol = ProtocolFactory.getProtocol(urlString);
     content = protocol.getContent(urlString);
 
-    parser = ParserFactory.getParser(content.getContentType(), urlString);
-    parse = parser.getParse(content);
+    parse = ParseUtil.parseByParserId("parse-rtf",content);
     String text = parse.getText();
     assertEquals("The quick brown fox jumps over the lazy dog", text.trim());
 

Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java Fri Oct 14 15:10:45 2005
@@ -28,9 +28,8 @@
 
 // Nutch imports
 import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParserFactory;
 import org.apache.nutch.parse.ParseException;
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.protocol.Content;
@@ -89,8 +88,7 @@
             metadata.setProperty("Content-Length", Long.toString(entry.getSize()));
             metadata.setProperty("Content-Type", contentType);
             Content content = new Content(newurl, base, b, contentType, metadata);
-            Parser parser = ParserFactory.getParser(contentType, newurl);
-            Parse parse = parser.getParse(content);
+            Parse parse = ParseUtil.parse(content);
             ParseData theParseData = parse.getData();
             Outlink[] theOutlinks = theParseData.getOutlinks();
             

Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java?rev=321231&r1=321230&r2=321231&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java Fri Oct 14 15:10:45 2005
@@ -21,8 +21,7 @@
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.ProtocolException;
 
-import org.apache.nutch.parse.ParserFactory;
-import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseException;
 
@@ -65,9 +64,7 @@
 
       protocol = ProtocolFactory.getProtocol(urlString);
       content = protocol.getProtocolOutput(urlString).getContent();
-
-      parser = ParserFactory.getParser(content.getContentType(), urlString);
-      parse = parser.getParse(content);
+      parse = ParseUtil.parseByParserId("parse-zip",content);
       assertTrue(parse.getText().equals(expectedText));
     }
   }