You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2005/09/27 22:45:48 UTC

svn commit: r292035 - in /lucene/nutch/trunk: conf/ src/java/org/apache/nutch/parse/ src/test/org/apache/nutch/parse/

Author: jerome
Date: Tue Sep 27 13:45:37 2005
New Revision: 292035

URL: http://svn.apache.org/viewcvs?rev=292035&view=rev
Log:
NUTCH-88, First step proposal implementation (thanks to Chris Mattmann and Sébastien Le Callonnec)

Added:
    lucene/nutch/trunk/conf/parse-plugins.dtd   (with props)
    lucene/nutch/trunk/conf/parse-plugins.xml   (with props)
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java   (with props)
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java   (with props)
    lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java   (with props)
Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java

Added: lucene/nutch/trunk/conf/parse-plugins.dtd
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.dtd?rev=292035&view=auto
==============================================================================
--- lucene/nutch/trunk/conf/parse-plugins.dtd (added)
+++ lucene/nutch/trunk/conf/parse-plugins.dtd Tue Sep 27 13:45:37 2005
@@ -0,0 +1,7 @@
+<!ELEMENT parse-plugins (mimeType+)>
+<!ELEMENT mimeType (plugin+)>
+<!ATTLIST mimeType name CDATA #REQUIRED>
+
+<!ELEMENT plugin EMPTY>
+<!ATTLIST plugin id CDATA #REQUIRED>
+<!ATTLIST plugin order CDATA ''>
\ No newline at end of file

Propchange: lucene/nutch/trunk/conf/parse-plugins.dtd
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/conf/parse-plugins.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.xml?rev=292035&view=auto
==============================================================================
--- lucene/nutch/trunk/conf/parse-plugins.xml (added)
+++ lucene/nutch/trunk/conf/parse-plugins.xml Tue Sep 27 13:45:37 2005
@@ -0,0 +1,207 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+	Copyright 2005 The Apache Software Foundation
+	
+	Licensed under the Apache License, Version 2.0 (the "License");
+	you may not use this file except in compliance with the License.
+	You may obtain a copy of the License at
+	
+	http://www.apache.org/licenses/LICENSE-2.0
+	
+	Unless required by applicable law or agreed to in writing, software
+	distributed under the License is distributed on an "AS IS" BASIS,
+	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	See the License for the specific language governing permissions and
+	limitations under the License.
+	
+	Author     : mattmann 
+	Description: This xml file represents a natural ordering for which parsing 
+	plugin should get called for a particular mimeType. 
+-->
+
+<parse-plugins>
+
+	<!--  by default if the mimeType is set to *, or 
+		can't be determined, use parse-text -->
+	<mimeType name="*">
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="application/java">
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="application/msword">
+		<plugin id="parse-msword" />
+	</mimeType>
+
+	<mimeType name="application/pdf">
+		<plugin id="parse-pdf" />
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="application/postscript">
+		<plugin id="parse-pdf" />
+	</mimeType>
+
+	<mimeType name="application/rss+xml">
+		<plugin id="parse-rss" />
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="application/vnd.ms-excel">
+		<plugin id="parse-msexcel" />
+	</mimeType>
+
+	<mimeType name="application/vnd.ms-powerpoint">
+		<plugin id="parse-mspowerpoint" />
+	</mimeType>
+
+	<mimeType name="application/vnd.wap.wbxml">
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="application/vnd.wap.wmlc">
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="application/vnd.wap.wmlscriptc">
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="application/xhtml+xml">
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="application/x-bzip2">
+		<!--  try and parse it with the zip parser -->
+		<plugin id="parse-zip" />
+	</mimeType>
+
+	<mimeType name="application/x-csh">
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="application/x-gzip">
+		<!--  try and parse it with the zip parser -->
+		<plugin id="parse-zip" />
+	</mimeType>
+
+	<mimeType name="application/x-javascript">
+		<plugin id="parse-js" />
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="application/x-kword">
+		<!--  try and parse it with the word parser -->
+		<plugin id="parse-msword" />
+	</mimeType>
+
+	<mimeType name="application/x-kspread">
+		<!--  try and parse it with the msexcel parser -->
+		<plugin id="parse-msexcel" />
+	</mimeType>
+
+	<mimeType name="application/x-latex">
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="application/x-netcdf">
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="application/x-sh">
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="application/x-tcl">
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="application/x-tex">
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="application/x-texinfo">
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="application/x-troff">
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="application/x-troff-man">
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="application/x-troff-me">
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="application/x-troff-ms">
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="application/zip">
+		<plugin id="parse-zip" />
+	</mimeType>
+
+	<mimeType name="message/news">
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="message/rfc822">
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="text/css">
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="text/html">
+		<plugin id="parse-html" />
+	</mimeType>
+
+	<mimeType name="text/plain">
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="text/richtext">
+		<plugin id="parse-rtf" />
+		<plugin id="parse-msword" />
+	</mimeType>
+
+	<mimeType name="text/rtf">
+		<plugin id="parse-rtf" />
+		<plugin id="parse-msword" />
+	</mimeType>
+
+	<mimeType name="text/sgml">
+		<plugin id="parse-html" />
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="text/tab-separated-values">
+		<plugin id="parse-msexcel" />
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="text/vnd.wap.wml">
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="text/vnd.wap.wmlscript">
+		<plugin id="parse-text" />
+	</mimeType>
+
+	<mimeType name="text/xml">
+		<plugin id="parse-text" />
+		<plugin id="parse-html" />
+		<plugin id="parse-rss" />
+	</mimeType>
+
+	<mimeType name="text/x-setext">
+		<plugin id="parse-text" />
+	</mimeType>
+
+</parse-plugins>

Propchange: lucene/nutch/trunk/conf/parse-plugins.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java?rev=292035&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java Tue Sep 27 13:45:37 2005
@@ -0,0 +1,58 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+// JDK imports
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+
+/**
+ * This class represents a natural ordering for which parsing plugin should get
+ * called for a particular mimeType. It provides methods to store the
+ * parse-plugins.xml data, and methods to retrieve the name of the appropriate
+ * parsing plugin for a contentType.
+ *
+ * @author mattmann
+ * @version 1.0
+ */
+public class ParsePluginList {
+  
+  /* a map to link mimeType to an ordered list of parsing plugins */
+  private HashMap fMimeTypeToPluginMap = null;
+  
+  /**
+   * Constructs a new ParsePluginList
+   */
+  public ParsePluginList() {
+    fMimeTypeToPluginMap = new HashMap();
+  }
+  
+  public List getPluginList(String mimeType) {
+    return (List) fMimeTypeToPluginMap.get(mimeType);
+  }
+  
+  public void setPluginList(String mimeType, List l) {
+    fMimeTypeToPluginMap.put(mimeType, l);
+  }
+  
+  public List getSupportedMimeTypes() {
+    return Arrays.asList(fMimeTypeToPluginMap.keySet().toArray(
+            new String[] {}));
+  }
+  
+}

Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java?rev=292035&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java Tue Sep 27 13:45:37 2005
@@ -0,0 +1,202 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+// JDK imports
+import java.util.Iterator;
+import java.util.List;
+import java.util.Vector;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+
+// Nutch imports
+import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
+
+
+/**
+ * A reader to load the information stored in the
+ * <code>$NUTCH_HOME/conf/parse-plugins.xml</code> file.
+ *
+ * @author mattmann
+ * @version 1.0
+ */
+public class ParsePluginsReader {
+  
+  /* our log stream */
+  public static final Logger LOG = 
+          LogFormatter.getLogger(ParsePluginsReader.class.getName());
+  
+  /* name of the parse-plugins file, resolved through NutchConf */
+  private String fParsePluginsFile = "parse-plugins.xml";
+  
+  /**
+   * Constructs a new ParsePluginsReader
+   */
+  public ParsePluginsReader() {
+  }
+  
+  /**
+   * Reads the <code>parse-plugins.xml</code> file and returns the
+   * {@link ParsePluginList} defined by it.
+   *
+   * <p>For each <code>mimeType</code> element, plugins without a usable
+   * <code>order</code> attribute keep the order in which they are read.
+   * A plugin declaring <code>order="n"</code> with n &gt;= 1 is then inserted
+   * at position n (1-based) of that list. A mime type without any plugin is
+   * skipped with a warning.</p>
+   *
+   * @return A {@link ParsePluginList} specified by the
+   *         <code>parse-plugins.xml</code> file, or <code>null</code> if
+   *         the file could not be parsed (the error is logged).
+   */
+  public ParsePluginList parse() {
+    
+    ParsePluginList pList = new ParsePluginList();
+    
+    // open up the XML file
+    Document document = null;
+    java.io.InputStream stream = NutchConf.get()
+                          .getConfResourceAsInputStream(fParsePluginsFile);
+    
+    try {
+      DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+      DocumentBuilder parser = factory.newDocumentBuilder();
+      document = parser.parse(new InputSource(stream));
+    } catch (Exception e) {
+      LOG.log(Level.SEVERE, "Unable to parse [" + fParsePluginsFile + "]", e);
+      return null;
+    } finally {
+      // the DOM is built (or parsing failed): the stream is no longer needed
+      if (stream != null) {
+        try {
+          stream.close();
+        } catch (java.io.IOException ignored) {
+          // nothing useful can be done about a failed close
+        }
+      }
+    }
+    
+    Element parsePlugins = document.getDocumentElement();
+    
+    // get all the mime type nodes
+    NodeList mimeTypes = parsePlugins.getElementsByTagName("mimeType");
+    
+    // iterate through the mime types
+    for (int i = 0; i < mimeTypes.getLength(); i++) {
+      Element mimeType = (Element) mimeTypes.item(i);
+      String mimeTypeStr = mimeType.getAttribute("name");
+      
+      // for each mimeType, get the plugin list
+      NodeList pluginList = mimeType.getElementsByTagName("plugin");
+      
+      if (pluginList != null && pluginList.getLength() > 0) {
+        // now add the plugin list and map it to this mimeType
+        pList.setPluginList(mimeTypeStr, orderPlugins(pluginList));
+      } else {
+        LOG.warning("ParsePluginsReader:ERROR:no plugins defined for mime type: "
+                    + mimeTypeStr + ", continuing parse");
+      }
+    }
+    return pList;
+  }
+  
+  /**
+   * Builds the ordered list of plugin ids of one mime type.
+   *
+   * <p>Plugins whose <code>order</code> attribute is missing, not a number
+   * or lower than 1 are added in the order read. The plugins with an
+   * explicit order are held back and inserted afterwards, in the order read,
+   * at index <code>order - 1</code>. An order pointing past the end of the
+   * list appends the plugin instead of failing.</p>
+   *
+   * @param pluginList the <code>plugin</code> elements of a mime type.
+   * @return the plugin ids (Strings), in preference order.
+   */
+  private static List orderPlugins(NodeList pluginList) {
+    int count = pluginList.getLength();
+    List plugList = new Vector(count);
+    
+    // plugins carrying an explicit order, kept apart until all others are read
+    String[] orderedIds = new String[count];
+    int[] orders = new int[count];
+    int orderedCount = 0;
+    
+    for (int j = 0; j < count; j++) {
+      Element plugin = (Element) pluginList.item(j);
+      String pluginId = plugin.getAttribute("id");
+      
+      int order = -1;
+      try {
+        order = Integer.parseInt(plugin.getAttribute("order"));
+      } catch (NumberFormatException ignored) {
+        // missing or non numeric order: the plugin keeps its read order
+      }
+      
+      if (order >= 1) {
+        orderedIds[orderedCount] = pluginId;
+        orders[orderedCount] = order;
+        orderedCount++;
+      } else {
+        plugList.add(pluginId);
+      }
+    }
+    
+    for (int k = 0; k < orderedCount; k++) {
+      int index = orders[k] - 1;
+      if (index > plugList.size()) {
+        LOG.warning("ParsePluginsReader: order " + orders[k] + " of plugin "
+                    + orderedIds[k] + " is out of range, adding it last");
+        index = plugList.size();
+      }
+      plugList.add(index, orderedIds[k]);
+    }
+    return plugList;
+  }
+  
+  /**
+   * Tests parsing of the parse-plugins.xml file. An alternative name for the
+   * file can be specified via the <code>--file</code> option, although the
+   * file must be located in the <code>$NUTCH_HOME/conf</code> directory.
+   * Prints the usage and returns if <code>--file</code> has no value, and
+   * prints an error and returns if the file cannot be parsed.
+   *
+   * @param args
+   *            Currently only the --file argument to specify an alternative
+   *            name for the parse-plugins.xml file is supported.
+   */
+  public static void main(String[] args) throws Exception {
+    String parsePluginFile = null;
+    String usage = "ParsePluginsReader [--file <parse plugin file location>]";
+    
+    for (int i = 0; i < args.length; i++) {
+      if (args[i].equals("--file")) {
+        if (i + 1 >= args.length) {
+          // --file given without its value
+          System.err.println("Usage: " + usage);
+          return;
+        }
+        parsePluginFile = args[++i];
+      }
+    }
+    
+    ParsePluginsReader reader = new ParsePluginsReader();
+    
+    if (parsePluginFile != null) {
+      reader.setFParsePluginsFile(parsePluginFile);
+    }
+    
+    ParsePluginList prefs = reader.parse();
+    
+    if (prefs == null) {
+      // parse() already logged the cause
+      System.err.println("Unable to parse [" + reader.getFParsePluginsFile()
+                         + "]");
+      return;
+    }
+    
+    for (Iterator i = prefs.getSupportedMimeTypes().iterator(); i.hasNext();) {
+      String mimeType = (String) i.next();
+      
+      System.out.println("MIMETYPE: " + mimeType);
+      List plugList = prefs.getPluginList(mimeType);
+      
+      System.out.println("PLUGINS:");
+      
+      for (Iterator j = plugList.iterator(); j.hasNext();) {
+        System.out.println((String) j.next());
+      }
+    }
+    
+  }
+  
+  /**
+   * @return Returns the fParsePluginsFile.
+   */
+  public String getFParsePluginsFile() {
+    return fParsePluginsFile;
+  }
+  
+  /**
+   * @param parsePluginsFile
+   *            The fParsePluginsFile to set.
+   */
+  public void setFParsePluginsFile(String parsePluginsFile) {
+    fParsePluginsFile = parsePluginsFile;
+  }
+  
+}

Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java?rev=292035&r1=292034&r2=292035&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java Tue Sep 27 13:45:37 2005
@@ -13,36 +13,53 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse;
 
+// JDK imports
 import java.util.Hashtable;
-
-import org.apache.nutch.plugin.*;
-
+import java.util.Iterator;
+import java.util.List;
 import java.util.logging.Logger;
-import org.apache.nutch.util.LogFormatter;
-
-/** Creates and caches {@link Parser} plugins.*/
-public class ParserFactory {
 
-  public static final Logger LOG = LogFormatter
-    .getLogger(ParserFactory.class.getName());
+// Nutch imports
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.ExtensionPoint;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.plugin.PluginRuntimeException;
+import org.apache.nutch.util.LogFormatter;
 
-  private static final ExtensionPoint X_POINT = PluginRepository.getInstance()
-      .getExtensionPoint(Parser.X_POINT_ID);
 
+/** Creates and caches {@link Parser} plugins.*/
+public final class ParserFactory {
+  
+  public static final Logger LOG =
+          LogFormatter.getLogger(ParserFactory.class.getName());
+  
+  public static final String DEFAULT_PLUGIN = "*";
+
+  private static final ExtensionPoint X_POINT =
+          PluginRepository.getInstance().getExtensionPoint(Parser.X_POINT_ID);
+  
+  private static final ParsePluginList PARSE_PLUGIN_LIST =
+          new ParsePluginsReader().parse();
+  
+  
   static {
     if (X_POINT == null) {
       throw new RuntimeException("x point "+Parser.X_POINT_ID+" not found.");
     }
+    if (PARSE_PLUGIN_LIST == null) {
+      throw new RuntimeException("Parse Plugins preferences could not be loaded.");
+    }
   }
-
+  
   private static final Hashtable CACHE = new Hashtable();
-
+  
+  
   private ParserFactory() {}                      // no public ctor
-
-  /** Returns the appropriate {@link Parser} implementation given a content
+  
+  /**
+   * Returns the appropriate {@link Parser} implementation given a content
    * type and url.
    *
    * <p>Parser extensions should define the attributes"contentType" and/or
@@ -53,78 +70,83 @@
    * first plugin whose "pathSuffix" is the empty string is used.
    */
   public static Parser getParser(String contentType, String url)
-    throws ParserNotFound {
-
+  throws ParserNotFound {
+    
     try {
-      Extension extension = getExtension(contentType, getSuffix(url));
-      if (extension == null)
+      Extension extension = getExtension(contentType);
+      if (extension != null) {
+        return (Parser) extension.getExtensionInstance();
+      }
+      // TODO once the MimeTypes is available
+      // extension = getExtension(MimeUtils.map(contentType));
+      // if (extension != null) {
+      //   return (Parser) extension.getExtensionInstance();
+      // }
+      // Last Chance: Guess content-type from file url...
+      // extension = getExtension(MimeUtils.getMimeType(url));
         throw new ParserNotFound(url, contentType);
-
-      return (Parser)extension.getExtensionInstance();
-
     } catch (PluginRuntimeException e) {
       throw new ParserNotFound(url, contentType, e.toString());
     }
   }
-
-  private static String getSuffix(String url) {
-    int i = url.lastIndexOf('.');
-    int j = url.lastIndexOf('/');
-    if (i == -1 || i == url.length()-1 || i < j)
-      return null;
-    return url.substring(i+1);
-  }
-
-
-  private static Extension getExtension(String contentType, String suffix)
-    throws PluginRuntimeException {
-
-    //LOG.fine("getExtension: contentType="+contentType+" suffix="+suffix);
-
-    String key = contentType + "+" + suffix;
-
-    if (CACHE.containsKey(key))
-      return (Extension)CACHE.get(key);
-    
-    Extension extension = findExtension(contentType, suffix);
     
-    CACHE.put(key, extension);
+  /**
+   * Returns the parser extension to use for a content type, looking it up
+   * with <code>findExtension</code> on first use and caching the result.
+   *
+   * @param contentType the content type to parse; <code>null</code> is
+   *        treated as {@link #DEFAULT_PLUGIN}.
+   * @return the matching extension, or <code>null</code> if none is found
+   *         (a miss is not cached).
+   * @throws PluginRuntimeException declared for plugin lookup failures.
+   */
+  protected static Extension getExtension(String contentType)
+  throws PluginRuntimeException {
     
+    // CACHE is a Hashtable and rejects null keys with a NullPointerException:
+    // an undetermined content type is resolved through the default entry.
+    String key = (contentType == null) ? DEFAULT_PLUGIN : contentType;
+    
+    Extension extension = (Extension) CACHE.get(key);
+    if (extension == null) {
+      extension = findExtension(key);
+      // TODO: For null extension, add a fake extension in the CACHE
+      //       in order to avoid trying to find each time
+      //       an unavailable extension
+      if (extension != null) {
+        CACHE.put(key, extension);
+      }
+    }
     return extension;
   }
-
-  private static Extension findExtension(String contentType, String suffix)
-    throws PluginRuntimeException{
-
-    //LOG.fine("findExtension: contentType="+contentType+" suffix="+suffix);
-
+  
+  private static Extension findExtension(String contentType)
+  throws PluginRuntimeException{
+    
     Extension[] extensions = X_POINT.getExtensions();
-
-    // first look for a content-type match
-    if (contentType != null) {
-      for (int i = 0; i < extensions.length; i++) {
-        Extension extension = extensions[i];
-        if (contentType.startsWith(extension.getAttribute("contentType")))
-          return extension;                       // found a match
-      }
+    
+    // Look for a preferred plugin.
+    List parsePluginList = PARSE_PLUGIN_LIST.getPluginList(contentType);
+    Extension extension = matchExtension(parsePluginList, extensions, contentType);
+    if (extension != null) {
+      return extension;
     }
-
-    // next look for a url path suffix match
-    if (suffix != null) {
-      for (int i = 0; i < extensions.length; i++) {
-        Extension extension = extensions[i];
-        if (suffix.equals(extension.getAttribute("pathSuffix")))
-          return extension;                       // found a match
+    
+    // If none found, look for a default plugin.
+    parsePluginList = PARSE_PLUGIN_LIST.getPluginList(DEFAULT_PLUGIN);
+    return matchExtension(parsePluginList, extensions, DEFAULT_PLUGIN);
+  }
+  
+  private static Extension matchExtension(List plugins,
+                                          Extension[] extensions,
+                                          String contentType) {
+    
+    // Preliminary check
+    if (plugins == null) { return null; }
+    
+    Iterator iter = plugins.iterator();
+    while (iter.hasNext()) {
+      String pluginId = (String) iter.next();
+      if (pluginId != null) {
+        for (int i=0; i<extensions.length; i++) {
+          if (match(extensions[i], pluginId, contentType)) {
+            return extensions[i];
+          }
+        }
       }
     }
-
-    // finally, look for an extension that accepts anything
-    for (int i = 0; i < extensions.length; i++) {
-      Extension extension = extensions[i];
-      if ("".equals(extension.getAttribute("pathSuffix"))) // matches all
-        return extension;
-    }
-
     return null;
+  }
+
+  /**
+   * Tells whether an extension is the one designated by a plugin id for a
+   * content type.
+   *
+   * @param extension the candidate extension.
+   * @param id the plugin id the extension's plugin must have.
+   * @param type the content type; it must equal the extension's
+   *        "contentType" attribute, unless it is {@link #DEFAULT_PLUGIN},
+   *        in which case only the plugin id is compared.
+   * @return <code>true</code> if the extension matches.
+   */
+  private static boolean match(Extension extension, String id, String type) {
+    return (id.equals(extension.getDescriptor().getPluginId())) &&
+              (type.equals(extension.getAttribute("contentType")) ||
+              (type.equals(DEFAULT_PLUGIN))); 
   }
 }

Added: lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java?rev=292035&view=auto
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java (added)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java Tue Sep 27 13:45:37 2005
@@ -0,0 +1,53 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+// JUnit imports
+import junit.framework.TestCase;
+
+// Nutch imports
+import org.apache.nutch.plugin.Extension;
+
+
+/**
+ * Unit test for new parse plugin selection.
+ *
+ * @author Sebastien Le Callonnec
+ * @version 1.0
+ */
+public class TestParserFactory extends TestCase {
+  
+  /** Creates the test case with the given JUnit test name. */
+  public TestParserFactory(String name) {
+    super(name);
+  }
+  
+  
+  /**
+   * Checks that <code>getParser(String, String)</code> returns a parser
+   * both for "text/html" and for the content type "foo/bar".
+   */
+  public void testGetParser() throws Exception {
+    final String url = "http://foo.com/";
+    
+    Parser htmlParser = ParserFactory.getParser("text/html", url);
+    assertNotNull(htmlParser);
+    
+    Parser fallbackParser = ParserFactory.getParser("foo/bar", url);
+    assertNotNull(fallbackParser);
+  }
+  
+  /**
+   * Checks that <code>getExtension(String)</code> selects parse-html for
+   * "text/html" and parse-text for the content type "foo/bar".
+   */
+  public void testGetExtension() throws Exception {
+    Extension htmlExt = ParserFactory.getExtension("text/html");
+    String htmlPluginId = htmlExt.getDescriptor().getPluginId();
+    assertEquals("parse-html", htmlPluginId);
+    
+    Extension fallbackExt = ParserFactory.getExtension("foo/bar");
+    String fallbackPluginId = fallbackExt.getDescriptor().getPluginId();
+    assertEquals("parse-text", fallbackPluginId);
+  }
+  
+}

Propchange: lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java
------------------------------------------------------------------------------
    svn:eol-style = native