You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2005/09/27 22:45:48 UTC
svn commit: r292035 - in /lucene/nutch/trunk: conf/
src/java/org/apache/nutch/parse/ src/test/org/apache/nutch/parse/
Author: jerome
Date: Tue Sep 27 13:45:37 2005
New Revision: 292035
URL: http://svn.apache.org/viewcvs?rev=292035&view=rev
Log:
NUTCH-88, First step proposal implementation (thanks to Chris Mattmann and Sébastien Le Callonnec)
Added:
lucene/nutch/trunk/conf/parse-plugins.dtd (with props)
lucene/nutch/trunk/conf/parse-plugins.xml (with props)
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java (with props)
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java (with props)
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java (with props)
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
Added: lucene/nutch/trunk/conf/parse-plugins.dtd
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.dtd?rev=292035&view=auto
==============================================================================
--- lucene/nutch/trunk/conf/parse-plugins.dtd (added)
+++ lucene/nutch/trunk/conf/parse-plugins.dtd Tue Sep 27 13:45:37 2005
@@ -0,0 +1,7 @@
+<!ELEMENT parse-plugins (mimeType+)>
+<!ELEMENT mimeType (plugin+)>
+<!ATTLIST mimeType name CDATA #REQUIRED>
+
+<!ELEMENT plugin EMPTY>
+<!ATTLIST plugin id CDATA #REQUIRED>
+<!ATTLIST plugin order CDATA ''>
\ No newline at end of file
Propchange: lucene/nutch/trunk/conf/parse-plugins.dtd
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/conf/parse-plugins.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.xml?rev=292035&view=auto
==============================================================================
--- lucene/nutch/trunk/conf/parse-plugins.xml (added)
+++ lucene/nutch/trunk/conf/parse-plugins.xml Tue Sep 27 13:45:37 2005
@@ -0,0 +1,207 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2005 The Apache Software Foundation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+ Author : mattmann
+ Description: This xml file represents a natural ordering for which parsing
+ plugin should get called for a particular mimeType.
+-->
+
+<parse-plugins>
+
+ <!-- by default if the mimeType is set to *, or
+ can't be determined, use parse-text -->
+ <mimeType name="*">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/java">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/msword">
+ <plugin id="parse-msword" />
+ </mimeType>
+
+ <mimeType name="application/pdf">
+ <plugin id="parse-pdf" />
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/postscript">
+ <plugin id="parse-pdf" />
+ </mimeType>
+
+ <mimeType name="application/rss+xml">
+ <plugin id="parse-rss" />
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/vnd.ms-excel">
+ <plugin id="parse-msexcel" />
+ </mimeType>
+
+ <mimeType name="application/vnd.ms-powerpoint">
+ <plugin id="parse-mspowerpoint" />
+ </mimeType>
+
+ <mimeType name="application/vnd.wap.wbxml">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/vnd.wap.wmlc">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/vnd.wap.wmlscriptc">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/xhtml+xml">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/x-bzip2">
+ <!-- try and parse it with the zip parser -->
+ <plugin id="parse-zip" />
+ </mimeType>
+
+ <mimeType name="application/x-csh">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/x-gzip">
+ <!-- try and parse it with the zip parser -->
+ <plugin id="parse-zip" />
+ </mimeType>
+
+ <mimeType name="application/x-javascript">
+ <plugin id="parse-js" />
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/x-kword">
+ <!-- try and parse it with the word parser -->
+ <plugin id="parse-msword" />
+ </mimeType>
+
+ <mimeType name="application/x-kspread">
+ <!-- try and parse it with the msexcel parser -->
+ <plugin id="parse-msexcel" />
+ </mimeType>
+
+ <mimeType name="application/x-latex">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/x-netcdf">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/x-sh">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/x-tcl">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/x-tex">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/x-texinfo">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/x-troff">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/x-troff-man">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/x-troff-me">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/x-troff-ms">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="application/zip">
+ <plugin id="parse-zip" />
+ </mimeType>
+
+ <mimeType name="message/news">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="message/rfc822">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="text/css">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="text/html">
+ <plugin id="parse-html" />
+ </mimeType>
+
+ <mimeType name="text/plain">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="text/richtext">
+ <plugin id="parse-rtf" />
+ <plugin id="parse-msword" />
+ </mimeType>
+
+ <mimeType name="text/rtf">
+ <plugin id="parse-rtf" />
+ <plugin id="parse-msword" />
+ </mimeType>
+
+ <mimeType name="text/sgml">
+ <plugin id="parse-html" />
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="text/tab-separated-values">
+ <plugin id="parse-msexcel" />
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="text/vnd.wap.wml">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="text/vnd.wap.wmlscript">
+ <plugin id="parse-text" />
+ </mimeType>
+
+ <mimeType name="text/xml">
+ <plugin id="parse-text" />
+ <plugin id="parse-html" />
+ <plugin id="parse-rss" />
+ </mimeType>
+
+ <mimeType name="text/x-setext">
+ <plugin id="parse-text" />
+ </mimeType>
+
+</parse-plugins>
Propchange: lucene/nutch/trunk/conf/parse-plugins.xml
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java?rev=292035&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java Tue Sep 27 13:45:37 2005
@@ -0,0 +1,58 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+// JDK imports
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+
+/**
+ * This class represents a natural ordering for which parsing plugin should get
+ * called for a particular mimeType. It provides methods to store the
+ * parse-plugins.xml data, and methods to retrieve the name of the appropriate
+ * parsing plugin for a contentType.
+ *
+ * @author mattmann
+ * @version 1.0
+ */
+public class ParsePluginList {
+
+ /* a map to link mimeType to an ordered list of parsing plugins */
+ private HashMap fMimeTypeToPluginMap = null;
+
+ /**
+ * Constructs a new ParsePluginList
+ */
+ public ParsePluginList() {
+ fMimeTypeToPluginMap = new HashMap();
+ }
+
+ public List getPluginList(String mimeType) {
+ return (List) fMimeTypeToPluginMap.get(mimeType);
+ }
+
+ public void setPluginList(String mimeType, List l) {
+ fMimeTypeToPluginMap.put(mimeType, l);
+ }
+
+ public List getSupportedMimeTypes() {
+ return Arrays.asList(fMimeTypeToPluginMap.keySet().toArray(
+ new String[] {}));
+ }
+
+}
Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java?rev=292035&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java Tue Sep 27 13:45:37 2005
@@ -0,0 +1,202 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+// JDK imports
+import java.util.Iterator;
+import java.util.List;
+import java.util.Vector;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+
+// Nutch imports
+import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
+
+
+/**
+ * A reader to load the information stored in the
+ * <code>$NUTCH_HOME/conf/parse-plugins.xml</code> file.
+ *
+ * @author mattmann
+ * @version 1.0
+ */
+public class ParsePluginsReader {
+
+ /* our log stream */
+ public static final Logger LOG =
+ LogFormatter.getLogger(ParsePluginsReader.class.getName());
+
+ /* the parse-plugins file */
+ private String fParsePluginsFile = "parse-plugins.xml";
+
+ /**
+ * Constructs a new ParsePluginsReader
+ */
+ public ParsePluginsReader() {
+ }
+
+ /**
+ * Reads the <code>parse-plugins.xml</code> file and returns the
+ * {@link ParsePluginList} defined by it.
+ *
+ * @return A {@link ParsePluginList} specified by the
+ * <code>parse-plugins.xml</code> file, or <code>null</code>
+ * if the file could not be parsed (the failure is logged at
+ * SEVERE level).
+ */
+ public ParsePluginList parse() {
+
+ ParsePluginList pList = new ParsePluginList();
+
+ // open up the XML file
+ DocumentBuilderFactory factory = null;
+ DocumentBuilder parser = null;
+ Document document = null;
+ InputSource inputSource = null;
+
+ inputSource = new InputSource(NutchConf.get()
+ .getConfResourceAsInputStream(fParsePluginsFile));
+
+ try {
+ factory = DocumentBuilderFactory.newInstance();
+ parser = factory.newDocumentBuilder();
+ document = parser.parse(inputSource);
+ } catch (Exception e) {
+ LOG.log(Level.SEVERE, "Unable to parse [" + fParsePluginsFile + "]", e);
+ return null;
+ }
+
+ Element parsePlugins = document.getDocumentElement();
+
+ // get all the mime type nodes
+
+ NodeList mimeTypes = parsePlugins.getElementsByTagName("mimeType");
+
+ // iterate through the mime types
+ for (int i = 0; i < mimeTypes.getLength(); i++) {
+ Element mimeType = (Element) mimeTypes.item(i);
+ String mimeTypeStr = mimeType.getAttribute("name");
+
+ // for each mimeType, get the plugin list
+ NodeList pluginList = mimeType.getElementsByTagName("plugin");
+
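+ // iterate through the plugins, adding them in the order read;
+ // a plugin carrying a numeric order="" attribute is instead
+ // inserted directly at position (order - 1) of the list being
+ // built. NOTE: that index must not exceed the number of plugins
+ // already added, otherwise List.add(int, Object) throws
+ // IndexOutOfBoundsException.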
+
+ if (pluginList != null && pluginList.getLength() > 0) {
+ List plugList = new Vector(pluginList.getLength());
+
+ for (int j = 0; j < pluginList.getLength(); j++) {
+ Element plugin = (Element) pluginList.item(j);
+ String pluginId = plugin.getAttribute("id");
+
+ String orderStr = plugin.getAttribute("order");
+ int order = -1;
+
+ try {
+ order = Integer.parseInt(orderStr);
+ } catch (NumberFormatException ignore) {
+ }
+
+ if (order != -1) {
+ plugList.add(order - 1, pluginId);
+ } else {
+ plugList.add(pluginId);
+ }
+ }
+
+ // now add the plugin list and map it to this mimeType
+ pList.setPluginList(mimeTypeStr, plugList);
+
+ } else {
+ LOG.warning("ParsePluginsReader:ERROR:no plugins defined for mime type: "
+ + mimeTypeStr + ", continuing parse");
+ }
+ }
+ return pList;
+ }
+
+ /**
+ * Tests parsing of the parse-plugins.xml file. An alternative name for the
+ * file can be specified via the <code>--file</code> option, although the
+ * file must be located in the <code>$NUTCH_HOME/conf</code> directory.
+ *
+ * @param args
+ * Currently only the --file argument to specify an alternative
+ * name for the parse-plugins.xml file is supported.
+ */
+ public static void main(String[] args) throws Exception {
+ String parsePluginFile = null;
+ String usage = "ParsePluginsReader [--file <parse plugin file location>]";
+
+ for (int i = 0; i < args.length; i++) {
+ if (args[i].equals("--file")) {
+ parsePluginFile = args[++i];
+ }
+ }
+
+ ParsePluginsReader reader = new ParsePluginsReader();
+
+ if (parsePluginFile != null) {
+ reader.setFParsePluginsFile(parsePluginFile);
+ }
+
+ ParsePluginList prefs = reader.parse();
+
+ for (Iterator i = prefs.getSupportedMimeTypes().iterator(); i.hasNext();) {
+ String mimeType = (String) i.next();
+
+ System.out.println("MIMETYPE: " + mimeType);
+ List plugList = prefs.getPluginList(mimeType);
+
+ System.out.println("PLUGINS:");
+
+ for (Iterator j = plugList.iterator(); j.hasNext();) {
+ System.out.println((String) j.next());
+ }
+ }
+
+ }
+
+ /**
+ * @return Returns the fParsePluginsFile.
+ */
+ public String getFParsePluginsFile() {
+ return fParsePluginsFile;
+ }
+
+ /**
+ * @param parsePluginsFile
+ * The fParsePluginsFile to set.
+ */
+ public void setFParsePluginsFile(String parsePluginsFile) {
+ fParsePluginsFile = parsePluginsFile;
+ }
+
+}
Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java?rev=292035&r1=292034&r2=292035&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java Tue Sep 27 13:45:37 2005
@@ -13,36 +13,53 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
package org.apache.nutch.parse;
+// JDK imports
import java.util.Hashtable;
-
-import org.apache.nutch.plugin.*;
-
+import java.util.Iterator;
+import java.util.List;
import java.util.logging.Logger;
-import org.apache.nutch.util.LogFormatter;
-
-/** Creates and caches {@link Parser} plugins.*/
-public class ParserFactory {
- public static final Logger LOG = LogFormatter
- .getLogger(ParserFactory.class.getName());
+// Nutch imports
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.ExtensionPoint;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.plugin.PluginRuntimeException;
+import org.apache.nutch.util.LogFormatter;
- private static final ExtensionPoint X_POINT = PluginRepository.getInstance()
- .getExtensionPoint(Parser.X_POINT_ID);
+/** Creates and caches {@link Parser} plugins.*/
+public final class ParserFactory {
+
+ public static final Logger LOG =
+ LogFormatter.getLogger(ParserFactory.class.getName());
+
+ public static final String DEFAULT_PLUGIN = "*";
+
+ private static final ExtensionPoint X_POINT =
+ PluginRepository.getInstance().getExtensionPoint(Parser.X_POINT_ID);
+
+ private static final ParsePluginList PARSE_PLUGIN_LIST =
+ new ParsePluginsReader().parse();
+
+
static {
if (X_POINT == null) {
throw new RuntimeException("x point "+Parser.X_POINT_ID+" not found.");
}
+ if (PARSE_PLUGIN_LIST == null) {
+ throw new RuntimeException("Parse Plugins preferences could not be loaded.");
+ }
}
-
+
private static final Hashtable CACHE = new Hashtable();
-
+
+
private ParserFactory() {} // no public ctor
-
- /** Returns the appropriate {@link Parser} implementation given a content
+
+ /**
+ * Returns the appropriate {@link Parser} implementation given a content
* type and url.
*
* <p>Parser extensions should define the attributes"contentType" and/or
@@ -53,78 +70,83 @@
* first plugin whose "pathSuffix" is the empty string is used.
*/
public static Parser getParser(String contentType, String url)
- throws ParserNotFound {
-
+ throws ParserNotFound {
+
try {
- Extension extension = getExtension(contentType, getSuffix(url));
- if (extension == null)
+ Extension extension = getExtension(contentType);
+ if (extension != null) {
+ return (Parser) extension.getExtensionInstance();
+ }
+ // TODO once the MimeTypes is available
+ // extension = getExtension(MimeUtils.map(contentType));
+ // if (extension != null) {
+ // return (Parser) extension.getExtensionInstance();
+ // }
+ // Last Chance: Guess content-type from file url...
+ // extension = getExtension(MimeUtils.getMimeType(url));
throw new ParserNotFound(url, contentType);
-
- return (Parser)extension.getExtensionInstance();
-
} catch (PluginRuntimeException e) {
throw new ParserNotFound(url, contentType, e.toString());
}
}
-
- private static String getSuffix(String url) {
- int i = url.lastIndexOf('.');
- int j = url.lastIndexOf('/');
- if (i == -1 || i == url.length()-1 || i < j)
- return null;
- return url.substring(i+1);
- }
-
-
- private static Extension getExtension(String contentType, String suffix)
- throws PluginRuntimeException {
-
- //LOG.fine("getExtension: contentType="+contentType+" suffix="+suffix);
-
- String key = contentType + "+" + suffix;
-
- if (CACHE.containsKey(key))
- return (Extension)CACHE.get(key);
-
- Extension extension = findExtension(contentType, suffix);
- CACHE.put(key, extension);
+ protected static Extension getExtension(String contentType)
+ throws PluginRuntimeException {
+ Extension extension = (Extension) CACHE.get(contentType);
+ if (extension == null) {
+ extension = findExtension(contentType);
+ // TODO: For null extension, add a fake extension in the CACHE
+ // in order to avoid trying to find each time
+ // an unavailable extension
+ if (extension != null) {
+ CACHE.put(contentType, extension);
+ }
+ }
return extension;
}
-
- private static Extension findExtension(String contentType, String suffix)
- throws PluginRuntimeException{
-
- //LOG.fine("findExtension: contentType="+contentType+" suffix="+suffix);
-
+
+ private static Extension findExtension(String contentType)
+ throws PluginRuntimeException{
+
Extension[] extensions = X_POINT.getExtensions();
-
- // first look for a content-type match
- if (contentType != null) {
- for (int i = 0; i < extensions.length; i++) {
- Extension extension = extensions[i];
- if (contentType.startsWith(extension.getAttribute("contentType")))
- return extension; // found a match
- }
+
+ // Look for a preferred plugin.
+ List parsePluginList = PARSE_PLUGIN_LIST.getPluginList(contentType);
+ Extension extension = matchExtension(parsePluginList, extensions, contentType);
+ if (extension != null) {
+ return extension;
}
-
- // next look for a url path suffix match
- if (suffix != null) {
- for (int i = 0; i < extensions.length; i++) {
- Extension extension = extensions[i];
- if (suffix.equals(extension.getAttribute("pathSuffix")))
- return extension; // found a match
+
+ // If none found, look for a default plugin.
+ parsePluginList = PARSE_PLUGIN_LIST.getPluginList(DEFAULT_PLUGIN);
+ return matchExtension(parsePluginList, extensions, DEFAULT_PLUGIN);
+ }
+
+ private static Extension matchExtension(List plugins,
+ Extension[] extensions,
+ String contentType) {
+
+ // Preliminary check
+ if (plugins == null) { return null; }
+
+ Iterator iter = plugins.iterator();
+ while (iter.hasNext()) {
+ String pluginId = (String) iter.next();
+ if (pluginId != null) {
+ for (int i=0; i<extensions.length; i++) {
+ if (match(extensions[i], pluginId, contentType)) {
+ return extensions[i];
+ }
+ }
}
}
-
- // finally, look for an extension that accepts anything
- for (int i = 0; i < extensions.length; i++) {
- Extension extension = extensions[i];
- if ("".equals(extension.getAttribute("pathSuffix"))) // matches all
- return extension;
- }
-
return null;
+ }
+
+ private static boolean match(Extension extension, String id, String type) {
+ return (id.equals(extension.getDescriptor().getPluginId())) &&
+ (type.equals(extension.getAttribute("contentType")) ||
+ (type.equals(DEFAULT_PLUGIN)));
}
}
Added: lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java?rev=292035&view=auto
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java (added)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java Tue Sep 27 13:45:37 2005
@@ -0,0 +1,53 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+// JUnit imports
+import junit.framework.TestCase;
+
+// Nutch imports
+import org.apache.nutch.plugin.Extension;
+
+
+/**
+ * Unit test for new parse plugin selection.
+ *
+ * @author Sebastien Le Callonnec
+ * @version 1.0
+ */
+public class TestParserFactory extends TestCase {
+
+ public TestParserFactory(String name) { super(name); }
+
+
+ /** Unit test for <code>getParser(String, String)</code> method. */
+ public void testGetParser() throws Exception {
+ Parser parser = ParserFactory.getParser("text/html", "http://foo.com/");
+ assertNotNull(parser);
+ parser = ParserFactory.getParser("foo/bar", "http://foo.com/");
+ assertNotNull(parser);
+ }
+
+ /** Unit test for <code>getExtension(String)</code> method. */
+ public void testGetExtension() throws Exception {
+ Extension ext = ParserFactory.getExtension("text/html");
+ assertEquals("parse-html", ext.getDescriptor().getPluginId());
+ ext = ParserFactory.getExtension("foo/bar");
+ assertEquals("parse-text", ext.getDescriptor().getPluginId());
+ }
+
+}
Propchange: lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java
------------------------------------------------------------------------------
svn:eol-style = native