You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2011/10/12 20:18:48 UTC

svn commit: r1182504 - in /nutch/branches/nutchgora: CHANGES.txt src/java/org/apache/nutch/parse/ParserFactory.java src/plugin/parse-html/plugin.xml

Author: lewismc
Date: Wed Oct 12 18:18:48 2011
New Revision: 1182504

URL: http://svn.apache.org/viewvc?rev=1182504&view=rev
Log:
commit to address NUTCH-1097 and update to changes.txt

Modified:
    nutch/branches/nutchgora/CHANGES.txt
    nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserFactory.java
    nutch/branches/nutchgora/src/plugin/parse-html/plugin.xml

Modified: nutch/branches/nutchgora/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/CHANGES.txt?rev=1182504&r1=1182503&r2=1182504&view=diff
==============================================================================
--- nutch/branches/nutchgora/CHANGES.txt (original)
+++ nutch/branches/nutchgora/CHANGES.txt Wed Oct 12 18:18:48 2011
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release nutchgora - Current Development
 
+* NUTCH-1097 application/xhtml+xml should be enabled in plugin.xml of parse-html; allow multiple mimetypes for plugin.xml (Ferdy via lewismc)
+
 * Change plugin source directory "languageidentifier" to "language-identifier" (lewismc)
 
 * NUTCH-1132, 1133 & 1134 Fix TestGenerator, TestInjector & TestFetcher respectively (lewismc)

Modified: nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserFactory.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserFactory.java?rev=1182504&r1=1182503&r2=1182504&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserFactory.java (original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserFactory.java Wed Oct 12 18:18:48 2011
@@ -356,14 +356,13 @@ public final class ParserFactory {
       // NotMappedParserException
 
       for (int i=0; i<extensions.length; i++) {
-        if (extensions[i].getAttribute("contentType") != null
-            && extensions[i].getAttribute("contentType").equals(
-                contentType)) {
-          extList.add(extensions[i]);
-        }
-        else if ("*".equals(extensions[i].getAttribute("contentType"))){
+      	if ("*".equals(extensions[i].getAttribute("contentType"))){
           extList.add(0, extensions[i]);
         }
+      	else if (extensions[i].getAttribute("contentType") != null
+            && contentType.matches(escapeContentType(extensions[i].getAttribute("contentType")))) {
+          extList.add(extensions[i]);
+        }
       }
 
       if (extList.size() > 0) {
@@ -391,10 +390,19 @@ public final class ParserFactory {
     return (extList.size() > 0) ? extList : null;
   }
 
-  private boolean match(Extension extension, String id, String type) {
-    return ((id.equals(extension.getId())) &&
-            (type.equals(extension.getAttribute("contentType")) || extension.getAttribute("contentType").equals("*") ||
-             type.equals(DEFAULT_PLUGIN)));
+  private String escapeContentType(String contentType) {
+  	// Escapes '+' and '.' in contentType so the value can be used as a regex
+  	// while types such as "application/xhtml+xml" still match literally.
+  	// '|' is left unescaped, so "a|b" lets one parser accept several types.
+  	return contentType.replace("+", "\\+").replace(".", "\\.");
+	}
+
+
+	private boolean match(Extension extension, String id, String type) {
+    return (id.equals(extension.getId())) && // id must match, plus any one of the conditions below
+            (extension.getAttribute("contentType").equals("*") || // "*" accepts any type; NOTE(review): NPE if the attribute is absent - confirm it is always set
+             type.matches(escapeContentType(extension.getAttribute("contentType"))) || // escaped attribute is applied as a regex, so "a|b" accepts either type
+             type.equals(DEFAULT_PLUGIN)); // DEFAULT_PLUGIN is declared outside this hunk; presumably a catch-all marker - verify
   }
 
   /** Get an extension from its id and supported content-type. */

Modified: nutch/branches/nutchgora/src/plugin/parse-html/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/plugin/parse-html/plugin.xml?rev=1182504&r1=1182503&r2=1182504&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/plugin/parse-html/plugin.xml (original)
+++ nutch/branches/nutchgora/src/plugin/parse-html/plugin.xml Wed Oct 12 18:18:48 2011
@@ -39,7 +39,7 @@
 
       <implementation id="org.apache.nutch.parse.html.HtmlParser"
                       class="org.apache.nutch.parse.html.HtmlParser">
-        <parameter name="contentType" value="text/html"/>
+        <parameter name="contentType" value="text/html|application/xhtml+xml"/>
         <parameter name="pathSuffix" value=""/>
       </implementation>