You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2011/10/12 20:18:48 UTC
svn commit: r1182504 - in /nutch/branches/nutchgora: CHANGES.txt
src/java/org/apache/nutch/parse/ParserFactory.java
src/plugin/parse-html/plugin.xml
Author: lewismc
Date: Wed Oct 12 18:18:48 2011
New Revision: 1182504
URL: http://svn.apache.org/viewvc?rev=1182504&view=rev
Log:
Commit to address NUTCH-1097 and update CHANGES.txt
Modified:
nutch/branches/nutchgora/CHANGES.txt
nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserFactory.java
nutch/branches/nutchgora/src/plugin/parse-html/plugin.xml
Modified: nutch/branches/nutchgora/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/CHANGES.txt?rev=1182504&r1=1182503&r2=1182504&view=diff
==============================================================================
--- nutch/branches/nutchgora/CHANGES.txt (original)
+++ nutch/branches/nutchgora/CHANGES.txt Wed Oct 12 18:18:48 2011
@@ -2,6 +2,8 @@ Nutch Change Log
Release nutchgora - Current Development
+* NUTCH-1097 application/xhtml+xml should be enabled in plugin.xml of parse-html; allow multiple mimetypes for plugin.xml (Ferdy via lewismc)
+
* Change plugin source directory "languageidentifier" to "language-identifier" (lewismc)
* NUTCH-1132, 1133 & 1134 Fix TestGenerator, TestInjector & TestFetcher respectively (lewismc)
Modified: nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserFactory.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserFactory.java?rev=1182504&r1=1182503&r2=1182504&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserFactory.java (original)
+++ nutch/branches/nutchgora/src/java/org/apache/nutch/parse/ParserFactory.java Wed Oct 12 18:18:48 2011
@@ -356,14 +356,13 @@ public final class ParserFactory {
// NotMappedParserException
for (int i=0; i<extensions.length; i++) {
- if (extensions[i].getAttribute("contentType") != null
- && extensions[i].getAttribute("contentType").equals(
- contentType)) {
- extList.add(extensions[i]);
- }
- else if ("*".equals(extensions[i].getAttribute("contentType"))){
+ if ("*".equals(extensions[i].getAttribute("contentType"))){
extList.add(0, extensions[i]);
}
+ else if (extensions[i].getAttribute("contentType") != null
+ && contentType.matches(escapeContentType(extensions[i].getAttribute("contentType")))) {
+ extList.add(extensions[i]);
+ }
}
if (extList.size() > 0) {
@@ -391,10 +390,19 @@ public final class ParserFactory {
return (extList.size() > 0) ? extList : null;
}
- private boolean match(Extension extension, String id, String type) {
- return ((id.equals(extension.getId())) &&
- (type.equals(extension.getAttribute("contentType")) || extension.getAttribute("contentType").equals("*") ||
- type.equals(DEFAULT_PLUGIN)));
+ private String escapeContentType(String contentType) {
+ // Escapes the '+' and '.' characters in contentType so it can be used as a regex
+ // (and keep backwards compatibility).
+ // This allows a single parser to accept multiple types, e.g. "text/html|application/xhtml+xml".
+ return contentType.replace("+", "\\+").replace(".", "\\.");
+ }
+
+
+ private boolean match(Extension extension, String id, String type) {
+ return (id.equals(extension.getId())) &&
+ (extension.getAttribute("contentType").equals("*") ||
+ type.matches(escapeContentType(extension.getAttribute("contentType"))) ||
+ type.equals(DEFAULT_PLUGIN));
}
/** Get an extension from its id and supported content-type. */
Modified: nutch/branches/nutchgora/src/plugin/parse-html/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/branches/nutchgora/src/plugin/parse-html/plugin.xml?rev=1182504&r1=1182503&r2=1182504&view=diff
==============================================================================
--- nutch/branches/nutchgora/src/plugin/parse-html/plugin.xml (original)
+++ nutch/branches/nutchgora/src/plugin/parse-html/plugin.xml Wed Oct 12 18:18:48 2011
@@ -39,7 +39,7 @@
<implementation id="org.apache.nutch.parse.html.HtmlParser"
class="org.apache.nutch.parse.html.HtmlParser">
- <parameter name="contentType" value="text/html"/>
+ <parameter name="contentType" value="text/html|application/xhtml+xml"/>
<parameter name="pathSuffix" value=""/>
</implementation>