You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/09/22 19:57:15 UTC

svn commit: r1174283 - /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java

Author: jukka
Date: Thu Sep 22 17:57:14 2011
New Revision: 1174283

URL: http://svn.apache.org/viewvc?rev=1174283&view=rev
Log:
TIKA-508: HtmlParser link processing should skip usemap and codebase attributes

Drop codebase, data and classid from the URI_ATTRIBUTES set as their resolution
rules are more complex than what Tika currently supports. Better to leave the
attributes as-is than to resolve them incorrectly.

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=1174283&r1=1174282&r2=1174283&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java Thu Sep 22 17:57:14 2011
@@ -18,6 +18,7 @@ package org.apache.tika.parser.html;
 
 import java.net.MalformedURLException;
 import java.net.URL;
+import java.util.Arrays;
 import java.util.HashSet;
 import java.util.Locale;
 import java.util.Set;
@@ -33,19 +34,11 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
-@SuppressWarnings("serial")
 class HtmlHandler extends TextContentHandler {
 
     // List of attributes that need to be resolved.
-    private static final Set<String> URI_ATTRIBUTES = new HashSet<String>() {{
-        add("src");
-        add("href");
-        add("longdesc");
-        add("data");
-        add("cite");
-        add("codebase");
-        add("classid");
-    }};
+    private static final Set<String> URI_ATTRIBUTES =
+        new HashSet<String>(Arrays.asList("src", "href", "longdesc", "cite"));
 
     private final HtmlMapper mapper;