You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/09/22 19:57:15 UTC
svn commit: r1174283 -
/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
Author: jukka
Date: Thu Sep 22 17:57:14 2011
New Revision: 1174283
URL: http://svn.apache.org/viewvc?rev=1174283&view=rev
Log:
TIKA-508: HtmlParser link processing should skip usemap and codebase attributes
Drop codebase, data and classid from the URI_ATTRIBUTES set as their resolution
rules are more complex than what Tika currently supports. It is better to leave the
attributes as-is than to resolve them incorrectly.
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=1174283&r1=1174282&r2=1174283&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java Thu Sep 22 17:57:14 2011
@@ -18,6 +18,7 @@ package org.apache.tika.parser.html;
import java.net.MalformedURLException;
import java.net.URL;
+import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
@@ -33,19 +34,11 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
-@SuppressWarnings("serial")
class HtmlHandler extends TextContentHandler {
// List of attributes that need to be resolved.
- private static final Set<String> URI_ATTRIBUTES = new HashSet<String>() {{
- add("src");
- add("href");
- add("longdesc");
- add("data");
- add("cite");
- add("codebase");
- add("classid");
- }};
+ private static final Set<String> URI_ATTRIBUTES =
+ new HashSet<String>(Arrays.asList("src", "href", "longdesc", "cite"));
private final HtmlMapper mapper;