You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/12/16 00:56:04 UTC

svn commit: r891074 - in /lucene/tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/html/HtmlParser.java test/java/org/apache/tika/parser/html/HtmlParserTest.java

Author: jukka
Date: Tue Dec 15 23:56:03 2009
New Revision: 891074

URL: http://svn.apache.org/viewvc?rev=891074&view=rev
Log:
TIKA-349: HtmlParser's http-equiv code needs to be more flexible

Patch by Ken Krugler

Modified:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
    lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=891074&r1=891073&r2=891074&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Tue Dec 15 23:56:03 2009
@@ -45,10 +45,11 @@
     // Use the widest, most common charset as our default.
     private static final String DEFAULT_CHARSET = "windows-1252";
     private static final int META_TAG_BUFFER_SIZE = 4096;
-    private static final Pattern HTTP_EQUIV_CHARSET_PATTERN = Pattern.compile(
-            "(?is)<meta\\s+http-equiv\\s*=\\s*['\"]\\s*Content-Type['\"]\\s+"
-            + "content\\s*=\\s*['\"][^;]+;\\s*charset\\s*=\\s*([^'\"]+)\"");
-
+    private static final Pattern HTTP_EQUIV_PATTERN = Pattern.compile(
+                    "(?is)<meta\\s+http-equiv\\s*=\\s*['\\\"]\\s*" +
+                    "Content-Type['\\\"]\\s+content\\s*=\\s*['\\\"]" +
+                    "([^'\\\"]+)['\\\"]\\s*/>");
+    
     private static final Pattern CONTENT_TYPE_PATTERN =
         Pattern.compile("(?i);\\s*charset\\s*=\\s*(.*)");
 
@@ -67,12 +68,20 @@
 
         if (bufferSize != -1) {
             String metaString = new String(buffer, 0, bufferSize);
-            Matcher m = HTTP_EQUIV_CHARSET_PATTERN.matcher(metaString);
+            Matcher m = HTTP_EQUIV_PATTERN.matcher(metaString);
             if (m.find()) {
-                String charset = m.group(1);
-                if (Charset.isSupported(charset)) {
-                    metadata.set(Metadata.CONTENT_ENCODING, charset);
-                    return charset;
+                // TIKA-349: flexible handling of attributes
+                // We have one or more x or x=y attributes, separated by ';'
+                String[] attrs = m.group(1).split(";");
+                for (String attr : attrs) {
+                    String[] keyValue = attr.trim().split("=");
+                    if ((keyValue.length == 2) && keyValue[0].equalsIgnoreCase("charset")) {
+                        String charset = keyValue[1];
+                        if (Charset.isSupported(charset)) {
+                            metadata.set(Metadata.CONTENT_ENCODING, charset);
+                            return charset;
+                        }
+                    }
                 }
             }
         }

Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=891074&r1=891073&r2=891074&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Tue Dec 15 23:56:03 2009
@@ -252,7 +252,7 @@
 
     /**
      * Test case for TIKA-341
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-XXX">TIKA-XXX</a>
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
      */
     public void testUsingCharsetInContentTypeHeader() throws Exception {
         final String test =
@@ -307,4 +307,33 @@
         assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
     }
 
+    /**
+     * Test case for TIKA-349
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-349">TIKA-349</a>
+     */
+    public void testHttpEquivCharsetFunkyAttributes() throws Exception {
+        String test1 =
+            "<html><head><meta http-equiv=\"content-type\""
+            + " content=\"text/html; charset=ISO-8859-1; charset=iso-8859-1\" />"
+            + "<title>the name is \u00e1ndre</title>"
+            + "</head><body></body></html>";
+        Metadata metadata = new Metadata();
+        new HtmlParser().parse (
+                new ByteArrayInputStream(test1.getBytes("UTF-8")),
+                new BodyContentHandler(),  metadata, new ParseContext());
+        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+
+        // Some HTML pages have errors like ';;' versus '; ' as separator
+        String test2 =
+            "<html><head><meta http-equiv=\"content-type\""
+            + " content=\"text/html;;charset=ISO-8859-1\" />"
+            + "<title>the name is \u00e1ndre</title>"
+            + "</head><body></body></html>";
+        metadata = new Metadata();
+        new HtmlParser().parse (
+                new ByteArrayInputStream(test2.getBytes("UTF-8")),
+                new BodyContentHandler(),  metadata, new ParseContext());
+        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+    }
+
 }