You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/12/16 01:02:29 UTC

svn commit: r891075 - in /lucene/tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/html/HtmlParser.java test/java/org/apache/tika/parser/html/HtmlParserTest.java

Author: jukka
Date: Wed Dec 16 00:02:28 2009
New Revision: 891075

URL: http://svn.apache.org/viewvc?rev=891075&view=rev
Log:
TIKA-350: HtmlParser's content-type handling code needs to be more flexible

Patch by Ken Krugler

Modified:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
    lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=891075&r1=891074&r2=891075&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Wed Dec 16 00:02:28 2009
@@ -50,8 +50,9 @@
                     "Content-Type['\\\"]\\s+content\\s*=\\s*['\\\"]" +
                     "([^'\\\"]+)['\\\"]\\s*/>");
     
-    private static final Pattern CONTENT_TYPE_PATTERN =
-        Pattern.compile("(?i);\\s*charset\\s*=\\s*(.*)");
+    // TIKA-350: handle charset as first element in content-type
+    private static final Pattern CONTENT_TYPE_PATTERN = Pattern.compile(
+                    "(?i)(?:;|)\\s*charset\\s*=\\s*([^\r;\\s]*)");
 
     /**
      * TIKA-332: Check for meta http-equiv tag with charset info in
@@ -86,7 +87,8 @@
             }
         }
 
-        // No charset in a meta http-equiv tag, so detect from actual content bytes.
+        // No charset in a meta http-equiv tag, so see if it's in the passed
+        // content-encoding hint or the passed content-type hint.
         CharsetDetector detector = new CharsetDetector();
         String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
         if (incomingCharset == null) {

Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=891075&r1=891074&r2=891075&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Wed Dec 16 00:02:28 2009
@@ -336,4 +336,27 @@
         assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
     }
 
+    /**
+     * Test case for TIKA-350
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-350">TIKA-350</a>
+     */
+    public void testUsingFunkyCharsetInContentTypeHeader() throws Exception {
+        final String test =
+            "<html><head><title>the name is \u00e1ndre</title></head>"
+            + "<body></body></html>";
+
+        Metadata metadata = new Metadata();
+        new HtmlParser().parse (
+                new ByteArrayInputStream(test.getBytes("UTF-8")),
+                new BodyContentHandler(),  metadata, new ParseContext());
+        assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
+
+        metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "charset=ISO-8859-1;text/html");
+        new HtmlParser().parse (
+                new ByteArrayInputStream(test.getBytes("UTF-8")),
+                new BodyContentHandler(),  metadata, new ParseContext());
+        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+    }
+
 }