You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/12/13 01:13:49 UTC

svn commit: r890009 - in /lucene/tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/html/HtmlParser.java test/java/org/apache/tika/parser/html/HtmlParserTest.java

Author: jukka
Date: Sun Dec 13 00:13:49 2009
New Revision: 890009

URL: http://svn.apache.org/viewvc?rev=890009&view=rev
Log:
TIKA-332: Use http-equiv meta tag charset info when processing HTML documents

Patches by Ken Krugler.

Modified:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
    lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=890009&r1=890008&r2=890009&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Sun Dec 13 00:13:49 2009
@@ -16,9 +16,13 @@
  */
 package org.apache.tika.parser.html;
 
+import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.nio.charset.Charset;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.CloseShieldInputStream;
@@ -40,12 +44,37 @@
 
     // Use the widest, most common charset as our default.
     private static final String DEFAULT_CHARSET = "windows-1252";
+    private static final int META_TAG_BUFFER_SIZE = 4096; // mark limit and size of the prefix searched for a meta tag
+    private static final Pattern HTTP_EQUIV_CHARSET_PATTERN = Pattern.compile( // group 1 captures the charset name
+            "(?is)<meta\\s+http-equiv\\s*=\\s*['\"]\\s*Content-Type['\"]\\s+"
+            + "content\\s*=\\s*['\"][^;]+;\\s*charset\\s*=\\s*([^'\"]+)['\"]"); // closing quote may be ' or ", like the opening one
 
     // TODO: Move this into core, along with CharsetDetector
     private String getEncoding(InputStream stream, Metadata metadata) throws IOException {
-        // TODO: Check for <meta tag in stream. If that exists and is supported, then
-        // set that in metadata and return.
+        // TIKA-332: Check for meta http-equiv tag with charset info in HTML content
+        if (!stream.markSupported()) {
+            stream = new BufferedInputStream(stream);
+        }
+
+        stream.mark(META_TAG_BUFFER_SIZE);
+        char[] buffer = new char[META_TAG_BUFFER_SIZE];
+        InputStreamReader isr = new InputStreamReader(stream, "us-ascii");
+        int bufferSize = isr.read(buffer);
+        stream.reset();
+
+        if (bufferSize != -1) {
+            String metaString = new String(buffer, 0, bufferSize);
+            Matcher m = HTTP_EQUIV_CHARSET_PATTERN.matcher(metaString);
+            if (m.find()) {
+                String charset = m.group(1);
+                if (Charset.isSupported(charset)) {
+                    metadata.set(Metadata.CONTENT_ENCODING, charset);
+                    return charset;
+                }
+            }
+        }
 
+        // No charset in a meta http-equiv tag, so detect from actual content bytes.
         CharsetDetector detector = new CharsetDetector();
         String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
         if (incomingCharset == null) {

Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=890009&r1=890008&r2=890009&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Sun Dec 13 00:13:49 2009
@@ -220,11 +220,29 @@
     }
 
     /**
+     * Test case for TIKA-332: parses HTML whose http-equiv meta tag declares ISO-8859-1 and checks the reported encoding.
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-332">TIKA-332</a>
+     */
+    public void testHttpEquivCharset() throws Exception {
+        String test =
+            "<html><head><meta http-equiv=\"content-type\""
+            + " content=\"text/html; charset=ISO-8859-1\" />" // the charset declaration under test
+            + "<title>the name is \u00e1ndre</title>" // title contains one non-ASCII character
+            + "</head><body></body></html>";
+        Metadata metadata = new Metadata();
+        new HtmlParser().parse (
+                new ByteArrayInputStream(test.getBytes("UTF-8")), // NOTE(review): bytes are UTF-8 although the tag says ISO-8859-1; presumably deliberate, confirm
+                new BodyContentHandler(),  metadata, new ParseContext());
+        assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); // the declared charset must end up in the metadata
+    }
+
+    /**
      * Test case for TIKA-334
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-334">TIKA-334</a>
      */
     public void testDetectOfCharset() throws Exception {
-        String test = "<html><title>\u017d</title><body></body></html>";
+        String test =
+            "<html><head><title>\u017d</title></head><body></body></html>";
         Metadata metadata = new Metadata();
         new HtmlParser().parse (
                 new ByteArrayInputStream(test.getBytes("UTF-8")),