You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/12/16 01:21:27 UTC

svn commit: r891082 - in /lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser: html/HtmlParser.java txt/TXTParser.java

Author: jukka
Date: Wed Dec 16 00:21:27 2009
New Revision: 891082

URL: http://svn.apache.org/viewvc?rev=891082&view=rev
Log:
TIKA-352: Use MediaType.parse when extracting charset from content-type metadata in parsers

Patch by Ken Krugler, plus a minor fix by me (handle the case where the content-type is not set).

Modified:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=891082&r1=891081&r2=891082&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Wed Dec 16 00:21:27 2009
@@ -27,6 +27,7 @@
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.CloseShieldInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.txt.CharsetDetector;
@@ -50,10 +51,6 @@
                     "Content-Type['\\\"]\\s+content\\s*=\\s*['\\\"]" +
                     "([^'\\\"]+)['\\\"]\\s*/>");
     
-    // TIKA-350: handle charset as first element in content-type
-    private static final Pattern CONTENT_TYPE_PATTERN = Pattern.compile(
-                    "(?i)(?:;|)\\s*charset\\s*=\\s*([^\r;\\s]*)");
-
     /**
      * TIKA-332: Check for meta http-equiv tag with charset info in
      * HTML content.
@@ -93,14 +90,11 @@
         String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
         if (incomingCharset == null) {
             // TIKA-341: Use charset in content-type
-            String contentType = metadata.get(Metadata.CONTENT_TYPE);
-            if (contentType != null) {
-                Matcher m = CONTENT_TYPE_PATTERN.matcher(contentType);
-                if (m.find()) {
-                    String charset = m.group(1).trim();
-                    if (Charset.isSupported(charset)) {
-                        incomingCharset = charset;
-                    }
+            MediaType mt = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
+            if (mt != null) {
+                String charset = mt.getParameters().get("charset");
+                if ((charset != null) && Charset.isSupported(charset)) {
+                    incomingCharset = charset;
                 }
             }
         }

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=891082&r1=891081&r2=891082&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java Wed Dec 16 00:21:27 2009
@@ -24,13 +24,12 @@
 import java.io.Reader;
 import java.io.UnsupportedEncodingException;
 import java.nio.charset.Charset;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.DublinCore;
 import org.apache.tika.metadata.HttpHeaders;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.XHTMLContentHandler;
@@ -62,8 +61,6 @@
  */
 public class TXTParser implements Parser {
 
-    private static final Pattern CONTENT_TYPE_PATTERN = Pattern.compile("(?i);\\s*charset\\s*=\\s*(.*)");
-
     public void parse(
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
@@ -77,14 +74,12 @@
         // Detect the content encoding (the stream is reset to the beginning)
         CharsetDetector detector = new CharsetDetector();
         String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
-        if (incomingCharset == null) {
+        String incomingType = metadata.get(Metadata.CONTENT_TYPE);
+        if (incomingCharset == null && incomingType != null) {
             // TIKA-341: Use charset in content-type
-            String contentType = metadata.get(Metadata.CONTENT_TYPE);
-            if (contentType != null) {
-                Matcher m = CONTENT_TYPE_PATTERN.matcher(contentType);
-                if (m.find()) {
-                    incomingCharset = m.group(1).trim();
-                }
+            MediaType mt = MediaType.parse(incomingType);
+            if (mt != null) {
+                incomingCharset = mt.getParameters().get("charset");
             }
         }