You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/12/16 01:59:10 UTC

svn commit: r891091 - /lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java

Author: jukka
Date: Wed Dec 16 00:59:09 2009
New Revision: 891091

URL: http://svn.apache.org/viewvc?rev=891091&view=rev
Log:
TIKA-352: Use MediaType.parse when extracting charset from content-type metadata in parsers

Even though MediaType.parse() can now handle a null argument, it is better style not to rely on such an undocumented feature.

Modified:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=891091&r1=891090&r2=891091&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Wed Dec 16 00:59:09 2009
@@ -88,9 +88,10 @@
         // hint, or the passed content-type hint.
         CharsetDetector detector = new CharsetDetector();
         String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
-        if (incomingCharset == null) {
+        String incomingType = metadata.get(Metadata.CONTENT_TYPE);
+        if (incomingCharset == null && incomingType != null) {
             // TIKA-341: Use charset in content-type
-            MediaType mt = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
+            MediaType mt = MediaType.parse(incomingType);
             if (mt != null) {
                 String charset = mt.getParameters().get("charset");
                 if ((charset != null) && Charset.isSupported(charset)) {