You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/12/16 01:59:10 UTC
svn commit: r891091 - /lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Author: jukka
Date: Wed Dec 16 00:59:09 2009
New Revision: 891091
URL: http://svn.apache.org/viewvc?rev=891091&view=rev
Log:
TIKA-352: Use MediaType.parse() when extracting charset from content-type metadata in parsers.
Even if MediaType.parse() can now handle a null argument, it's better style to avoid relying on such an undocumented feature.
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=891091&r1=891090&r2=891091&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Wed Dec 16 00:59:09 2009
@@ -88,9 +88,10 @@
// hint, or the passed content-type hint.
CharsetDetector detector = new CharsetDetector();
String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
- if (incomingCharset == null) {
+ String incomingType = metadata.get(Metadata.CONTENT_TYPE);
+ if (incomingCharset == null && incomingType != null) {
// TIKA-341: Use charset in content-type
- MediaType mt = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
+ MediaType mt = MediaType.parse(incomingType);
if (mt != null) {
String charset = mt.getParameters().get("charset");
if ((charset != null) && Charset.isSupported(charset)) {