You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/12/16 01:21:27 UTC
svn commit: r891082 - in /lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser: html/HtmlParser.java txt/TXTParser.java
Author: jukka
Date: Wed Dec 16 00:21:27 2009
New Revision: 891082
URL: http://svn.apache.org/viewvc?rev=891082&view=rev
Log:
TIKA-352: Use MediaType.parse when extracting charset from content-type metadata in parsers
Patch by Ken Krugler, plus a minor fix (handle the case when content-type is not set) by me.
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=891082&r1=891081&r2=891082&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Wed Dec 16 00:21:27 2009
@@ -27,6 +27,7 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.txt.CharsetDetector;
@@ -50,10 +51,6 @@
"Content-Type['\\\"]\\s+content\\s*=\\s*['\\\"]" +
"([^'\\\"]+)['\\\"]\\s*/>");
- // TIKA-350: handle charset as first element in content-type
- private static final Pattern CONTENT_TYPE_PATTERN = Pattern.compile(
- "(?i)(?:;|)\\s*charset\\s*=\\s*([^\r;\\s]*)");
-
/**
* TIKA-332: Check for meta http-equiv tag with charset info in
* HTML content.
@@ -93,14 +90,11 @@
String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
if (incomingCharset == null) {
// TIKA-341: Use charset in content-type
- String contentType = metadata.get(Metadata.CONTENT_TYPE);
- if (contentType != null) {
- Matcher m = CONTENT_TYPE_PATTERN.matcher(contentType);
- if (m.find()) {
- String charset = m.group(1).trim();
- if (Charset.isSupported(charset)) {
- incomingCharset = charset;
- }
+ MediaType mt = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
+ if (mt != null) {
+ String charset = mt.getParameters().get("charset");
+ if ((charset != null) && Charset.isSupported(charset)) {
+ incomingCharset = charset;
}
}
}
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=891082&r1=891081&r2=891082&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java Wed Dec 16 00:21:27 2009
@@ -24,13 +24,12 @@
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -62,8 +61,6 @@
*/
public class TXTParser implements Parser {
- private static final Pattern CONTENT_TYPE_PATTERN = Pattern.compile("(?i);\\s*charset\\s*=\\s*(.*)");
-
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
@@ -77,14 +74,12 @@
// Detect the content encoding (the stream is reset to the beginning)
CharsetDetector detector = new CharsetDetector();
String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
- if (incomingCharset == null) {
+ String incomingType = metadata.get(Metadata.CONTENT_TYPE);
+ if (incomingCharset == null && incomingType != null) {
// TIKA-341: Use charset in content-type
- String contentType = metadata.get(Metadata.CONTENT_TYPE);
- if (contentType != null) {
- Matcher m = CONTENT_TYPE_PATTERN.matcher(contentType);
- if (m.find()) {
- incomingCharset = m.group(1).trim();
- }
+ MediaType mt = MediaType.parse(incomingType);
+ if (mt != null) {
+ incomingCharset = mt.getParameters().get("charset");
}
}