You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/12/13 01:13:49 UTC
svn commit: r890009 - in /lucene/tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/html/HtmlParser.java
test/java/org/apache/tika/parser/html/HtmlParserTest.java
Author: jukka
Date: Sun Dec 13 00:13:49 2009
New Revision: 890009
URL: http://svn.apache.org/viewvc?rev=890009&view=rev
Log:
TIKA-332: Use http-equiv meta tag charset info when processing HTML documents
Patches by Ken Krugler.
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=890009&r1=890008&r2=890009&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Sun Dec 13 00:13:49 2009
@@ -16,9 +16,13 @@
*/
package org.apache.tika.parser.html;
+import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.io.InputStreamReader;
import java.nio.charset.Charset;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.CloseShieldInputStream;
@@ -40,12 +44,37 @@
// Use the widest, most common charset as our default.
private static final String DEFAULT_CHARSET = "windows-1252";
+ private static final int META_TAG_BUFFER_SIZE = 4096;
+ private static final Pattern HTTP_EQUIV_CHARSET_PATTERN = Pattern.compile(
+ "(?is)<meta\\s+http-equiv\\s*=\\s*['\"]\\s*Content-Type['\"]\\s+"
+ + "content\\s*=\\s*['\"][^;]+;\\s*charset\\s*=\\s*([^'\"]+)\"");
// TODO: Move this into core, along with CharsetDetector
private String getEncoding(InputStream stream, Metadata metadata) throws IOException {
- // TODO: Check for <meta tag in stream. If that exists and is supported, then
- // set that in metadata and return.
+ // TIKA-332: Check for meta http-equiv tag with charset info in HTML content
+ if (!stream.markSupported()) {
+ stream = new BufferedInputStream(stream);
+ }
+
+ stream.mark(META_TAG_BUFFER_SIZE);
+ char[] buffer = new char[META_TAG_BUFFER_SIZE];
+ InputStreamReader isr = new InputStreamReader(stream, "us-ascii");
+ int bufferSize = isr.read(buffer);
+ stream.reset();
+
+ if (bufferSize != -1) {
+ String metaString = new String(buffer, 0, bufferSize);
+ Matcher m = HTTP_EQUIV_CHARSET_PATTERN.matcher(metaString);
+ if (m.find()) {
+ String charset = m.group(1);
+ if (Charset.isSupported(charset)) {
+ metadata.set(Metadata.CONTENT_ENCODING, charset);
+ return charset;
+ }
+ }
+ }
+ // No charset in a meta http-equiv tag, so detect from actual content bytes.
CharsetDetector detector = new CharsetDetector();
String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
if (incomingCharset == null) {
Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=890009&r1=890008&r2=890009&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Sun Dec 13 00:13:49 2009
@@ -220,11 +220,29 @@
}
/**
+ * Test case for TIKA-332
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-332">TIKA-332</a>
+ */
+ public void testHttpEquivCharset() throws Exception {
+ String test =
+ "<html><head><meta http-equiv=\"content-type\""
+ + " content=\"text/html; charset=ISO-8859-1\" />"
+ + "<title>the name is \u00e1ndre</title>"
+ + "</head><body></body></html>";
+ Metadata metadata = new Metadata();
+ new HtmlParser().parse (
+ new ByteArrayInputStream(test.getBytes("UTF-8")),
+ new BodyContentHandler(), metadata, new ParseContext());
+ assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+ }
+
+ /**
* Test case for TIKA-334
* @see <a href="https://issues.apache.org/jira/browse/TIKA-334">TIKA-334</a>
*/
public void testDetectOfCharset() throws Exception {
- String test = "<html><title>\u017d</title><body></body></html>";
+ String test =
+ "<html><head><title>\u017d</title></head><body></body></html>";
Metadata metadata = new Metadata();
new HtmlParser().parse (
new ByteArrayInputStream(test.getBytes("UTF-8")),