You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/03/07 18:47:50 UTC
[tika] branch master updated: TIKA-2592 -- ignore charsets not
supported by IANA in html meta-headers via Andreas Meier.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 7e2b1e7 TIKA-2592 -- ignore charsets not supported by IANA in html meta-headers via Andreas Meier.
7e2b1e7 is described below
commit 7e2b1e7534268b40c8b4ef3ee20ed708bf2e383c
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Mar 7 13:47:38 2018 -0500
TIKA-2592 -- ignore charsets not supported by IANA in html meta-headers
via Andreas Meier.
---
CHANGES.txt | 4 +
.../tika/parser/html/HtmlEncodingDetector.java | 41 +++++++
.../html/StandardCharsets_unsupported_by_IANA.txt | 125 +++++++++++++++++++++
.../apache/tika/parser/html/HtmlParserTest.java | 10 ++
.../test-documents/testHTML_charset_utf16le.html | Bin 0 -> 380 bytes
.../test-documents/testHTML_charset_utf8.html | 8 ++
6 files changed, 188 insertions(+)
diff --git a/CHANGES.txt b/CHANGES.txt
index 10ede92..3f0f31a 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -5,6 +5,10 @@ Release 2.0.0 - ???
Other changes
+ * Ignore charsets not supported by IANA in HTML meta-headers
+ during charset detection in HtmlEncodingDetector
+ via Andreas Meier (TIKA-2592)
+
* Add detection and parsing of zstd (if user provides
com.github.luben:zstd-jni) via Andreas Meier (TIKA-2576)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
index 559ec4d..e383f80 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
@@ -16,10 +16,17 @@
*/
package org.apache.tika.parser.html;
+import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
+import java.io.InputStreamReader;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -39,6 +46,37 @@ import org.apache.tika.utils.CharsetUtils;
*/
public class HtmlEncodingDetector implements EncodingDetector {
+ /**
+ * HTML can include non-iana supported charsets that Java
+ * recognizes, e.g. "unicode". This can lead to incorrect detection/mojibake.
+ * Ignore charsets in html meta-headers that are not supported by IANA.
+ * See: TIKA-2592
+ */
+ private static Set<String> CHARSETS_UNSUPPORTED_BY_IANA;
+ static {
+ Set<String> unsupported = new HashSet<>();
+ try (BufferedReader reader =
+ new BufferedReader(
+ new InputStreamReader(
+ HtmlEncodingDetector.class
+ .getResourceAsStream("StandardCharsets_unsupported_by_IANA.txt"),
+ StandardCharsets.UTF_8))) {
+ String line = reader.readLine();
+ while (line != null) {
+ if (line.startsWith("#")) {
+ continue;
+ }
+ line = line.trim();
+ if (line.length() > 0) {
+ unsupported.add(line.toLowerCase(Locale.US));
+ }
+ line = reader.readLine();
+ }
+ } catch (IOException e) {
+ throw new IllegalArgumentException("couldn't find StandardCharsets_unsupported_by_IANA.txt on the class path");
+ }
+ CHARSETS_UNSUPPORTED_BY_IANA = Collections.unmodifiableSet(unsupported);
+ }
// TIKA-357 - use bigger buffer for meta tag sniffing (was 4K)
private static final int DEFAULT_MARK_LIMIT = 8192;
@@ -112,6 +150,9 @@ public class HtmlEncodingDetector implements EncodingDetector {
//that is valid
while (charsetMatcher.find()) {
String candCharset = charsetMatcher.group(1);
+ if (CHARSETS_UNSUPPORTED_BY_IANA.contains(candCharset.toLowerCase(Locale.US))) {
+ continue;
+ }
if (CharsetUtils.isSupported(candCharset)) {
try {
return CharsetUtils.forName(candCharset);
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/html/StandardCharsets_unsupported_by_IANA.txt b/tika-parsers/src/main/resources/org/apache/tika/parser/html/StandardCharsets_unsupported_by_IANA.txt
new file mode 100644
index 0000000..05f76ce
--- /dev/null
+++ b/tika-parsers/src/main/resources/org/apache/tika/parser/html/StandardCharsets_unsupported_by_IANA.txt
@@ -0,0 +1,125 @@
+646
+737
+775
+813
+819
+858
+874
+8859_1
+8859_13
+8859_15
+8859_2
+8859_4
+8859_5
+8859_7
+8859_9
+912
+914
+915
+920
+923
+ansi-1251
+ascii
+ascii7
+cesu8
+cp1250
+cp1251
+cp1252
+cp1253
+cp1254
+cp1257
+cp5346
+cp5347
+cp5348
+cp5349
+cp5350
+cp5353
+cp737
+cp813
+cp858
+cp874
+cp912
+cp914
+cp915
+cp920
+cp923
+csibm862
+csisolatin0
+csisolatin9
+cspcp855
+default
+ibm-437
+ibm-737
+ibm-775
+ibm-813
+ibm-819
+ibm-850
+ibm-852
+ibm-855
+ibm-857
+ibm-862
+ibm-866
+ibm-874
+ibm-912
+ibm-914
+ibm-915
+ibm-920
+ibm-923
+ibm737
+ibm813
+ibm874
+ibm912
+ibm914
+ibm915
+ibm920
+ibm923
+iso8859-1
+iso8859-13
+iso8859-15
+iso8859-2
+iso8859-4
+iso8859-5
+iso8859-7
+iso8859-9
+iso8859_1
+iso8859_13
+iso8859_15
+iso8859_15_fdis
+iso8859_2
+iso8859_4
+iso8859_5
+iso8859_7
+iso8859_9
+iso_8859-13
+iso_8859_1
+koi8
+koi8_r
+koi8_u
+l9
+latin0
+latin9
+sun_eu_greek
+unicode
+unicode-1-1-utf-8
+unicodebig
+unicodebigunmarked
+unicodelittle
+unicodelittleunmarked
+utf-32be-bom
+utf-32le-bom
+utf16
+utf32
+utf8
+utf_16
+utf_16be
+utf_16le
+utf_32
+utf_32be
+utf_32be_bom
+utf_32le
+utf_32le_bom
+windows-437
+x-utf-16be
+x-utf-16le
+x-utf-32be
+x-utf-32le
\ No newline at end of file
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index aece610..532abed 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -1385,4 +1385,14 @@ public class HtmlParserTest extends TikaTest {
}
}
}
+
+ @Test
+ public void testCharsetsNotSupportedByIANA() throws Exception {
+ assertContains("This is a sample text",
+ getXML("testHTML_charset_utf8.html").xml);
+
+ assertContains("This is a sample text",
+ getXML("testHTML_charset_utf16le.html").xml);
+
+ }
}
diff --git a/tika-parsers/src/test/resources/test-documents/testHTML_charset_utf16le.html b/tika-parsers/src/test/resources/test-documents/testHTML_charset_utf16le.html
new file mode 100644
index 0000000..26cb535
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testHTML_charset_utf16le.html differ
diff --git a/tika-parsers/src/test/resources/test-documents/testHTML_charset_utf8.html b/tika-parsers/src/test/resources/test-documents/testHTML_charset_utf8.html
new file mode 100644
index 0000000..1f61f02
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testHTML_charset_utf8.html
@@ -0,0 +1,8 @@
+<html>
+ <head>
+ <title>Title</title>
+ <meta http-equiv="Content-Type" content="text/html; charset=unicode">
+ <style></style>
+ </head>
+ <body>This is a sample text</body>
+</html>
\ No newline at end of file
--
To stop receiving notification emails like this one, please contact
tallison@apache.org.