You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/08/03 15:32:59 UTC
[tika] branch branch_1x updated: TIKA-2673 -- add
StandardHtmlEncodingDetector via Gerard Bouchar
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new 6badaea TIKA-2673 -- add StandardHtmlEncodingDetector via Gerard Bouchar
6badaea is described below
commit 6badaead79e3350414536a5e4972871f66e97e90
Author: TALLISON <ta...@apache.org>
AuthorDate: Fri Aug 3 11:32:49 2018 -0400
TIKA-2673 -- add StandardHtmlEncodingDetector via Gerard Bouchar
---
.../parser/html/StrictHtmlEncodingDetector.java | 491 ---------------------
.../html/charsetdetector/CharsetAliases.java | 145 ++++++
.../charsetdetector/CharsetDetectionResult.java | 62 +++
.../parser/html/charsetdetector/MetaProcessor.java | 74 ++++
.../parser/html/charsetdetector/PreScanner.java | 270 +++++++++++
.../StandardHtmlEncodingDetector.java | 104 +++++
.../charsets/ReplacementCharset.java | 65 +++
.../charsets/XUserDefinedCharset.java | 57 +++
....java => StandardHtmlEncodingDetectorTest.java} | 100 ++++-
9 files changed, 868 insertions(+), 500 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/StrictHtmlEncodingDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/StrictHtmlEncodingDetector.java
deleted file mode 100644
index 487f747..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/html/StrictHtmlEncodingDetector.java
+++ /dev/null
@@ -1,491 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import org.apache.tika.detect.EncodingDetector;
-import org.apache.tika.metadata.Metadata;
-
-import java.io.*;
-import java.nio.charset.Charset;
-import java.nio.charset.IllegalCharsetNameException;
-import java.nio.charset.StandardCharsets;
-import java.nio.charset.UnsupportedCharsetException;
-import java.util.*;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import java.util.stream.Collectors;
-import java.util.stream.IntStream;
-
-import static java.nio.charset.StandardCharsets.*;
-import static org.apache.tika.parser.html.StrictHtmlEncodingDetector.SequenceMatcher.caseInsensitive;
-import static org.apache.tika.parser.html.StrictHtmlEncodingDetector.SingleByteMatcher.matchers;
-
-/**
- * This is a strict html encoding detector that enforces the standard
- * far more strictly than the HtmlEncodingDetector.
- */
-public class StrictHtmlEncodingDetector implements EncodingDetector {
- private static final String CHARSET_LABEL_FILE = "whatwg-encoding-labels.tsv";
- private static Map<String, Charset> CHARSET_LABELS = getCharsetLabels();
-
- private static Map<String, Charset> getCharsetLabels() { // Builds the label -> Charset table from the TSV resource stored next to this class
- String path = StrictHtmlEncodingDetector.class.getPackage().getName().replace('.', '/');
- String filename = '/' + path + '/' + CHARSET_LABEL_FILE; // absolute classpath location derived from this class's package
- InputStream inputStream = StrictHtmlEncodingDetector.class.getResourceAsStream(filename);
- Objects.requireNonNull(inputStream, "Missing charset label mapping file : " + filename); // fail fast when the resource is absent
- try (BufferedReader buffer = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.US_ASCII))) {
- return buffer.lines()
- .filter(s -> !s.startsWith("#")) // drop comment lines
- .map(s -> s.split("\t"))
- .filter(parts -> parts.length >= 2) // keep rows that have a label plus at least one charset name
- .collect(Collectors.toMap(
- parts -> parts[0], // key: the label in the first column
- StrictHtmlEncodingDetector::charsetFromStandard // value: charset resolved from the same row
- ));
- } catch (IOException e) {
- throw new UncheckedIOException("Unable to read the charset label mapping", e);
- }
- }
-
- private static Charset charsetFromStandard(String[] names) { // names[0] is the label; the following entries are candidate JVM charset names
- for (int i = 1; i < names.length; i++) {
- try {
- return Charset.forName(names[1]); // NOTE(review): always looks up names[1], never names[i], so later candidates are not tried -- likely a bug
- } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {/* pass */}
- }
- // Fallback: ISO-8859-1 is the only extended single-byte charset that must be present on every Java platform
- return StandardCharsets.ISO_8859_1;
- }
-
- private static Charset getCharsetByLabel(String label) { // Looks up a charset by label; returns null for a null or unknown label
- if (label == null) return null;
- label = label.trim().toLowerCase(Locale.US); // normalise before lookup: surrounding whitespace removed, lower-cased with a fixed locale
- return CHARSET_LABELS.get(label);
- }
-
- @Override
- public Charset detect(InputStream input, Metadata metadata) throws IOException { // Precedence: BOM, then charset from metadata, then <meta> prescan
- PreScanner preScanner = new PreScanner(input);
-
- // If there is a BOM at the beginning, the detection does not go further
- Charset bomCharset = preScanner.detectBOM();
- if (bomCharset != null) return bomCharset;
-
- // Assume that if there was a charset specified either by the end user or the transport level,
- // it was stored in the metadata
- String incomingCharsetName = metadata.get(Metadata.CONTENT_ENCODING);
- if (incomingCharsetName != null) {
- Charset incomingCharset = getCharsetByLabel(incomingCharsetName);
- if (incomingCharset != null) return incomingCharset; // an unknown label is ignored and the prescan runs instead
- }
-
- return preScanner.scan(); // null when the prescan finds no usable declaration
- }
-
- static class PreScanner { // Byte-level scanner that looks for a BOM or for a charset declared in a <meta> tag
-
- private static final Pattern META_CHARSET_PATTERN = Pattern.compile("charset\\s*=\\s*([\"']?)([^\"'\\s;]+)\\1"); // group 2 is the charset value; group 1 is the optional quote that must also close the value
- private static ByteMatcher COMMENT_START = new SequenceMatcher("<!--");
- private static ByteMatcher COMMENT_END = new SequenceMatcher("-->");
- private static ByteMatcher LETTER = new OrMatcher(
- new RangeMatcher((byte) 'a', (byte) 'z'),
- new RangeMatcher((byte) 'A', (byte) 'Z')
- );
- private static ByteMatcher SPACE = new OrMatcher(matchers(0x09, 0x0A, 0x0C, 0x0D, 0x20)); // tab, LF, FF, CR, space
- private static ByteMatcher SLASH = new SingleByteMatcher((byte) '/');
- private static ByteMatcher EQUAL = new SingleByteMatcher((byte) '=');
- private static ByteMatcher TAG_END = new SingleByteMatcher((byte) '>');
- private static ByteMatcher SINGLE_QUOTE = new SingleByteMatcher((byte) '\'');
- private static ByteMatcher DOUBLE_QUOTE = new SingleByteMatcher((byte) '"');
- private static ByteMatcher QUOTE = new OrMatcher(SINGLE_QUOTE, DOUBLE_QUOTE);
- private static ByteMatcher TAG_END_OR_SLASH = new OrMatcher(SLASH, TAG_END);
- private static ByteMatcher SPACE_OR_SLASH = new OrMatcher(SPACE, SLASH);
- private static ByteMatcher SPACE_OR_TAG_END = new OrMatcher(SPACE, TAG_END);
- private static ByteMatcher META_START = new SequenceMatcher(caseInsensitive("<meta"), SPACE_OR_SLASH); // "<meta" in any letter case followed by whitespace or '/'
- private static ByteMatcher TAG_START = new SequenceMatcher(
- new SingleByteMatcher((byte) '<'),
- new OrMatcher(SLASH, LETTER)
- );
- private static ByteMatcher TAG_BODY = new NegativeMatcher(new OrMatcher(SPACE, TAG_END)); // any byte that is neither whitespace nor '>'
- private static ByteMatcher SPECIAL_TAG_START = new SequenceMatcher(
- new SingleByteMatcher((byte) '<'),
- new OrMatcher(matchers("!/?")) // '<' followed by '!', '/' or '?'
- );
- private static ByteMatcher UTF8_BOM = new SequenceMatcher(matchers(0xEF, 0xBB, 0xBF));
- private static ByteMatcher UTF16_BE_BOM = new SequenceMatcher(matchers(0xFE, 0xFF));
- private static ByteMatcher UTF16_LE_BOM = new SequenceMatcher(matchers(0xFF, 0xFE));
-
-
- PushbackInputStream stream;
- private CharsetDetectionResult detectedCharset = new CharsetDetectionResult();
-
- public PreScanner(InputStream inputStream) {
- this.stream = new PushbackInputStream(inputStream, 32); // 32-byte pushback buffer used by the matchers to un-read bytes
- }
-
- public Charset scan() { // Consumes the stream until a <meta> declaration is accepted or no more bytes can be processed
- while (processAtLeastOneByte()) {
- if (detectedCharset.isFound()) {
- return detectedCharset.getCharset(); // can be null when the declared label is unknown
- }
- }
- return null;
- }
-
- private Charset detectBOM() { // Returns the charset indicated by a leading BOM (which is consumed), or null
- try {
- if (UTF8_BOM.matches(stream)) return StandardCharsets.UTF_8;
- else if (UTF16_BE_BOM.matches(stream)) return StandardCharsets.UTF_16BE;
- else if (UTF16_LE_BOM.matches(stream)) return StandardCharsets.UTF_16LE;
- } catch (IOException e) { /* stream could not be read, also return null */ }
- return null;
- }
-
- private boolean processAtLeastOneByte() { // Tries each handler in order; || short-circuits on the first one that succeeds
- try {
- return processComment() ||
- processMeta() ||
- processTag() ||
- processSpecialTag() ||
- processAny();
- } catch (IOException e) {
- return false; // read failures (including the IOException thrown by getLowerCaseChar at end of stream) stop the scan
- }
- }
-
- private boolean processAny() throws IOException { // Consumes a single byte; false at end of stream
- int read = stream.read();
- return read != -1;
- }
-
- private boolean hasBytes() throws IOException { // Peeks: reads one byte and pushes it back
- int read = stream.read();
- if (read != -1) stream.unread(read);
- return read != -1;
- }
-
- private boolean processComment() throws IOException { // Skips a comment; false if no comment starts here or "-->" is never found
- if (COMMENT_START.matches(stream)) {
- // The two '-' in the '-->' sequence can be the same as those in the '<!--' sequence.
- stream.unread("--".getBytes(StandardCharsets.US_ASCII));
- return COMMENT_END.advanceUntilMatches(stream);
- }
- return false;
- }
-
- private boolean processTag() throws IOException { // Skips an ordinary start or end tag together with its attributes
- if (TAG_START.matches(stream)) {
- TAG_BODY.skipAll(stream); // skip the tag name
- while (getAttribute() != null) {/*ignore the attribute*/}
- return true;
- }
- return false;
- }
-
- private boolean processSpecialTag() throws IOException { // Skips "<!", "</" or "<?" constructs up to the next '>'
- if (SPECIAL_TAG_START.matches(stream)) {
- TAG_BODY.skipAll(stream);
- return TAG_END.advanceUntilMatches(stream);
- }
- return false;
- }
-
- private boolean processMeta() throws IOException { // Parses a <meta> tag; true only when it yields an accepted charset declaration
- if (META_START.matches(stream)) {
- Set<String> attributeNames = new HashSet<>();
- boolean gotPragma = false;
- Boolean needPragma = null; // null: no charset source seen yet; true: value came from "content"; false: value came from "charset"
- CharsetDetectionResult charset = new CharsetDetectionResult();
- while (hasBytes()) {
- Attribute attribute = getAttribute();
- if (attribute == null) break;
- if (attributeNames.contains(attribute.getName())) continue; // only the first occurrence of an attribute name is considered
- attributeNames.add(attribute.getName());
- switch (attribute.getName()) {
- case "http-equiv":
- if (attribute.getValue().equals("content-type"))
- gotPragma = true;
- break;
- case "content":
- String charsetName = getEncodingFromMeta(attribute.getValue());
- if (!charset.isFound() && charsetName != null) {
- charset.find(charsetName);
- needPragma = true;
- }
- break;
- case "charset":
- charset.find(attribute.getValue());
- needPragma = false;
- break;
- default: // Ignore non-charset related attributes
- }
- }
- if (needPragma != null && !(needPragma && !gotPragma)) { // accept unless the value came from "content" without http-equiv="content-type"
- detectedCharset = charset;
- return true;
- }
- }
- return false;
- }
-
- private String getEncodingFromMeta(String attributeValue) { // Extracts the value following "charset=" from a content attribute, or null
- Matcher matcher = META_CHARSET_PATTERN.matcher(attributeValue);
- if (!matcher.find()) return null;
- return matcher.group(2);
- }
-
- private Attribute getAttribute() throws IOException { // Reads the next attribute with lower-cased name and value; null when the tag ends
- SPACE_OR_SLASH.skipAll(stream);
- if (TAG_END.peekMatches(stream)) return null;
- StringBuilder name = new StringBuilder();
- while (!EQUAL.peekMatches(stream) || name.length() == 0) { // an '=' in first position is taken as part of the name
- if (TAG_END_OR_SLASH.peekMatches(stream)) {
- break;
- } else if (SPACE.peekMatches(stream)) {
- SPACE.skipAll(stream);
- break;
- } else {
- name.append(getLowerCaseChar());
- }
- }
-
- if (!EQUAL.matches(stream)) return new Attribute(name.toString(), ""); // no '=' follows: attribute without a value
- SPACE.skipAll(stream);
-
- StringBuilder value = new StringBuilder();
- byte[] quoteMatched = QUOTE.match(stream);
- if (quoteMatched != null) {
- char quote = (char) quoteMatched[0];
- int nextChar = -1;
- while (nextChar != quote) { // quoted value: collect bytes up to the same quote character
- if (nextChar != -1) value.append((char) nextChar);
- nextChar = getLowerCaseChar();
- }
- } else {
- while (!SPACE_OR_TAG_END.peekMatches(stream)) { // unquoted value: ends at whitespace or '>'
- value.append(getLowerCaseChar());
- }
- }
- return new Attribute(name.toString(), value.toString());
- }
-
- private char getLowerCaseChar() throws IOException { // Reads one byte and lower-cases ASCII letters; throws IOException at end of stream
- int nextPoint = stream.read();
- if (nextPoint == -1) throw new IOException();
- if (nextPoint >= 'A' && nextPoint <= 'Z') nextPoint += 0x20; // lowercase
- return (char) nextPoint;
- }
- }
-
- static class Attribute { // Simple name/value holder for one parsed tag attribute
- String name;
- String value;
-
- public Attribute(String name, String value) {
- this.name = name;
- this.value = value;
- }
-
- public String getName() {
- return name;
- }
-
- public String getValue() {
- return value;
- }
- }
-
- /**
- * A detection may either not find a charset, find an invalid charset (found, but null), or find a valid charset
- */
- static class CharsetDetectionResult {
- private boolean found = false;
- private Charset charset = null;
-
- public CharsetDetectionResult() { /* default result: not found */}
-
- public boolean isFound() {
- return found;
- }
-
- public void find(String charsetName) { // Records that a declaration was seen and resolves its label
- this.found = true; // set before the lookup, so an unknown label still counts as found
- charsetName = charsetName.trim();
- if ("x-user-defined".equals(charsetName)) charsetName = "windows-1252"; // NOTE(review): case-sensitive comparison; relies on the caller having lower-cased the value
- this.charset = getCharsetByLabel(charsetName);
- // The specification states: If charset is a UTF-16 encoding, then set charset to UTF-8.
- if (UTF_16LE.equals(charset) || UTF_16BE.equals(charset)) charset = UTF_8;
- }
-
- public Charset getCharset() {
- // the result may be null even if found is true, in the case there is a charset specified,
- // but it is invalid
- return charset;
- }
- }
-
- static abstract class ByteMatcher { // Base type for matchers that consume bytes from a PushbackInputStream
-
- abstract byte[] match(PushbackInputStream pushbackInputStream) throws IOException; // the consumed bytes on a match, otherwise null; the subclasses in this file push back what they read when they do not match
-
- boolean matches(PushbackInputStream pushbackInputStream) throws IOException { // like match, but reports only success; matched bytes stay consumed
- return this.match(pushbackInputStream) != null;
- }
-
- boolean advanceUntilMatches(PushbackInputStream pushbackInputStream) throws IOException { // discards bytes until this matcher matches; false if the stream ends first
- while (!this.matches(pushbackInputStream)) {
- int nextByte = pushbackInputStream.read();
- if (nextByte == -1) return false;
- }
- return true;
- }
-
- void skipAll(PushbackInputStream pushbackInputStream) throws IOException { // consumes consecutive matches until the matcher fails
- while (matches(pushbackInputStream)) {/* just skip the byte */}
- }
-
- public boolean peekMatches(PushbackInputStream pushbackInputStream) throws IOException { // tests for a match and pushes the matched bytes back
- byte[] matched = this.match(pushbackInputStream);
- if (matched != null) pushbackInputStream.unread(matched);
- return matched != null;
- }
- }
-
- static class SingleByteMatcher extends ByteMatcher { // Matches exactly one given byte value
- private byte b;
-
- public SingleByteMatcher(byte b) {
- this.b = b;
- }
-
- public static ByteMatcher[] matchers(String s) { // one matcher per char of s, each char narrowed to a byte
- return matchers(s.chars());
- }
-
- public static ByteMatcher[] matchers(int... bytes) { // one matcher per given value, each narrowed to a byte
- return matchers(IntStream.of(bytes));
- }
-
- public static ByteMatcher[] matchers(IntStream byteStream) {
- return byteStream
- .mapToObj(i -> new SingleByteMatcher((byte) i))
- .toArray(ByteMatcher[]::new);
- }
-
- @Override
- byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
- int read = pushbackInputStream.read();
- if ((byte) read == b) return new byte[]{b}; // NOTE(review): read == -1 (end of stream) narrows to (byte) 0xFF, so a matcher for 0xFF also matches end of stream
- if (read != -1) pushbackInputStream.unread(read);
- return null;
- }
- }
-
- static class SequenceMatcher extends ByteMatcher { // Matches when all wrapped matchers match one after another
- private ByteMatcher[] matchers;
-
- public SequenceMatcher(ByteMatcher... matchers) {
- this.matchers = matchers;
- }
-
- public SequenceMatcher(String s) { // literal byte sequence, one single-byte matcher per char of s
- this(matchers(s));
- }
-
- public static SequenceMatcher caseInsensitive(String s) { // each position accepts the lower-case or the upper-case form of the char
- ByteMatcher[] lowerMatchers = matchers(s.toLowerCase(Locale.US));
- ByteMatcher[] upperMatchers = matchers(s.toUpperCase(Locale.US));
- OrMatcher[] matchers = IntStream
- .range(0, Math.min(lowerMatchers.length, upperMatchers.length))
- .mapToObj(i -> new OrMatcher(lowerMatchers[i], upperMatchers[i]))
- .toArray(OrMatcher[]::new);
- return new SequenceMatcher(matchers);
- }
-
- @Override
- byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
- ByteArrayOutputStream allMatched = new ByteArrayOutputStream(); // bytes consumed so far, kept so they can be pushed back on failure
- for (ByteMatcher m : matchers) {
- byte[] matched = m.match(pushbackInputStream);
- if (matched == null) {
- pushbackInputStream.unread(allMatched.toByteArray()); // partial match: restore everything consumed by earlier matchers
- return null;
- } else {
- allMatched.write(matched);
- }
- }
- return allMatched.toByteArray();
- }
- }
-
- static class OrMatcher extends ByteMatcher { // Matches when any wrapped matcher matches; alternatives are tried in order
- private ByteMatcher[] matchers;
-
- public OrMatcher(ByteMatcher... matchers) {
- this.matchers = matchers;
- }
-
- @Override
- byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
- for (ByteMatcher m : matchers) {
- byte[] matched = m.match(pushbackInputStream);
- if (matched != null) return matched; // first alternative that matches wins
- }
- return null;
- }
- }
-
- static class NegativeMatcher extends ByteMatcher { // Matches one byte at a position where the wrapped matcher does not match
- private ByteMatcher matcher;
-
- public NegativeMatcher(ByteMatcher matcher) {
- this.matcher = matcher;
- }
-
- @Override
- byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
- byte[] matched = matcher.match(pushbackInputStream);
- if (matched == null) {
- int read = pushbackInputStream.read(); // wrapped matcher failed: consume exactly one byte
- if (read == -1) return null; // nothing left to consume at end of stream
- return new byte[]{(byte) read};
- } else {
- pushbackInputStream.unread(matched); // wrapped matcher succeeded: restore its bytes and report no match
- return null;
- }
- }
- }
-
- static class RangeMatcher extends ByteMatcher { // Matches one byte whose value lies in [low, high], bounds included
- private byte low;
- private byte high;
-
- public RangeMatcher(byte low, byte high) {
- this.low = low;
- this.high = high;
- }
-
-
- @Override
- byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
- int read = pushbackInputStream.read();
- if (read >= low && read <= high) return new byte[]{(byte) read}; // bounds are signed bytes compared with an int in 0..255 (or -1), so only ranges below 0x80 behave as expected
- if (read != -1) pushbackInputStream.unread(read);
- return null;
- }
- }
-}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java
new file mode 100644
index 0000000..4d4c7c2
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html.charsetdetector;
+
+
+import org.apache.tika.parser.html.charsetdetector.charsets.ReplacementCharset;
+import org.apache.tika.parser.html.charsetdetector.charsets.XUserDefinedCharset;
+
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.StandardCharsets;
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+
+/**
+ * Non-instantiable holder of a static table that associates standard charset labels with java charset implementations
+ * https://encoding.spec.whatwg.org/#ref-for-iso-8859-8-i
+ */
+final class CharsetAliases {
+
+ private static final Map<String, Charset> charsetsByLabel = new HashMap<>(); // filled lazily on the first lookup; keys are lower-case labels
+
+ private CharsetAliases() {
+ }
+
+ /**
+ * @param label a charset name; it is trimmed and lower-cased before the lookup
+ * @return the corresponding java charset, if there is one. Otherwise (or if label is null), null
+ */
+ static Charset getCharsetByLabel(String label) {
+ if (label == null) return null;
+ synchronized (charsetsByLabel) {
+ // Lazy initialization
+ if (charsetsByLabel.isEmpty()) addAll();
+ }
+ label = label.trim().toLowerCase(Locale.US);
+ return charsetsByLabel.get(label); // read outside the lock; the map is only written by addAll, which runs inside it
+ }
+
+ private static void addAll() { // Registers every supported label; the first argument of each call is the java charset used for the labels that follow
+ addCharset(charset("Big5"), "big5", "big5-hkscs", "cn-big5", "csbig5", "x-x-big5");
+ addCharset(charset("EUC-JP"), "cseucpkdfmtjapanese", "euc-jp", "x-euc-jp");
+ addCharset(charset("EUC-KR"), "cseuckr", "csksc56011987", "euc-kr", "iso-ir-149", "korean",
+ "ks_c_5601-1987", "ks_c_5601-1989", "ksc5601", "ksc_5601", "windows-949");
+ addCharset(charset("GBK"), "chinese", "csgb2312", "csiso58gb231280", "gb2312", "gb_2312",
+ "gb_2312-80", "gbk", "iso-ir-58", "x-gbk");
+ addCharset(charset("IBM866"), "866", "cp866", "csibm866", "ibm866");
+ addCharset(charset("ISO-2022-JP"), "csiso2022jp", "iso-2022-jp");
+ addCharset(charset("ISO-8859-10", "ISO-8859-4"), "csisolatin6", "iso-8859-10", "iso-ir-157", // ISO-8859-4 is used when the JVM lacks ISO-8859-10
+ "iso8859-10", "iso885910", "l6", "latin6");
+ addCharset(charset("ISO-8859-13"), "iso-8859-13", "iso8859-13", "iso885913");
+ addCharset(charset("ISO-8859-14", "ISO-8859-1"), "iso-8859-14", "iso8859-14", "iso885914");
+ addCharset(charset("ISO-8859-15"), "csisolatin9", "iso-8859-15", "iso8859-15", "iso885915",
+ "iso_8859-15", "l9");
+ addCharset(charset("ISO-8859-16", "ISO-8859-1"), "iso-8859-16");
+ addCharset(charset("ISO-8859-2"), "csisolatin2", "iso-8859-2", "iso-ir-101", "iso8859-2",
+ "iso88592", "iso_8859-2", "iso_8859-2:1987", "l2", "latin2");
+ addCharset(charset("ISO-8859-3"), "csisolatin3", "iso-8859-3", "iso-ir-109", "iso8859-3",
+ "iso88593", "iso_8859-3", "iso_8859-3:1988", "l3", "latin3");
+ addCharset(charset("ISO-8859-4"), "csisolatin4", "iso-8859-4", "iso-ir-110", "iso8859-4",
+ "iso88594", "iso_8859-4", "iso_8859-4:1988", "l4", "latin4");
+ addCharset(charset("ISO-8859-5"), "csisolatincyrillic", "cyrillic", "iso-8859-5", "iso-ir-144",
+ "iso8859-5", "iso88595", "iso_8859-5", "iso_8859-5:1988");
+ addCharset(charset("ISO-8859-6"), "arabic", "asmo-708", "csiso88596e", "csiso88596i",
+ "csisolatinarabic", "ecma-114", "iso-8859-6", "iso-8859-6-e", "iso-8859-6-i", "iso-ir-127",
+ "iso8859-6", "iso88596", "iso_8859-6", "iso_8859-6:1987");
+ addCharset(charset("ISO-8859-7"), "csisolatingreek", "ecma-118", "elot_928", "greek", "greek8",
+ "iso-8859-7", "iso-ir-126", "iso8859-7", "iso88597", "iso_8859-7", "iso_8859-7:1987", "sun_eu_greek");
+ // ISO-8859-8 actually should have an influence on the layout direction
+ // (text should be decoded in the visual order). However, this is not implemented in tika.
+ addCharset(charset("ISO-8859-8"), "csiso88598e", "csisolatinhebrew", "hebrew", "iso-8859-8",
+ "iso-8859-8-e", "iso-ir-138", "iso8859-8", "iso88598", "iso_8859-8", "iso_8859-8:1988", "visual");
+ addCharset(charset("ISO-8859-8-I", "ISO-8859-8"), "csiso88598i", "iso-8859-8-i", "logical");
+ addCharset(charset("KOI8-R"), "cskoi8r", "koi", "koi8", "koi8-r", "koi8_r");
+ addCharset(charset("KOI8-U"), "koi8-ru", "koi8-u");
+ addCharset(charset("Shift_JIS"), "csshiftjis", "ms932", "ms_kanji", "shift-jis", "shift_jis",
+ "sjis", "windows-31j", "x-sjis");
+ addCharset(charset("UTF-16BE"), "utf-16be");
+ addCharset(charset("UTF-16LE"), "utf-16", "utf-16le"); // the bare "utf-16" label is mapped to little-endian
+ addCharset(charset("UTF-8"), "unicode-1-1-utf-8", "utf-8", "utf8");
+ addCharset(charset("gb18030"), "gb18030");
+ addCharset(charset("windows-1250"), "cp1250", "windows-1250", "x-cp1250");
+ addCharset(charset("windows-1251"), "cp1251", "windows-1251", "x-cp1251");
+ addCharset(charset("windows-1252"), "ansi_x3.4-1968", "ascii", "cp1252", "cp819", "csisolatin1", // ASCII and ISO-8859-1 labels resolve to windows-1252 here
+ "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", "iso88591", "iso_8859-1", "iso_8859-1:1987",
+ "l1", "latin1", "us-ascii", "windows-1252", "x-cp1252");
+ addCharset(charset("windows-1253"), "cp1253", "windows-1253", "x-cp1253");
+ addCharset(charset("windows-1254"), "cp1254", "csisolatin5", "iso-8859-9", "iso-ir-148",
+ "iso8859-9", "iso88599", "iso_8859-9", "iso_8859-9:1989", "l5", "latin5", "windows-1254", "x-cp1254");
+ addCharset(charset("windows-1255"), "cp1255", "windows-1255", "x-cp1255");
+ addCharset(charset("windows-1256"), "cp1256", "windows-1256", "x-cp1256");
+ addCharset(charset("windows-1257"), "cp1257", "windows-1257", "x-cp1257");
+ addCharset(charset("windows-1258"), "cp1258", "windows-1258", "x-cp1258");
+ addCharset(charset("windows-874"), "dos-874", "iso-8859-11", "iso8859-11", "iso885911",
+ "tis-620", "windows-874");
+ addCharset(charset("x-MacCyrillic"), "x-mac-cyrillic", "x-mac-ukrainian");
+ addCharset(charset("x-MacRoman"), "csmacintosh", "mac", "macintosh", "x-mac-roman");
+ // The "replacement" charset is a dummy charset. It is present to mitigate wrong-charset attacks
+ addCharset(new ReplacementCharset(), "csiso2022kr", "hz-gb-2312", "iso-2022-cn", "iso-2022-cn-ext",
+ "iso-2022-kr", "replacement");
+ // The x-user-defined charset is not present in java
+ addCharset(new XUserDefinedCharset(), "x-user-defined");
+ }
+
+ /**
+ * @param names jvm charset names, in order of preference
+ * @return the first of the given charsets that exists in the current JVM,
+ * or ISO_8859_1 if none exists
+ */
+ private static Charset charset(String... names) {
+ for (String name : names) {
+ try {
+ return Charset.forName(name);
+ } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {/* pass */}
+ }
+ // Fallback: ISO-8859-1 is the only extended single-byte charset that must be present on every Java platform
+ return StandardCharsets.ISO_8859_1;
+ }
+
+ /**
+ * @param charset the java charset to register
+ * @param names standard charset labels (lower-case) that should resolve to this charset
+ */
+ private static void addCharset(Charset charset, String... names) {
+ for (String name : names) {
+ charsetsByLabel.put(name, charset);
+ }
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetDetectionResult.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetDetectionResult.java
new file mode 100644
index 0000000..0ba3637
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetDetectionResult.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html.charsetdetector;
+
+import java.nio.charset.Charset;
+
+import static java.nio.charset.StandardCharsets.UTF_16BE;
+import static java.nio.charset.StandardCharsets.UTF_16LE;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+
+/**
+ * A detection may either not find a charset, find an invalid charset (found, but null), or find a valid charset
+ */
+class CharsetDetectionResult {
+ private boolean found = false;
+ private Charset charset = null;
+
+ private CharsetDetectionResult() { /* default result: not found */}
+
+ static CharsetDetectionResult notFound() { // the only way to create an instance; it starts in the "not found" state
+ return new CharsetDetectionResult();
+ }
+
+ public boolean isFound() {
+ return found;
+ }
+
+ public void find(String charsetName) { // Records that a declaration was seen and resolves its label through CharsetAliases
+ this.found = true; // set before the lookup, so an unknown label still counts as found
+ charsetName = charsetName.trim();
+ if ("x-user-defined".equals(charsetName)) charsetName = "windows-1252"; // NOTE(review): case-sensitive, and it means the x-user-defined alias is not reached from here for lower-case input -- confirm intended
+ this.charset = CharsetAliases.getCharsetByLabel(charsetName);
+ // The specification states: If charset is a UTF-16 encoding, then set charset to UTF-8.
+ if (UTF_16LE.equals(charset) || UTF_16BE.equals(charset)) charset = UTF_8;
+ }
+
+ public Charset getCharset() {
+ // the result may be null even if found is true, in the case there is a charset specified,
+ // but it is invalid
+ return charset;
+ }
+
+ public void setCharset(Charset charset) { // Stores an already resolved charset (null allowed) and marks the result as found
+ this.found = true;
+ this.charset = charset;
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/MetaProcessor.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/MetaProcessor.java
new file mode 100644
index 0000000..8ce250c
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/MetaProcessor.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html.charsetdetector;
+
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import static org.apache.tika.parser.html.charsetdetector.PreScanner.getEncodingFromMeta;
+
+
+/**
+ * A class to process the attributes of an HTML meta tag in order to extract a character set.
+ * The user should repeatedly call {@link #processAttribute} on each attribute of the tag,
+ * then update its current detection result with {@link #updateDetectedCharset(CharsetDetectionResult)}
+ * <p>
+ * The algorithm implemented is meant to match the one described in the HTML standard here:
+ * https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
+ */
+class MetaProcessor {
+ private Set<String> attributeNames = new HashSet<>(); // names already seen on this tag, used to skip duplicates
+ private boolean gotPragma = false; // true once http-equiv="content-type" has been seen
+ private Boolean needPragma = null; // needPragma can be null, true, or false
+ private CharsetDetectionResult detectionResult = CharsetDetectionResult.notFound();
+
+ void updateDetectedCharset(CharsetDetectionResult currentDetectionResult) { // Copies this tag's charset into the given result when the declaration is acceptable
+ if (detectionResult.isFound() &&
+ needPragma != null &&
+ !(needPragma && !gotPragma)) { // reject a value taken from "content" when http-equiv="content-type" was absent
+ currentDetectionResult.setCharset(detectionResult.getCharset()); // may store null when the declared label was unknown
+ }
+ }
+
+ void processAttribute(Map.Entry<String, String> attribute) { // key is the attribute name, value the attribute value
+ // Ignore duplicate attributes
+ if (attributeNames.contains(attribute.getKey())) return;
+
+ attributeNames.add(attribute.getKey());
+
+ // Handle charset-related attributes
+ switch (attribute.getKey()) {
+ case "http-equiv":
+ if (attribute.getValue().equals("content-type")) // NOTE(review): case-sensitive; presumably the caller lower-cases names and values -- confirm
+ gotPragma = true;
+ break;
+ case "content":
+ String charsetName = getEncodingFromMeta(attribute.getValue()); // helper statically imported from PreScanner; its body is not visible here
+ if (!detectionResult.isFound() && charsetName != null) {
+ detectionResult.find(charsetName);
+ needPragma = true;
+ }
+ break;
+ case "charset":
+ detectionResult.find(attribute.getValue());
+ needPragma = false;
+ break;
+ default: // Ignore non-charset related attributes
+ }
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/PreScanner.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/PreScanner.java
new file mode 100644
index 0000000..a00aeb1
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/PreScanner.java
@@ -0,0 +1,270 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html.charsetdetector;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.AbstractMap;
+import java.util.BitSet;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * A scanner meant to detect charset meta tags in a byte stream
+ * See: https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
+ */
+class PreScanner {
+
+    private static final Pattern CHARSET_PATTERN = Pattern.compile("charset\\s*=\\s*([\"']?)([^\"'\\s;]+)\\1");
+    private static final byte[] COMMENT_START = {(byte) '<', (byte) '!', (byte) '-', (byte) '-'};
+    private static final byte[] COMMENT_END = {(byte) '-', (byte) '-', (byte) '>'};
+    private static final byte[] META_TAG_START = {(byte) '<', (byte) 'm', (byte) 'e', (byte) 't', (byte) 'a'};
+    private static final byte DASH = (byte) '-';
+    private static final byte SLASH = (byte) '/';
+    private static final byte EQUAL = (byte) '=';
+    private static final byte TAG_START = (byte) '<';
+    private static final byte TAG_END = (byte) '>';
+    private static final BitSet QUOTE = bitSet('"', '\'');
+
+    // Tab, line feed, form feed, carriage return and space
+    private static final BitSet WHITESPACE = bitSet(0x09, 0x0A, 0x0C, 0x0D, 0x20);
+    private static final BitSet SPACE_OR_TAG_END = bitSet(WHITESPACE, TAG_END);
+    private static final BitSet SPACE_OR_SLASH = bitSet(WHITESPACE, SLASH);
+    private static final BitSet SPECIAL_TAGS = bitSet('!', '/', '?');
+
+    private static final byte[] UTF8_BOM = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};
+    private static final byte[] UTF16_BE_BOM = {(byte) 0xFE, (byte) 0xFF};
+    private static final byte[] UTF16_LE_BOM = {(byte) 0xFF, (byte) 0xFE};
+    private static final byte LOWER_A = (byte) 'a';
+    private static final byte LOWER_Z = (byte) 'z';
+    private static final byte UPPER_A = (byte) 'A';
+    private static final byte UPPER_Z = (byte) 'Z';
+    private final BufferedInputStream stream;
+    private final CharsetDetectionResult detectedCharset = CharsetDetectionResult.notFound();
+
+    /**
+     * @param inputStream the bytes to scan; it is wrapped in a {@link BufferedInputStream}
+     *                    so that mark/reset can be used for look-ahead
+     */
+    PreScanner(InputStream inputStream) {
+        this.stream = new BufferedInputStream(inputStream);
+    }
+
+    /** Builds a set containing the given byte values (expressed as ints in the range 0-255). */
+    private static BitSet bitSet(int... bs) {
+        BitSet bitSet = new BitSet(0x100); // one bit per possible unsigned byte value
+        for (int b : bs) bitSet.set(b);
+        return bitSet;
+    }
+
+    /** Builds a copy of {@code base} to which the given byte values are added. */
+    private static BitSet bitSet(BitSet base, int... bs) {
+        BitSet bitSet = (BitSet) base.clone();
+        for (int b : bs) bitSet.set(b);
+        return bitSet;
+    }
+
+    /**
+     * Extracts the charset name from a string containing {@code charset=...},
+     * such as the value of the content attribute of a meta tag.
+     *
+     * @param attributeValue the string to search
+     * @return the first charset name found, without its optional quotes,
+     * or null if the string contains no charset declaration
+     */
+    static String getEncodingFromMeta(String attributeValue) {
+        Matcher matcher = CHARSET_PATTERN.matcher(attributeValue);
+        if (!matcher.find()) return null;
+        return matcher.group(2);
+    }
+
+    /** Tells whether the byte, read as an unsigned value, belongs to the set. */
+    private static boolean contains(BitSet bitSet, byte b) {
+        return bitSet.get(b & 0xFF);
+    }
+
+    /**
+     * Scans the stream until a meta tag declares a charset or the stream cannot be read further.
+     *
+     * @return the charset held by the detection result as soon as one is marked as found
+     * (this is whatever {@code getCharset()} returns), or null if the scan ends before that
+     */
+    Charset scan() {
+        while (processAtLeastOneByte()) {
+            if (detectedCharset.isFound()) {
+                return detectedCharset.getCharset();
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Looks for a byte order mark at the current position of the stream.
+     * A byte order mark that is recognized is consumed.
+     *
+     * @return UTF-8, UTF-16BE or UTF-16LE depending on the mark found, or null if there is none
+     */
+    Charset detectBOM() {
+        try {
+            if (expect(UTF8_BOM)) return StandardCharsets.UTF_8;
+            else if (expect(UTF16_BE_BOM)) return StandardCharsets.UTF_16BE;
+            else if (expect(UTF16_LE_BOM)) return StandardCharsets.UTF_16LE;
+        } catch (IOException e) { /* stream could not be read, also return null */ }
+        return null;
+    }
+
+    /**
+     * Consumes the next syntactic element of the stream: a comment, a meta tag,
+     * another tag, a special tag, or failing that a single byte.
+     *
+     * @return false when the end of the stream was reached or the stream could not be read
+     */
+    private boolean processAtLeastOneByte() {
+        try {
+            // The order matters: "<!--" would otherwise be taken for a special tag,
+            // and "<meta" for an ordinary tag
+            return processComment() ||
+                    processMeta() ||
+                    processTag() ||
+                    processSpecialTag() ||
+                    processAny();
+        } catch (IOException e) {
+            return false;
+        }
+    }
+
+    /** Consumes a single byte; returns false at the end of the stream. */
+    private boolean processAny() throws IOException {
+        int read = stream.read();
+        return read != -1;
+    }
+
+    /**
+     * Consumes an opening or closing tag whose name starts with an ASCII letter,
+     * together with all of its attributes. If the stream does not start with such a tag,
+     * nothing is consumed and false is returned.
+     */
+    private boolean processTag() throws IOException {
+        stream.mark(3); // at most '<', '/' and the first byte of the name are read before a reset
+        if (read() == TAG_START) {
+            int read = stream.read();
+            if (read == SLASH) read = stream.read();
+            if ((LOWER_A <= read && read <= LOWER_Z) ||
+                    (UPPER_A <= read && read <= UPPER_Z)) {
+                // Skip the rest of the tag name, then unread the byte that terminated it
+                do stream.mark(1);
+                while (!contains(SPACE_OR_TAG_END, read()));
+                stream.reset();
+                while (getAttribute() != null) {/* ignore the attribute*/}
+                return true;
+            }
+        }
+        stream.reset();
+        return false;
+    }
+
+    /**
+     * Consumes a tag starting with "&lt;!", "&lt;/" or "&lt;?", up to and including the next '&gt;'.
+     * If the stream does not start with such a tag, nothing is consumed and false is returned.
+     */
+    private boolean processSpecialTag() throws IOException {
+        stream.mark(2);
+        if (read() == TAG_START && contains(SPECIAL_TAGS, read())) {
+            skipUntil(TAG_END);
+            return true;
+        }
+        stream.reset();
+        return false;
+    }
+
+    /**
+     * Consumes a meta tag and updates the detection result with the charset it declares, if any.
+     * If the stream does not start with a meta tag, nothing is consumed and false is returned.
+     */
+    private boolean processMeta() throws IOException {
+        stream.mark(6); // len("<meta ") == 6
+        if (readCaseInsensitive(META_TAG_START) && contains(SPACE_OR_SLASH, read())) {
+            MetaProcessor metaProcessor = new MetaProcessor();
+            for (Map.Entry<String, String> attribute = getAttribute(); attribute != null; attribute = getAttribute()) {
+                metaProcessor.processAttribute(attribute);
+            }
+            metaProcessor.updateDetectedCharset(detectedCharset);
+            return true;
+        }
+        stream.reset();
+        return false;
+    }
+
+    /**
+     * Read an attribute from the stream
+     *
+     * @return the attribute as a Map.Entry, where the key is the attribute's name and
+     * the value is the attribute's value. If there is no attribute, return null
+     */
+    private Map.Entry<String, String> getAttribute() throws IOException {
+        String name = getAttributeName();
+        if (name == null) return null;
+
+        // An attribute that is not followed by '=' has an empty value
+        if (!expect(EQUAL)) return new AbstractMap.SimpleEntry<>(name, "");
+        skipAll(WHITESPACE);
+
+        String value = getAttributeValue();
+        return new AbstractMap.SimpleEntry<>(name, value);
+    }
+
+    /**
+     * Reads a lowercased attribute name, and the whitespace that follows it.
+     *
+     * @return the name, or null if the end of the tag was reached (the '&gt;' is then consumed)
+     */
+    private String getAttributeName() throws IOException {
+        skipAll(SPACE_OR_SLASH);
+        if (expect(TAG_END)) return null;
+        StringBuilder name = new StringBuilder();
+        // A leading '=' is part of the name; after that, '=', '>', '/' or whitespace ends the name
+        while (!(peek() == EQUAL && name.length() > 0) &&
+                !(peek() == TAG_END || peek() == SLASH) &&
+                !skipAll(WHITESPACE)) {
+            // Mask the byte so that values above 0x7F are not sign-extended
+            name.append((char) (getLowerCaseChar() & 0xFF));
+        }
+        return name.toString();
+    }
+
+    /**
+     * Reads a lowercased attribute value, quoted or not. The closing quote of a quoted value
+     * is consumed; the byte that ends an unquoted value is left in the stream.
+     */
+    private String getAttributeValue() throws IOException {
+        StringBuilder value = new StringBuilder();
+        stream.mark(1);
+        byte quote = read();
+        if (contains(QUOTE, quote)) {
+            for (byte b = getLowerCaseChar(); b != quote; b = getLowerCaseChar()) {
+                value.append((char) (b & 0xFF)); // masked to avoid sign extension
+            }
+        } else {
+            stream.reset();
+            for (byte b = getLowerCaseChar(); !contains(SPACE_OR_TAG_END, b); b = getLowerCaseChar()) {
+                value.append((char) (b & 0xFF)); // masked to avoid sign extension
+                stream.mark(1);
+            }
+            stream.reset(); // unread the space or tag end
+        }
+        return value.toString();
+    }
+
+    /**
+     * Consumes all the leading bytes that belong to the set.
+     *
+     * @return true if at least one byte was consumed
+     */
+    private boolean skipAll(BitSet bitSet) throws IOException {
+        boolean skipped = false;
+        stream.mark(1);
+        for (byte read = read(); contains(bitSet, read); read = read()) {
+            skipped = true;
+            stream.mark(1);
+        }
+        stream.reset();
+        return skipped;
+    }
+
+    /** Reads a byte, converting ASCII uppercase letters to lowercase. */
+    private byte getLowerCaseChar() throws IOException {
+        byte nextPoint = read();
+        if (nextPoint >= 'A' && nextPoint <= 'Z') nextPoint += 0x20; // lowercase
+        return nextPoint;
+    }
+
+    /**
+     * Consumes a comment, up to and including its closing "--&gt;".
+     * If the stream does not start with "&lt;!--", nothing is consumed and false is returned.
+     */
+    private boolean processComment() throws IOException {
+        if (!expect(COMMENT_START)) return false;
+        // The dashes of "<!--" may also serve as the dashes of the closing "-->",
+        // so both "<!-->" and "<!--->" are complete comments
+        if (!expect(TAG_END) && !expect(DASH, TAG_END)) skipUntil(COMMENT_END);
+        return true;
+    }
+
+    /**
+     * Consumes the expected bytes if the stream starts with them.
+     *
+     * @return true if the bytes were found and consumed; false, with the stream position
+     * unchanged, if the stream starts with something else
+     * @throws IOException if the end of the stream is reached before the comparison is decided
+     */
+    private boolean expect(byte... expected) throws IOException {
+        stream.mark(expected.length);
+        for (byte b : expected) {
+            byte read = read();
+            if (read != b) {
+                stream.reset();
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /** Consumes bytes up to and including the first occurrence of the expected sequence. */
+    private void skipUntil(byte... expected) throws IOException {
+        while (!expect(expected)) {
+            if (stream.read() == -1) return;
+        }
+    }
+
+    /**
+     * Compares the next bytes, once lowercased, with the given lowercase bytes.
+     * The bytes read are not unread on mismatch: callers mark the stream beforehand.
+     */
+    private boolean readCaseInsensitive(byte... bs) throws IOException {
+        for (byte b : bs) if (getLowerCaseChar() != b) return false;
+        return true;
+    }
+
+    /**
+     * Reads one byte.
+     *
+     * @throws IOException if the end of the stream has been reached
+     */
+    private byte read() throws IOException {
+        int r = stream.read();
+        if (r == -1) throw new IOException();
+        return (byte) r;
+    }
+
+    /** Returns the next byte without consuming it. */
+    private byte peek() throws IOException {
+        stream.mark(1);
+        byte b = read();
+        stream.reset();
+        return b;
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/StandardHtmlEncodingDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/StandardHtmlEncodingDetector.java
new file mode 100644
index 0000000..f9d1a1b
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/StandardHtmlEncodingDetector.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html.charsetdetector;
+
+import org.apache.commons.io.input.BoundedInputStream;
+import org.apache.tika.config.Field;
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+
+import static org.apache.tika.parser.html.charsetdetector.CharsetAliases.getCharsetByLabel;
+
+/**
+ * An encoding detector that tries to respect the spirit of the HTML spec
+ * part 12.2.3 "The input byte stream", or at least the part that is compatible with
+ * the implementation of tika.
+ * <p>
+ * https://html.spec.whatwg.org/multipage/parsing.html#the-input-byte-stream
+ * <p>
+ * If a resource was fetched over HTTP, then HTTP headers should be added to tika metadata
+ * when using {@link #detect}, especially {@link Metadata#CONTENT_TYPE}, as it may contain charset information.
+ * <p>
+ * This encoding detector may return null if no encoding is detected.
+ * It is meant to be used inside a {@link org.apache.tika.detect.CompositeEncodingDetector}.
+ * For instance:
+ * <pre> {@code
+ * EncodingDetector detector = new CompositeEncodingDetector(
+ *     Arrays.asList(
+ *         new StandardHtmlEncodingDetector(),
+ *         new Icu4jEncodingDetector()));
+ * }</pre>
+ * <p>
+ */
+public final class StandardHtmlEncodingDetector implements EncodingDetector {
+    private static final int META_TAG_BUFFER_SIZE = 8192;
+
+    @Field
+    private int markLimit = META_TAG_BUFFER_SIZE;
+
+    /**
+     * Extracts a charset from a Content-Type HTTP header.
+     *
+     * @param metadata parser metadata
+     * @return a charset if there is one specified, or null
+     */
+    private static Charset charsetFromContentType(Metadata metadata) {
+        String contentType = metadata.get(Metadata.CONTENT_TYPE);
+        MediaType mediatype = MediaType.parse(contentType);
+        if (mediatype == null) return null;
+        String charsetLabel = mediatype.getParameters().get("charset");
+        return getCharsetByLabel(charsetLabel);
+    }
+
+    /**
+     * Detects the charset of an HTML document from, in order of priority, its byte order mark,
+     * the Content-Type found in the metadata, and its meta tags.
+     * The stream is marked before detection and reset afterwards.
+     *
+     * @param input    the document stream, which must support mark/reset; when it is null,
+     *                 only the Content-Type found in the metadata is considered
+     * @param metadata the document metadata
+     * @return the detected charset, or null if none was detected
+     * @throws IOException if the stream cannot be reset
+     */
+    @Override
+    public Charset detect(InputStream input, Metadata metadata) throws IOException {
+        // Without a stream there is neither a BOM nor a <meta> tag to look at
+        if (input == null) return charsetFromContentType(metadata);
+
+        int limit = getMarkLimit();
+        input.mark(limit);
+        try {
+            // Never read more than the first markLimit bytes, so that the mark stays valid
+            InputStream limitedStream = new BoundedInputStream(input, limit);
+            PreScanner preScanner = new PreScanner(limitedStream);
+
+            // The order of priority for detection is:
+            // 1. Byte Order Mark
+            Charset detectedCharset = preScanner.detectBOM();
+            // 2. Transport-level information (Content-Type HTTP header)
+            if (detectedCharset == null) detectedCharset = charsetFromContentType(metadata);
+            // 3. HTML <meta> tag
+            if (detectedCharset == null) detectedCharset = preScanner.scan();
+
+            return detectedCharset;
+        } finally {
+            // Give the stream back at its initial position even if detection failed
+            input.reset();
+        }
+    }
+
+    /**
+     * @return how far into the stream detection is allowed to read, in bytes
+     */
+    public int getMarkLimit() {
+        return markLimit;
+    }
+
+    /**
+     * How far into the stream to read for charset detection.
+     * Default is 8192.
+     */
+    @Field
+    public void setMarkLimit(int markLimit) {
+        this.markLimit = markLimit;
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/charsets/ReplacementCharset.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/charsets/ReplacementCharset.java
new file mode 100644
index 0000000..32b96cf
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/charsets/ReplacementCharset.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html.charsetdetector.charsets;
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CoderResult;
+
+/**
+ * An implementation of the standard "replacement" charset defined by the W3C.
+ * See: https://encoding.spec.whatwg.org/#replacement
+ */
+public class ReplacementCharset extends Charset {
+
+ public ReplacementCharset() {
+ super("replacement", null);
+ }
+
+ @Override
+ public boolean contains(Charset cs) {
+ return cs.equals(this);
+ }
+
+ public CharsetDecoder newDecoder() {
+ return new CharsetDecoder(this, Float.MIN_VALUE, 1) {
+ private boolean replacementErrorReturned = false;
+
+ @Override
+ protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
+ if (in.hasRemaining() && !replacementErrorReturned) {
+ replacementErrorReturned = true;
+ return CoderResult.malformedForLength(in.remaining());
+ }
+ in.position(in.limit());
+ return CoderResult.UNDERFLOW;
+ }
+
+ @Override
+ protected void implReset() {
+ replacementErrorReturned = false;
+ }
+ };
+ }
+
+ public CharsetEncoder newEncoder() {
+ throw new UnsupportedOperationException("This charset does not support encoding");
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/charsets/XUserDefinedCharset.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/charsets/XUserDefinedCharset.java
new file mode 100644
index 0000000..650694a
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/charsets/XUserDefinedCharset.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html.charsetdetector.charsets;
+
+import org.apache.commons.lang.NotImplementedException;
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.StandardCharsets;
+
+/**
+ * A decoding-only implementation of the "x-user-defined" charset: bytes from 0x00 to 0x7F
+ * are decoded as the code point of the same value, and bytes from 0x80 to 0xFF as the
+ * code points U+F780 to U+F7FF.
+ */
+public class XUserDefinedCharset extends Charset {
+
+    public XUserDefinedCharset() {
+        super("x-user-defined", null);
+    }
+
+    /**
+     * @return true for this charset itself and for US-ASCII
+     */
+    @Override
+    public boolean contains(Charset cs) {
+        // Every charset is required to contain itself
+        return cs.equals(this) || cs.equals(StandardCharsets.US_ASCII);
+    }
+
+    /**
+     * This charset can only decode.
+     *
+     * @return false, consistently with {@link #newEncoder()} being unsupported
+     */
+    @Override
+    public boolean canEncode() {
+        return false;
+    }
+
+    /**
+     * Creates a decoder producing exactly one char per input byte.
+     */
+    @Override
+    public CharsetDecoder newDecoder() {
+        return new CharsetDecoder(this, 1, 1) {
+            @Override
+            protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
+                while (true) {
+                    if (!in.hasRemaining()) return CoderResult.UNDERFLOW;
+                    // Checked before consuming a byte, so that no input is lost on overflow
+                    if (!out.hasRemaining()) return CoderResult.OVERFLOW;
+                    byte b = in.get();
+                    // A negative byte is an unsigned value from 0x80 to 0xFF
+                    out.append((char) ((b >= 0) ? b : 0xF700 + (b & 0xFF)));
+                }
+            }
+        };
+    }
+
+    /**
+     * @throws NotImplementedException always, as encoding is not implemented
+     */
+    @Override
+    public CharsetEncoder newEncoder() {
+        throw new NotImplementedException("Encoding to x-user-defined is not implemented");
+    }
+}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/html/StrictHtmlEncodingDetectorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java
similarity index 77%
rename from tika-parsers/src/test/java/org/apache/tika/parser/html/StrictHtmlEncodingDetectorTest.java
rename to tika-parsers/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java
index 1c0da8d..4311887 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/html/StrictHtmlEncodingDetectorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java
@@ -19,17 +19,23 @@ package org.apache.tika.parser.html;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector;
+import org.apache.tika.parser.html.charsetdetector.charsets.ReplacementCharset;
import org.junit.Before;
-import org.junit.Ignore;
import org.junit.Test;
-import java.io.*;
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.SequenceInputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
+import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
-public class StrictHtmlEncodingDetectorTest {
+public class StandardHtmlEncodingDetectorTest {
private Metadata metadata = new Metadata();
@Before
@@ -39,6 +45,11 @@ public class StrictHtmlEncodingDetectorTest {
@Test
public void basic() throws IOException {
+ assertWindows1252("<meta charset=WINDOWS-1252>");
+ }
+
+ @Test
+ public void quoted() throws IOException {
assertWindows1252("<meta charset='WINDOWS-1252'>");
}
@@ -49,6 +60,22 @@ public class StrictHtmlEncodingDetectorTest {
}
@Test
+ public void duplicateAttribute() throws IOException {
+ assertWindows1252("<meta charset='WINDOWS-1252' charset='UTF-8'>");
+ }
+
+ @Test
+ public void invalidThenValid() throws IOException {
+ assertCharset("<meta charset=blah>" +
+ "<meta charset=WINDOWS-1252>", null);
+ }
+
+ @Test
+ public void spacesInAttributes() throws IOException {
+ assertWindows1252("<meta charset\u000C= \t WINDOWS-1252>");
+ }
+
+ @Test
public void httpEquiv() throws IOException {
assertWindows1252("<meta " +
"http-equiv='content-type' " +
@@ -59,6 +86,11 @@ public class StrictHtmlEncodingDetectorTest {
}
@Test
+ public void emptyAttributeEnd() throws IOException {
+ assertWindows1252("<meta charset=WINDOWS-1252 a>");
+ }
+
+ @Test
public void httpEquivDuplicateCharset() throws IOException {
assertWindows1252("<meta " +
"http-equiv='content-type' " +
@@ -72,19 +104,50 @@ public class StrictHtmlEncodingDetectorTest {
}
@Test
- public void verBadHtml() throws IOException {
+ public void veryBadHtml() throws IOException {
// check that the parser is not confused by garbage before the declaration
assertWindows1252("<< l \" == / '=x\n >" +
"<!--> " +
"< <x'/ <=> " +
"<meta/>" +
+ "<meta>" +
"<a x/>" +
"<meta charset='WINDOWS-1252'>");
}
@Test
+ public void specialTag() throws IOException {
+ // special tags cannot have arguments, any '>' ends them
+ assertWindows1252("<? x='><meta charset='WINDOWS-1252'>");
+ }
+
+ @Test
+ public void longHtml() throws IOException {
+ StringBuilder sb = new StringBuilder("<!doctype html>\n" +
+ "<html>\n" +
+ "<head>\n" +
+ "<title>Hello world</title>\n");
+ String repeated = "<meta x='y' />\n";
+ String charsetMeta = "<meta charset='windows-1252'>";
+
+ while (sb.length() + repeated.length() + charsetMeta.length() < 1024) sb.append(repeated);
+
+ sb.append(charsetMeta);
+
+ assertWindows1252(sb.toString());
+ }
+
+ @Test
+ public void tooLong() throws IOException {
+ // Create a string with 1Mb of '\0' followed by a meta
+ String padded = new String(new byte[1000000]) + "<meta charset='windows-1252'>";
+ // Only the first bytes should be prescanned, so the algorithm should stop before the meta tag
+ assertCharset(padded, null);
+ }
+
+ @Test
public void incompleteMeta() throws IOException {
- assertWindows1252("<meta charset='WINDOWS-1252'"); // missing '>' at the end
+ assertCharset("<meta charset='WINDOWS-1252'", null); // missing '>' at the end
}
@Test
@@ -110,6 +173,13 @@ public class StrictHtmlEncodingDetectorTest {
}
@Test
+ public void replacement() throws IOException {
+ // Several dangerous charsets are aliases of 'replacement' in the spec
+ String inString = "<meta charset='iso-2022-cn'>";
+ assertCharset(new ByteArrayInputStream(inString.getBytes()), new ReplacementCharset());
+ }
+
+ @Test
public void iso88591() throws IOException {
// In the spec, iso-8859-1 is an alias for WINDOWS-1252
assertWindows1252("<meta charset='iso-8859-1'>");
@@ -226,8 +296,8 @@ public class StrictHtmlEncodingDetectorTest {
}
@Test
- public void withUserProvidedCharset() throws IOException {
- metadata.set(Metadata.CONTENT_ENCODING, "ISO-8859-1");
+ public void withCharsetInContentType() throws IOException {
+ metadata.set(Metadata.CONTENT_TYPE, "text/html; Charset=ISO-8859-1");
// ISO-8859-1 is an alias for WINDOWS-1252, even if it's set at the transport layer level
assertWindows1252("");
assertWindows1252("<meta charset='UTF-8'>");
@@ -262,6 +332,18 @@ public class StrictHtmlEncodingDetectorTest {
assertCharset(throwAfter("<!doctype html><html attr='x"), null);
}
+ @Test
+ public void streamReset() throws IOException {
+ // The stream should be reset after detection
+ byte[] inBytes = {0,1,2,3,4};
+ byte[] outBytes = new byte[5];
+ InputStream inStream = new ByteArrayInputStream(inBytes);
+ detectCharset(inStream);
+ // The stream should still be readable from the beginning after detection
+ inStream.read(outBytes);
+ assertArrayEquals(inBytes, outBytes);
+ }
+
private void assertWindows1252(String html) throws IOException {
assertCharset(html, Charset.forName("WINDOWS-1252"));
}
@@ -283,7 +365,7 @@ public class StrictHtmlEncodingDetectorTest {
}
private Charset detectCharset(InputStream inStream) throws IOException {
- return new StrictHtmlEncodingDetector().detect(inStream, metadata);
+ return new StandardHtmlEncodingDetector().detect(inStream, metadata);
}
private InputStream throwAfter(String html) {
@@ -295,6 +377,6 @@ public class StrictHtmlEncodingDetectorTest {
throw new IOException("test exception");
}
};
- return new SequenceInputStream(contentsInStream, errorThrowing);
+ return new BufferedInputStream(new SequenceInputStream(contentsInStream, errorThrowing));
}
}