You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/08/03 15:32:59 UTC

[tika] branch branch_1x updated: TIKA-2673 -- add StandardHtmlEncodingDetector via Gerard Bouchar

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new 6badaea  TIKA-2673 -- add StandardHtmlEncodingDetector via Gerard Bouchar
6badaea is described below

commit 6badaead79e3350414536a5e4972871f66e97e90
Author: TALLISON <ta...@apache.org>
AuthorDate: Fri Aug 3 11:32:49 2018 -0400

    TIKA-2673 -- add StandardHtmlEncodingDetector via Gerard Bouchar
---
 .../parser/html/StrictHtmlEncodingDetector.java    | 491 ---------------------
 .../html/charsetdetector/CharsetAliases.java       | 145 ++++++
 .../charsetdetector/CharsetDetectionResult.java    |  62 +++
 .../parser/html/charsetdetector/MetaProcessor.java |  74 ++++
 .../parser/html/charsetdetector/PreScanner.java    | 270 +++++++++++
 .../StandardHtmlEncodingDetector.java              | 104 +++++
 .../charsets/ReplacementCharset.java               |  65 +++
 .../charsets/XUserDefinedCharset.java              |  57 +++
 ....java => StandardHtmlEncodingDetectorTest.java} | 100 ++++-
 9 files changed, 868 insertions(+), 500 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/StrictHtmlEncodingDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/StrictHtmlEncodingDetector.java
deleted file mode 100644
index 487f747..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/html/StrictHtmlEncodingDetector.java
+++ /dev/null
@@ -1,491 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import org.apache.tika.detect.EncodingDetector;
-import org.apache.tika.metadata.Metadata;
-
-import java.io.*;
-import java.nio.charset.Charset;
-import java.nio.charset.IllegalCharsetNameException;
-import java.nio.charset.StandardCharsets;
-import java.nio.charset.UnsupportedCharsetException;
-import java.util.*;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import java.util.stream.Collectors;
-import java.util.stream.IntStream;
-
-import static java.nio.charset.StandardCharsets.*;
-import static org.apache.tika.parser.html.StrictHtmlEncodingDetector.SequenceMatcher.caseInsensitive;
-import static org.apache.tika.parser.html.StrictHtmlEncodingDetector.SingleByteMatcher.matchers;
-
-/**
- * This is a strict html encoding detector that enforces the standard
- * far more strictly than the HtmlEncodingDetector.
- */
-public class StrictHtmlEncodingDetector implements EncodingDetector {
-    private static final String CHARSET_LABEL_FILE = "whatwg-encoding-labels.tsv";
-    private static Map<String, Charset> CHARSET_LABELS = getCharsetLabels();
-
-    private static Map<String, Charset> getCharsetLabels() {
-        String path = StrictHtmlEncodingDetector.class.getPackage().getName().replace('.', '/');
-        String filename = '/' + path + '/' + CHARSET_LABEL_FILE;
-        InputStream inputStream = StrictHtmlEncodingDetector.class.getResourceAsStream(filename);
-        Objects.requireNonNull(inputStream, "Missing charset label mapping file : " + filename);
-        try (BufferedReader buffer = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.US_ASCII))) {
-            return buffer.lines()
-                    .filter(s -> !s.startsWith("#"))
-                    .map(s -> s.split("\t"))
-                    .filter(parts -> parts.length >= 2)
-                    .collect(Collectors.toMap(
-                            parts -> parts[0],
-                            StrictHtmlEncodingDetector::charsetFromStandard
-                    ));
-        } catch (IOException e) {
-            throw new UncheckedIOException("Unable to read the charset label mapping", e);
-        }
-    }
-
-    private static Charset charsetFromStandard(String[] names) {
-        for (int i = 1; i < names.length; i++) {
-            try {
-                return Charset.forName(names[1]);
-            } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {/* pass */}
-        }
-        // The only single-byte charset extended charset that must be present on every Java platform
-        return StandardCharsets.ISO_8859_1;
-    }
-
-    private static Charset getCharsetByLabel(String label) {
-        if (label == null) return null;
-        label = label.trim().toLowerCase(Locale.US);
-        return CHARSET_LABELS.get(label);
-    }
-
-    @Override
-    public Charset detect(InputStream input, Metadata metadata) throws IOException {
-        PreScanner preScanner = new PreScanner(input);
-
-        // If there is a BOM at the beginning, the detection does not go further
-        Charset bomCharset = preScanner.detectBOM();
-        if (bomCharset != null) return bomCharset;
-
-        // Assume that if there was a charset specified either by the end user or the transport level,
-        // it was stored in the metadata
-        String incomingCharsetName = metadata.get(Metadata.CONTENT_ENCODING);
-        if (incomingCharsetName != null) {
-            Charset incomingCharset = getCharsetByLabel(incomingCharsetName);
-            if (incomingCharset != null) return incomingCharset;
-        }
-
-        return preScanner.scan();
-    }
-
-    static class PreScanner {
-
-        private static final Pattern META_CHARSET_PATTERN = Pattern.compile("charset\\s*=\\s*([\"']?)([^\"'\\s;]+)\\1");
-        private static ByteMatcher COMMENT_START = new SequenceMatcher("<!--");
-        private static ByteMatcher COMMENT_END = new SequenceMatcher("-->");
-        private static ByteMatcher LETTER = new OrMatcher(
-                new RangeMatcher((byte) 'a', (byte) 'z'),
-                new RangeMatcher((byte) 'A', (byte) 'Z')
-        );
-        private static ByteMatcher SPACE = new OrMatcher(matchers(0x09, 0x0A, 0x0C, 0x0D, 0x20));
-        private static ByteMatcher SLASH = new SingleByteMatcher((byte) '/');
-        private static ByteMatcher EQUAL = new SingleByteMatcher((byte) '=');
-        private static ByteMatcher TAG_END = new SingleByteMatcher((byte) '>');
-        private static ByteMatcher SINGLE_QUOTE = new SingleByteMatcher((byte) '\'');
-        private static ByteMatcher DOUBLE_QUOTE = new SingleByteMatcher((byte) '"');
-        private static ByteMatcher QUOTE = new OrMatcher(SINGLE_QUOTE, DOUBLE_QUOTE);
-        private static ByteMatcher TAG_END_OR_SLASH = new OrMatcher(SLASH, TAG_END);
-        private static ByteMatcher SPACE_OR_SLASH = new OrMatcher(SPACE, SLASH);
-        private static ByteMatcher SPACE_OR_TAG_END = new OrMatcher(SPACE, TAG_END);
-        private static ByteMatcher META_START = new SequenceMatcher(caseInsensitive("<meta"), SPACE_OR_SLASH);
-        private static ByteMatcher TAG_START = new SequenceMatcher(
-                new SingleByteMatcher((byte) '<'),
-                new OrMatcher(SLASH, LETTER)
-        );
-        private static ByteMatcher TAG_BODY = new NegativeMatcher(new OrMatcher(SPACE, TAG_END));
-        private static ByteMatcher SPECIAL_TAG_START = new SequenceMatcher(
-                new SingleByteMatcher((byte) '<'),
-                new OrMatcher(matchers("!/?"))
-        );
-        private static ByteMatcher UTF8_BOM = new SequenceMatcher(matchers(0xEF, 0xBB, 0xBF));
-        private static ByteMatcher UTF16_BE_BOM = new SequenceMatcher(matchers(0xFE, 0xFF));
-        private static ByteMatcher UTF16_LE_BOM = new SequenceMatcher(matchers(0xFF, 0xFE));
-
-
-        PushbackInputStream stream;
-        private CharsetDetectionResult detectedCharset = new CharsetDetectionResult();
-
-        public PreScanner(InputStream inputStream) {
-            this.stream = new PushbackInputStream(inputStream, 32);
-        }
-
-        public Charset scan() {
-            while (processAtLeastOneByte()) {
-                if (detectedCharset.isFound()) {
-                    return detectedCharset.getCharset();
-                }
-            }
-            return null;
-        }
-
-        private Charset detectBOM() {
-            try {
-                if (UTF8_BOM.matches(stream)) return StandardCharsets.UTF_8;
-                else if (UTF16_BE_BOM.matches(stream)) return StandardCharsets.UTF_16BE;
-                else if (UTF16_LE_BOM.matches(stream)) return StandardCharsets.UTF_16LE;
-            } catch (IOException e) { /* stream could not be read, also return null */ }
-            return null;
-        }
-
-        private boolean processAtLeastOneByte() {
-            try {
-                return processComment() ||
-                        processMeta() ||
-                        processTag() ||
-                        processSpecialTag() ||
-                        processAny();
-            } catch (IOException e) {
-                return false;
-            }
-        }
-
-        private boolean processAny() throws IOException {
-            int read = stream.read();
-            return read != -1;
-        }
-
-        private boolean hasBytes() throws IOException {
-            int read = stream.read();
-            if (read != -1) stream.unread(read);
-            return read != -1;
-        }
-
-        private boolean processComment() throws IOException {
-            if (COMMENT_START.matches(stream)) {
-                // The two '-' in the '-->' sequence can be the same as those in the '<!--' sequence.
-                stream.unread("--".getBytes(StandardCharsets.US_ASCII));
-                return COMMENT_END.advanceUntilMatches(stream);
-            }
-            return false;
-        }
-
-        private boolean processTag() throws IOException {
-            if (TAG_START.matches(stream)) {
-                TAG_BODY.skipAll(stream);
-                while (getAttribute() != null) {/*ignore the attribute*/}
-                return true;
-            }
-            return false;
-        }
-
-        private boolean processSpecialTag() throws IOException {
-            if (SPECIAL_TAG_START.matches(stream)) {
-                TAG_BODY.skipAll(stream);
-                return TAG_END.advanceUntilMatches(stream);
-            }
-            return false;
-        }
-
-        private boolean processMeta() throws IOException {
-            if (META_START.matches(stream)) {
-                Set<String> attributeNames = new HashSet<>();
-                boolean gotPragma = false;
-                Boolean needPragma = null;
-                CharsetDetectionResult charset = new CharsetDetectionResult();
-                while (hasBytes()) {
-                    Attribute attribute = getAttribute();
-                    if (attribute == null) break;
-                    if (attributeNames.contains(attribute.getName())) continue;
-                    attributeNames.add(attribute.getName());
-                    switch (attribute.getName()) {
-                        case "http-equiv":
-                            if (attribute.getValue().equals("content-type"))
-                                gotPragma = true;
-                            break;
-                        case "content":
-                            String charsetName = getEncodingFromMeta(attribute.getValue());
-                            if (!charset.isFound() && charsetName != null) {
-                                charset.find(charsetName);
-                                needPragma = true;
-                            }
-                            break;
-                        case "charset":
-                            charset.find(attribute.getValue());
-                            needPragma = false;
-                            break;
-                        default: // Ignore non-charset related attributes
-                    }
-                }
-                if (needPragma != null && !(needPragma && !gotPragma)) {
-                    detectedCharset = charset;
-                    return true;
-                }
-            }
-            return false;
-        }
-
-        private String getEncodingFromMeta(String attributeValue) {
-            Matcher matcher = META_CHARSET_PATTERN.matcher(attributeValue);
-            if (!matcher.find()) return null;
-            return matcher.group(2);
-        }
-
-        private Attribute getAttribute() throws IOException {
-            SPACE_OR_SLASH.skipAll(stream);
-            if (TAG_END.peekMatches(stream)) return null;
-            StringBuilder name = new StringBuilder();
-            while (!EQUAL.peekMatches(stream) || name.length() == 0) {
-                if (TAG_END_OR_SLASH.peekMatches(stream)) {
-                    break;
-                } else if (SPACE.peekMatches(stream)) {
-                    SPACE.skipAll(stream);
-                    break;
-                } else {
-                    name.append(getLowerCaseChar());
-                }
-            }
-
-            if (!EQUAL.matches(stream)) return new Attribute(name.toString(), "");
-            SPACE.skipAll(stream);
-
-            StringBuilder value = new StringBuilder();
-            byte[] quoteMatched = QUOTE.match(stream);
-            if (quoteMatched != null) {
-                char quote = (char) quoteMatched[0];
-                int nextChar = -1;
-                while (nextChar != quote) {
-                    if (nextChar != -1) value.append((char) nextChar);
-                    nextChar = getLowerCaseChar();
-                }
-            } else {
-                while (!SPACE_OR_TAG_END.peekMatches(stream)) {
-                    value.append(getLowerCaseChar());
-                }
-            }
-            return new Attribute(name.toString(), value.toString());
-        }
-
-        private char getLowerCaseChar() throws IOException {
-            int nextPoint = stream.read();
-            if (nextPoint == -1) throw new IOException();
-            if (nextPoint >= 'A' && nextPoint <= 'Z') nextPoint += 0x20; // lowercase
-            return (char) nextPoint;
-        }
-    }
-
-    static class Attribute {
-        String name;
-        String value;
-
-        public Attribute(String name, String value) {
-            this.name = name;
-            this.value = value;
-        }
-
-        public String getName() {
-            return name;
-        }
-
-        public String getValue() {
-            return value;
-        }
-    }
-
-    /**
-     * A detection may either not find a charset, find an invalid charset, or find a valid charset
-     */
-    static class CharsetDetectionResult {
-        private boolean found = false;
-        private Charset charset = null;
-
-        public CharsetDetectionResult() { /* default result: not found */}
-
-        public boolean isFound() {
-            return found;
-        }
-
-        public void find(String charsetName) {
-            this.found = true;
-            charsetName = charsetName.trim();
-            if ("x-user-defined".equals(charsetName)) charsetName = "windows-1252";
-            this.charset = getCharsetByLabel(charsetName);
-            // The specification states: If charset is a UTF-16 encoding, then set charset to UTF-8.
-            if (UTF_16LE.equals(charset) || UTF_16BE.equals(charset)) charset = UTF_8;
-        }
-
-        public Charset getCharset() {
-            // the result may be null even if found is true, in the case there is a charset specified,
-            // but it is invalid
-            return charset;
-        }
-    }
-
-    static abstract class ByteMatcher {
-
-        abstract byte[] match(PushbackInputStream pushbackInputStream) throws IOException;
-
-        boolean matches(PushbackInputStream pushbackInputStream) throws IOException {
-            return this.match(pushbackInputStream) != null;
-        }
-
-        boolean advanceUntilMatches(PushbackInputStream pushbackInputStream) throws IOException {
-            while (!this.matches(pushbackInputStream)) {
-                int nextByte = pushbackInputStream.read();
-                if (nextByte == -1) return false;
-            }
-            return true;
-        }
-
-        void skipAll(PushbackInputStream pushbackInputStream) throws IOException {
-            while (matches(pushbackInputStream)) {/* just skip the byte */}
-        }
-
-        public boolean peekMatches(PushbackInputStream pushbackInputStream) throws IOException {
-            byte[] matched = this.match(pushbackInputStream);
-            if (matched != null) pushbackInputStream.unread(matched);
-            return matched != null;
-        }
-    }
-
-    static class SingleByteMatcher extends ByteMatcher {
-        private byte b;
-
-        public SingleByteMatcher(byte b) {
-            this.b = b;
-        }
-
-        public static ByteMatcher[] matchers(String s) {
-            return matchers(s.chars());
-        }
-
-        public static ByteMatcher[] matchers(int... bytes) {
-            return matchers(IntStream.of(bytes));
-        }
-
-        public static ByteMatcher[] matchers(IntStream byteStream) {
-            return byteStream
-                    .mapToObj(i -> new SingleByteMatcher((byte) i))
-                    .toArray(ByteMatcher[]::new);
-        }
-
-        @Override
-        byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
-            int read = pushbackInputStream.read();
-            if ((byte) read == b) return new byte[]{b};
-            if (read != -1) pushbackInputStream.unread(read);
-            return null;
-        }
-    }
-
-    static class SequenceMatcher extends ByteMatcher {
-        private ByteMatcher[] matchers;
-
-        public SequenceMatcher(ByteMatcher... matchers) {
-            this.matchers = matchers;
-        }
-
-        public SequenceMatcher(String s) {
-            this(matchers(s));
-        }
-
-        public static SequenceMatcher caseInsensitive(String s) {
-            ByteMatcher[] lowerMatchers = matchers(s.toLowerCase(Locale.US));
-            ByteMatcher[] upperMatchers = matchers(s.toUpperCase(Locale.US));
-            OrMatcher[] matchers = IntStream
-                    .range(0, Math.min(lowerMatchers.length, upperMatchers.length))
-                    .mapToObj(i -> new OrMatcher(lowerMatchers[i], upperMatchers[i]))
-                    .toArray(OrMatcher[]::new);
-            return new SequenceMatcher(matchers);
-        }
-
-        @Override
-        byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
-            ByteArrayOutputStream allMatched = new ByteArrayOutputStream();
-            for (ByteMatcher m : matchers) {
-                byte[] matched = m.match(pushbackInputStream);
-                if (matched == null) {
-                    pushbackInputStream.unread(allMatched.toByteArray());
-                    return null;
-                } else {
-                    allMatched.write(matched);
-                }
-            }
-            return allMatched.toByteArray();
-        }
-    }
-
-    static class OrMatcher extends ByteMatcher {
-        private ByteMatcher[] matchers;
-
-        public OrMatcher(ByteMatcher... matchers) {
-            this.matchers = matchers;
-        }
-
-        @Override
-        byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
-            for (ByteMatcher m : matchers) {
-                byte[] matched = m.match(pushbackInputStream);
-                if (matched != null) return matched;
-            }
-            return null;
-        }
-    }
-
-    static class NegativeMatcher extends ByteMatcher {
-        private ByteMatcher matcher;
-
-        public NegativeMatcher(ByteMatcher matcher) {
-            this.matcher = matcher;
-        }
-
-        @Override
-        byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
-            byte[] matched = matcher.match(pushbackInputStream);
-            if (matched == null) {
-                int read = pushbackInputStream.read();
-                if (read == -1) return null;
-                return new byte[]{(byte) read};
-            } else {
-                pushbackInputStream.unread(matched);
-                return null;
-            }
-        }
-    }
-
-    static class RangeMatcher extends ByteMatcher {
-        private byte low;
-        private byte high;
-
-        public RangeMatcher(byte low, byte high) {
-            this.low = low;
-            this.high = high;
-        }
-
-
-        @Override
-        byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
-            int read = pushbackInputStream.read();
-            if (read >= low && read <= high) return new byte[]{(byte) read};
-            if (read != -1) pushbackInputStream.unread(read);
-            return null;
-        }
-    }
-}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java
new file mode 100644
index 0000000..4d4c7c2
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetAliases.java
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html.charsetdetector;
+
+
+import org.apache.tika.parser.html.charsetdetector.charsets.ReplacementCharset;
+import org.apache.tika.parser.html.charsetdetector.charsets.XUserDefinedCharset;
+
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.StandardCharsets;
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+
+/**
+ * Singleton class that associates standard charset names to java charset implementations
+ * https://encoding.spec.whatwg.org/#ref-for-iso-8859-8-i
+ */
+final class CharsetAliases {
+
+    private static final Map<String, Charset> charsetsByLabel = new HashMap<>();
+
+    private CharsetAliases() {
+    }
+
+    /**
+     * @param label a charset name
+     * @return the corresponding java charset, if there is one. Otherwise, null
+     */
+    static Charset getCharsetByLabel(String label) {
+        if (label == null) return null;
+        synchronized (charsetsByLabel) {
+            // Lazy initialization
+            if (charsetsByLabel.isEmpty()) addAll();
+        }
+        label = label.trim().toLowerCase(Locale.US);
+        return charsetsByLabel.get(label);
+    }
+
+    private static void addAll() {
+        addCharset(charset("Big5"), "big5", "big5-hkscs", "cn-big5", "csbig5", "x-x-big5");
+        addCharset(charset("EUC-JP"), "cseucpkdfmtjapanese", "euc-jp", "x-euc-jp");
+        addCharset(charset("EUC-KR"), "cseuckr", "csksc56011987", "euc-kr", "iso-ir-149", "korean",
+                "ks_c_5601-1987", "ks_c_5601-1989", "ksc5601", "ksc_5601", "windows-949");
+        addCharset(charset("GBK"), "chinese", "csgb2312", "csiso58gb231280", "gb2312", "gb_2312",
+                "gb_2312-80", "gbk", "iso-ir-58", "x-gbk");
+        addCharset(charset("IBM866"), "866", "cp866", "csibm866", "ibm866");
+        addCharset(charset("ISO-2022-JP"), "csiso2022jp", "iso-2022-jp");
+        addCharset(charset("ISO-8859-10", "ISO-8859-4"), "csisolatin6", "iso-8859-10", "iso-ir-157",
+                "iso8859-10", "iso885910", "l6", "latin6");
+        addCharset(charset("ISO-8859-13"), "iso-8859-13", "iso8859-13", "iso885913");
+        addCharset(charset("ISO-8859-14", "ISO-8859-1"), "iso-8859-14", "iso8859-14", "iso885914");
+        addCharset(charset("ISO-8859-15"), "csisolatin9", "iso-8859-15", "iso8859-15", "iso885915",
+                "iso_8859-15", "l9");
+        addCharset(charset("ISO-8859-16", "ISO-8859-1"), "iso-8859-16");
+        addCharset(charset("ISO-8859-2"), "csisolatin2", "iso-8859-2", "iso-ir-101", "iso8859-2",
+                "iso88592", "iso_8859-2", "iso_8859-2:1987", "l2", "latin2");
+        addCharset(charset("ISO-8859-3"), "csisolatin3", "iso-8859-3", "iso-ir-109", "iso8859-3",
+                "iso88593", "iso_8859-3", "iso_8859-3:1988", "l3", "latin3");
+        addCharset(charset("ISO-8859-4"), "csisolatin4", "iso-8859-4", "iso-ir-110", "iso8859-4",
+                "iso88594", "iso_8859-4", "iso_8859-4:1988", "l4", "latin4");
+        addCharset(charset("ISO-8859-5"), "csisolatincyrillic", "cyrillic", "iso-8859-5", "iso-ir-144",
+                "iso8859-5", "iso88595", "iso_8859-5", "iso_8859-5:1988");
+        addCharset(charset("ISO-8859-6"), "arabic", "asmo-708", "csiso88596e", "csiso88596i",
+                "csisolatinarabic", "ecma-114", "iso-8859-6", "iso-8859-6-e", "iso-8859-6-i", "iso-ir-127",
+                "iso8859-6", "iso88596", "iso_8859-6", "iso_8859-6:1987");
+        addCharset(charset("ISO-8859-7"), "csisolatingreek", "ecma-118", "elot_928", "greek", "greek8",
+                "iso-8859-7", "iso-ir-126", "iso8859-7", "iso88597", "iso_8859-7", "iso_8859-7:1987", "sun_eu_greek");
+        // ISO-8859-8 actually should have an influence on the layout direction
+        // (text should be decoded in the visual order). However, this is not implemented in tika.
+        addCharset(charset("ISO-8859-8"), "csiso88598e", "csisolatinhebrew", "hebrew", "iso-8859-8",
+                "iso-8859-8-e", "iso-ir-138", "iso8859-8", "iso88598", "iso_8859-8", "iso_8859-8:1988", "visual");
+        addCharset(charset("ISO-8859-8-I", "ISO-8859-8"), "csiso88598i", "iso-8859-8-i", "logical");
+        addCharset(charset("KOI8-R"), "cskoi8r", "koi", "koi8", "koi8-r", "koi8_r");
+        addCharset(charset("KOI8-U"), "koi8-ru", "koi8-u");
+        addCharset(charset("Shift_JIS"), "csshiftjis", "ms932", "ms_kanji", "shift-jis", "shift_jis",
+                "sjis", "windows-31j", "x-sjis");
+        addCharset(charset("UTF-16BE"), "utf-16be");
+        addCharset(charset("UTF-16LE"), "utf-16", "utf-16le");
+        addCharset(charset("UTF-8"), "unicode-1-1-utf-8", "utf-8", "utf8");
+        addCharset(charset("gb18030"), "gb18030");
+        addCharset(charset("windows-1250"), "cp1250", "windows-1250", "x-cp1250");
+        addCharset(charset("windows-1251"), "cp1251", "windows-1251", "x-cp1251");
+        addCharset(charset("windows-1252"), "ansi_x3.4-1968", "ascii", "cp1252", "cp819", "csisolatin1",
+                "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", "iso88591", "iso_8859-1", "iso_8859-1:1987",
+                "l1", "latin1", "us-ascii", "windows-1252", "x-cp1252");
+        addCharset(charset("windows-1253"), "cp1253", "windows-1253", "x-cp1253");
+        addCharset(charset("windows-1254"), "cp1254", "csisolatin5", "iso-8859-9", "iso-ir-148",
+                "iso8859-9", "iso88599", "iso_8859-9", "iso_8859-9:1989", "l5", "latin5", "windows-1254", "x-cp1254");
+        addCharset(charset("windows-1255"), "cp1255", "windows-1255", "x-cp1255");
+        addCharset(charset("windows-1256"), "cp1256", "windows-1256", "x-cp1256");
+        addCharset(charset("windows-1257"), "cp1257", "windows-1257", "x-cp1257");
+        addCharset(charset("windows-1258"), "cp1258", "windows-1258", "x-cp1258");
+        addCharset(charset("windows-874"), "dos-874", "iso-8859-11", "iso8859-11", "iso885911",
+                "tis-620", "windows-874");
+        addCharset(charset("x-MacCyrillic"), "x-mac-cyrillic", "x-mac-ukrainian");
+        addCharset(charset("x-MacRoman"), "csmacintosh", "mac", "macintosh", "x-mac-roman");
+        // The "replacement" charset is a dummy charset. It is present to mitigate wrong-charset attacks
+        addCharset(new ReplacementCharset(), "csiso2022kr", "hz-gb-2312", "iso-2022-cn", "iso-2022-cn-ext",
+                "iso-2022-kr", "replacement");
+        // The x-user-defined charset is not present in java
+        addCharset(new XUserDefinedCharset(), "x-user-defined");
+    }
+
+    /**
+     * @param names jvm charset names
+     * @return the first of the given charsets that exists in the current JVM,
+     * or ISO_8859_1 if none exists
+     */
+    private static Charset charset(String... names) {
+        for (String name : names) {
+            try {
+                return Charset.forName(name);
+            } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {/* pass */}
+        }
+        // The only single-byte charset extended charset that must be present on every Java platform
+        return StandardCharsets.ISO_8859_1;
+    }
+
+    /**
+     * @param charset name of the charset in the JVM
+     * @param names   standard W3C charset names
+     */
+    private static void addCharset(Charset charset, String... names) {
+        for (String name : names) {
+            charsetsByLabel.put(name, charset);
+        }
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetDetectionResult.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetDetectionResult.java
new file mode 100644
index 0000000..0ba3637
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/CharsetDetectionResult.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html.charsetdetector;
+
+import java.nio.charset.Charset;
+
+import static java.nio.charset.StandardCharsets.UTF_16BE;
+import static java.nio.charset.StandardCharsets.UTF_16LE;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+
+/**
+ * A detection may either not find a charset, find an invalid charset, or find a valid charset
+ */
+class CharsetDetectionResult {
+    // Set as soon as a charset declaration has been seen, whether or not it is valid
+    private boolean found;
+    // The resolved charset; null when nothing was declared or the declared charset is invalid
+    private Charset charset;
+
+    private CharsetDetectionResult() {
+        // Instances start in the "not found" state; obtain one through notFound()
+    }
+
+    /**
+     * @return a fresh result in the "not found" state
+     */
+    static CharsetDetectionResult notFound() {
+        return new CharsetDetectionResult();
+    }
+
+    /**
+     * @return true once {@link #find} or {@link #setCharset} has been called
+     */
+    public boolean isFound() {
+        return this.found;
+    }
+
+    /**
+     * Records that a charset declaration was found and resolves its label.
+     *
+     * @param charsetName the declared charset label; surrounding whitespace is trimmed
+     */
+    public void find(String charsetName) {
+        this.found = true;
+        String label = charsetName.trim();
+        if ("x-user-defined".equals(label)) {
+            label = "windows-1252";
+        }
+        Charset resolved = CharsetAliases.getCharsetByLabel(label);
+        // The specification states: If charset is a UTF-16 encoding, then set charset to UTF-8.
+        boolean isUtf16 = UTF_16LE.equals(resolved) || UTF_16BE.equals(resolved);
+        this.charset = isUtf16 ? UTF_8 : resolved;
+    }
+
+    /**
+     * @return the detected charset. It may be null even when {@link #isFound()} is true,
+     * which happens when a charset was specified but is invalid.
+     */
+    public Charset getCharset() {
+        return this.charset;
+    }
+
+    /**
+     * Marks the result as found and stores the given charset as is.
+     *
+     * @param charset the charset to store
+     */
+    public void setCharset(Charset charset) {
+        this.charset = charset;
+        this.found = true;
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/MetaProcessor.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/MetaProcessor.java
new file mode 100644
index 0000000..8ce250c
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/MetaProcessor.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html.charsetdetector;
+
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import static org.apache.tika.parser.html.charsetdetector.PreScanner.getEncodingFromMeta;
+
+
+/**
+ * A class to process the attributes of an HTML meta tag in order to extract a character set.
+ * The user should call {@link #processAttribute} once for each attribute of the tag,
+ * then update its current detection result with {@link #updateDetectedCharset(CharsetDetectionResult)}
+ * <p>
+ * The algorithm implemented is meant to match the one described by the W3C here:
+ * https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
+ */
+class MetaProcessor {
+    // Names of the attributes already handled for this tag, used to ignore duplicates
+    private Set<String> attributeNames = new HashSet<>();
+    // True once an http-equiv attribute whose value is "content-type" has been seen
+    private boolean gotPragma = false;
+    // null until a charset is recorded; true when it came from a "content" attribute,
+    // false when it came from a "charset" attribute
+    private Boolean needPragma = null;
+    private CharsetDetectionResult detectionResult = CharsetDetectionResult.notFound();
+
+    /**
+     * Copies the charset extracted from this meta tag into the given result, provided that
+     * a charset was recorded and, when it came from a "content" attribute, that the tag
+     * also carried the content-type pragma.
+     *
+     * @param currentDetectionResult the detection result to update
+     */
+    void updateDetectedCharset(CharsetDetectionResult currentDetectionResult) {
+        if (!detectionResult.isFound() || needPragma == null) {
+            return;
+        }
+        boolean pragmaMissing = needPragma && !gotPragma;
+        if (pragmaMissing) {
+            return;
+        }
+        currentDetectionResult.setCharset(detectionResult.getCharset());
+    }
+
+    /**
+     * Takes one attribute of the meta tag into account.
+     *
+     * @param attribute the attribute name (key) and value
+     */
+    void processAttribute(Map.Entry<String, String> attribute) {
+        String key = attribute.getKey();
+        // Set.add returns false when the name was already present: ignore duplicate attributes
+        if (!attributeNames.add(key)) {
+            return;
+        }
+
+        // Only the three charset-related attributes matter; anything else is ignored
+        switch (key) {
+            case "http-equiv":
+                if (attribute.getValue().equals("content-type")) {
+                    gotPragma = true;
+                }
+                break;
+            case "content":
+                String extracted = getEncodingFromMeta(attribute.getValue());
+                boolean nothingRecordedYet = !detectionResult.isFound();
+                if (nothingRecordedYet && extracted != null) {
+                    detectionResult.find(extracted);
+                    needPragma = true;
+                }
+                break;
+            case "charset":
+                detectionResult.find(attribute.getValue());
+                needPragma = false;
+                break;
+            default:
+                break;
+        }
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/PreScanner.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/PreScanner.java
new file mode 100644
index 0000000..a00aeb1
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/PreScanner.java
@@ -0,0 +1,270 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html.charsetdetector;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.AbstractMap;
+import java.util.BitSet;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * A scanner meant to detect charset meta tags in a byte stream
+ * See: https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
+ */
+class PreScanner {
+
+    // Matches "charset", an '=' optionally surrounded by whitespace, then an optionally quoted value.
+    // Group 1 is the opening quote (possibly empty), group 2 is the charset label itself.
+    private static final Pattern CHARSET_PATTERN = Pattern.compile("charset\\s*=\\s*([\"']?)([^\"'\\s;]+)\\1");
+    private static final byte[] COMMENT_START = {(byte) '<', (byte) '!', (byte) '-', (byte) '-'};
+    private static final byte[] COMMENT_END = {(byte) '-', (byte) '-', (byte) '>'};
+    // Lowercase on purpose: it is compared against lowercased input in readCaseInsensitive
+    private static final byte[] META_TAG_START = {(byte) '<', (byte) 'm', (byte) 'e', (byte) 't', (byte) 'a'};
+    private static final byte SLASH = (byte) '/';
+    private static final byte EQUAL = (byte) '=';
+    private static final byte TAG_START = (byte) '<';
+    private static final byte TAG_END = (byte) '>';
+    private static final BitSet QUOTE = bitSet('"', '\'');
+
+    // Tab, line feed, form feed, carriage return and space (0x0D is listed twice, which is harmless)
+    private static final BitSet WHITESPACE = bitSet(0x09, 0x0A, 0x0C, 0x0D, 0x0D, 0x20);
+    private static final BitSet SPACE_OR_TAG_END = bitSet(WHITESPACE, TAG_END);
+    private static final BitSet SPACE_OR_SLASH = bitSet(WHITESPACE, SLASH);
+    // Second byte of the tags handled by processSpecialTag: "<!", "</" and "<?"
+    private static final BitSet SPECIAL_TAGS = bitSet('!', '/', '?');
+
+    private static final byte[] UTF8_BOM = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};
+    private static final byte[] UTF16_BE_BOM = {(byte) 0xFE, (byte) 0xFF};
+    private static final byte[] UTF16_LE_BOM = {(byte) 0xFF, (byte) 0xFE};
+    private static final byte LOWER_A = (byte) 'a';
+    private static final byte LOWER_Z = (byte) 'z';
+    private static final byte UPPER_A = (byte) 'A';
+    private static final byte UPPER_Z = (byte) 'Z';
+    // All reads go through this buffered wrapper, whose mark/reset is used for look-ahead
+    private BufferedInputStream stream;
+    private CharsetDetectionResult detectedCharset = CharsetDetectionResult.notFound();
+
+    /**
+     * @param inputStream the byte stream to scan; it is wrapped in a {@link BufferedInputStream}
+     */
+    PreScanner(InputStream inputStream) {
+        this.stream = new BufferedInputStream(inputStream);
+    }
+
+    /**
+     * @param bs the bit indexes (byte values) to set
+     * @return a new BitSet with exactly the given bits set
+     */
+    private static BitSet bitSet(int... bs) {
+        BitSet bitSet = new BitSet(0xFF);
+        for (int b : bs) bitSet.set(b);
+        return bitSet;
+    }
+
+    /**
+     * @param base a BitSet to copy; it is not modified
+     * @param bs   additional bit indexes (byte values) to set in the copy
+     * @return a clone of base with the extra bits set
+     */
+    private static BitSet bitSet(BitSet base, int... bs) {
+        BitSet bitSet = (BitSet) base.clone();
+        for (int b : bs) bitSet.set(b);
+        return bitSet;
+    }
+
+    /**
+     * Extracts a charset label from the value of a meta "content" attribute.
+     *
+     * @param attributeValue the attribute value to search
+     * @return the label captured by the first match of CHARSET_PATTERN, or null if there is no match
+     */
+    static String getEncodingFromMeta(String attributeValue) {
+        Matcher matcher = CHARSET_PATTERN.matcher(attributeValue);
+        if (!matcher.find()) return null;
+        return matcher.group(2);
+    }
+
+    /**
+     * @return true if the bit for the given byte, read as an unsigned value, is set
+     */
+    private static boolean contains(BitSet bitSet, byte b) {
+        return bitSet.get(b & 0xFF);
+    }
+
+    /**
+     * Scans the stream until a meta tag yields a charset declaration.
+     *
+     * @return the charset of the first declaration found (null if that declaration is invalid),
+     * or null if scanning stopped without finding one
+     */
+    Charset scan() {
+        while (processAtLeastOneByte()) {
+            if (detectedCharset.isFound()) {
+                return detectedCharset.getCharset();
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Checks whether the stream starts with a byte order mark.
+     * A matching BOM is consumed; on a mismatch the stream is reset to where it was.
+     *
+     * @return UTF-8, UTF-16BE or UTF-16LE according to the BOM, or null if there is none
+     */
+    Charset detectBOM() {
+        try {
+            if (expect(UTF8_BOM)) return StandardCharsets.UTF_8;
+            else if (expect(UTF16_BE_BOM)) return StandardCharsets.UTF_16BE;
+            else if (expect(UTF16_LE_BOM)) return StandardCharsets.UTF_16LE;
+        } catch (IOException e) { /* stream could not be read, also return null */ }
+        return null;
+    }
+
+    /**
+     * Tries each handler in turn; the first one that recognizes the upcoming bytes consumes them.
+     *
+     * @return false when nothing could be consumed or an IOException occurred
+     * (read() throws one at the end of the stream), true otherwise
+     */
+    private boolean processAtLeastOneByte() {
+        try {
+            return processComment() ||
+                    processMeta() ||
+                    processTag() ||
+                    processSpecialTag() ||
+                    processAny();
+        } catch (IOException e) {
+            return false;
+        }
+    }
+
+    /**
+     * Consumes a single byte, whatever it is.
+     *
+     * @return false if the end of the stream was reached
+     */
+    private boolean processAny() throws IOException {
+        int read = stream.read();
+        return read != -1;
+    }
+
+    /**
+     * Consumes an opening or closing tag whose name starts with an ASCII letter,
+     * together with all of its attributes.
+     *
+     * @return true if such a tag was consumed; otherwise false, with the stream reset
+     */
+    private boolean processTag() throws IOException {
+        // At most 3 bytes are read before deciding: '<', an optional '/', and the first letter
+        stream.mark(3);
+        if (read() == TAG_START) {
+            int read = stream.read();
+            if (read == SLASH) read = stream.read();
+            if ((LOWER_A <= read && read <= LOWER_Z) ||
+                    (UPPER_A <= read && read <= UPPER_Z)) {
+                // Skip the rest of the tag name, then unread the whitespace or '>' that ended it
+                do stream.mark(1);
+                while (!contains(SPACE_OR_TAG_END, read()));
+                stream.reset();
+                while (getAttribute() != null) {/* ignore the attribute*/}
+                return true;
+            }
+        }
+        stream.reset();
+        return false;
+    }
+
+    /**
+     * Consumes a tag starting with "<!", "</" or "<?", up to and including the next '>'.
+     *
+     * @return true if such a tag was consumed; otherwise false, with the stream reset
+     */
+    private boolean processSpecialTag() throws IOException {
+        stream.mark(2);
+        if (read() == TAG_START && contains(SPECIAL_TAGS, read())) {
+            skipUntil(TAG_END);
+            return true;
+        }
+        stream.reset();
+        return false;
+    }
+
+    /**
+     * Consumes a meta tag and feeds its attributes to a {@link MetaProcessor},
+     * which then updates the detection result.
+     *
+     * @return true if a meta tag was consumed; otherwise false, with the stream reset
+     */
+    private boolean processMeta() throws IOException {
+        stream.mark(6); // len("<meta ") == 6
+        if (readCaseInsensitive(META_TAG_START) && contains(SPACE_OR_SLASH, read())) {
+            MetaProcessor metaProcessor = new MetaProcessor();
+            for (Map.Entry<String, String> attribute = getAttribute(); attribute != null; attribute = getAttribute()) {
+                metaProcessor.processAttribute(attribute);
+            }
+            metaProcessor.updateDetectedCharset(detectedCharset);
+            return true;
+        }
+        stream.reset();
+        return false;
+    }
+
+    /**
+     * Read an attribute from the stream
+     *
+     * @return the attribute as a Map.Entry, where the key is the attribute's name and
+     * the value is the attribute's value. If there is no attribute, return null
+     */
+    private Map.Entry<String, String> getAttribute() throws IOException {
+        String name = getAttributeName();
+        if (name == null) return null;
+
+        // No '=' after the name: the attribute has an empty value
+        if (!expect(EQUAL)) return new AbstractMap.SimpleEntry<>(name, "");
+        skipAll(WHITESPACE);
+
+        String value = getAttributeValue();
+        return new AbstractMap.SimpleEntry<>(name, value);
+    }
+
+    /**
+     * Reads a lowercased attribute name, after skipping leading whitespace and slashes.
+     *
+     * @return the name, or null if the tag end '>' was found (and consumed) instead
+     */
+    private String getAttributeName() throws IOException {
+        skipAll(SPACE_OR_SLASH);
+        if (expect(TAG_END)) return null;
+        StringBuilder name = new StringBuilder();
+        // Stop at '=' (unless it is the first character), at '>' or '/', or once whitespace was skipped
+        while (!(peek() == EQUAL && name.length() > 0) &&
+                !(peek() == TAG_END || peek() == SLASH) &&
+                !skipAll(WHITESPACE)) {
+            name.append((char) getLowerCaseChar());
+        }
+        return name.toString();
+    }
+
+    /**
+     * Reads a lowercased attribute value, which is either quoted or unquoted.
+     *
+     * @return the value, without the surrounding quotes if there were any
+     */
+    private String getAttributeValue() throws IOException {
+        StringBuilder value = new StringBuilder();
+        stream.mark(1);
+        byte quote = read();
+        if (contains(QUOTE, quote)) {
+            // Quoted: read up to the matching closing quote, which is consumed
+            for (byte b = getLowerCaseChar(); b != quote; b = getLowerCaseChar()) {
+                value.append((char) b);
+            }
+        } else {
+            // Unquoted: the byte just read belongs to the value, so unread it first
+            stream.reset();
+            for (byte b = getLowerCaseChar(); !contains(SPACE_OR_TAG_END, b); b = getLowerCaseChar()) {
+                value.append((char) b);
+                stream.mark(1);
+            }
+            stream.reset(); // unread the space or tag end
+        }
+        return value.toString();
+    }
+
+    /**
+     * Consumes bytes as long as they belong to the given set; the first byte outside it is unread.
+     *
+     * @return true if at least one byte was skipped
+     */
+    private boolean skipAll(BitSet bitSet) throws IOException {
+        boolean skipped = false;
+        stream.mark(1);
+        for (byte read = read(); contains(bitSet, read); read = read()) {
+            skipped = true;
+            stream.mark(1);
+        }
+        stream.reset();
+        return skipped;
+    }
+
+    /**
+     * @return the next byte, with ASCII uppercase letters converted to lowercase
+     */
+    private byte getLowerCaseChar() throws IOException {
+        byte nextPoint = read();
+        if (nextPoint >= 'A' && nextPoint <= 'Z') nextPoint += 0x20; // lowercase
+        return nextPoint;
+    }
+
+    /**
+     * Consumes an HTML comment.
+     *
+     * @return true if the stream was positioned at a comment start, false otherwise
+     */
+    private boolean processComment() throws IOException {
+        if (!expect(COMMENT_START)) return false;
+        // A '>' right after the comment start ends the comment; otherwise look for the comment end
+        if (!expect(TAG_END)) skipUntil(COMMENT_END);
+        return true;
+    }
+
+    /**
+     * Consumes the given bytes if the stream starts with exactly them.
+     *
+     * @return true if they matched and were consumed; otherwise false, with the stream reset
+     */
+    private boolean expect(byte... expected) throws IOException {
+        stream.mark(expected.length);
+        for (byte b : expected) {
+            byte read = read();
+            if (read != b) {
+                stream.reset();
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Advances one byte at a time until the given byte sequence has been consumed
+     * or the end of the stream is reached.
+     */
+    private void skipUntil(byte... expected) throws IOException {
+        while (!expect(expected)) {
+            if (stream.read() == -1) return;
+        }
+    }
+
+    /**
+     * Compares the upcoming bytes, lowercased, with the given bytes.
+     * The bytes read are consumed; on a mismatch the stream is NOT reset here.
+     *
+     * @return true if all the bytes matched
+     */
+    private boolean readCaseInsensitive(byte... bs) throws IOException {
+        for (byte b : bs) if (getLowerCaseChar() != b) return false;
+        return true;
+    }
+
+    /**
+     * @return the next byte of the stream
+     * @throws IOException if the end of the stream has been reached
+     */
+    private byte read() throws IOException {
+        int r = stream.read();
+        if (r == -1) throw new IOException();
+        return (byte) r;
+    }
+
+    /**
+     * @return the next byte of the stream, without consuming it
+     */
+    private byte peek() throws IOException {
+        stream.mark(1);
+        byte b = read();
+        stream.reset();
+        return b;
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/StandardHtmlEncodingDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/StandardHtmlEncodingDetector.java
new file mode 100644
index 0000000..f9d1a1b
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/StandardHtmlEncodingDetector.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html.charsetdetector;
+
+import org.apache.commons.io.input.BoundedInputStream;
+import org.apache.tika.config.Field;
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+
+import static org.apache.tika.parser.html.charsetdetector.CharsetAliases.getCharsetByLabel;
+
+/**
+ * An encoding detector that tries to respect the spirit of the HTML spec
+ * part 12.2.3 "The input byte stream", or at least the part that is compatible with
+ * the implementation of tika.
+ * <p>
+ * https://html.spec.whatwg.org/multipage/parsing.html#the-input-byte-stream
+ * <p>
+ * If a resource was fetched over HTTP, then HTTP headers should be added to tika metadata
+ * when using {@link #detect}, especially {@link Metadata#CONTENT_TYPE}, as it may contain charset information.
+ * <p>
+ * This encoding detector may return null if no encoding is detected.
+ * It is meant to be used inside a {@link org.apache.tika.detect.CompositeEncodingDetector}.
+ * For instance:
+ * <pre> {@code
+ *     EncodingDetector detector = new CompositeEncodingDetector(
+ *         Arrays.asList(
+ *             new StandardHtmlEncodingDetector(),
+ *             new Icu4jEncodingDetector()));
+ * }</pre>
+ * <p>
+ */
+public final class StandardHtmlEncodingDetector implements EncodingDetector {
+    private static final int META_TAG_BUFFER_SIZE = 8192;
+
+    @Field
+    private int markLimit = META_TAG_BUFFER_SIZE;
+
+    /**
+     * Extracts a charset from a Content-Type HTTP header.
+     *
+     * @param metadata parser metadata
+     * @return a charset if there is one specified, or null
+     */
+    private static Charset charsetFromContentType(Metadata metadata) {
+        String contentType = metadata.get(Metadata.CONTENT_TYPE);
+        // Guard explicitly so the result does not depend on how the helpers treat null
+        if (contentType == null) return null;
+        MediaType mediatype = MediaType.parse(contentType);
+        if (mediatype == null) return null;
+        String charsetLabel = mediatype.getParameters().get("charset");
+        if (charsetLabel == null) return null;
+        return getCharsetByLabel(charsetLabel);
+    }
+
+    /**
+     * Detects the charset of an HTML document from, in order of priority, its byte order mark,
+     * the Content-Type found in the metadata, and its meta tags.
+     *
+     * @param input    the document stream, which must support mark/reset. It is reset to its
+     *                 initial position before returning. If it is null, only the metadata is used.
+     * @param metadata document metadata, possibly holding a {@link Metadata#CONTENT_TYPE}
+     * @return the detected charset, or null if none was detected
+     * @throws IOException if the stream cannot be reset
+     */
+    @Override
+    public Charset detect(InputStream input, Metadata metadata) throws IOException {
+        if (input == null) {
+            // Without a stream, transport-level information is all there is to look at
+            return charsetFromContentType(metadata);
+        }
+
+        int limit = getMarkLimit();
+        input.mark(limit);
+        try {
+            // Never read more than the first markLimit bytes, so that reset() stays possible
+            InputStream limitedStream = new BoundedInputStream(input, limit);
+            PreScanner preScanner = new PreScanner(limitedStream);
+
+            // The order of priority for detection is:
+            // 1. Byte Order Mark
+            Charset detectedCharset = preScanner.detectBOM();
+            // 2. Transport-level information (Content-Type HTTP header)
+            if (detectedCharset == null) detectedCharset = charsetFromContentType(metadata);
+            // 3. HTML <meta> tag
+            if (detectedCharset == null) detectedCharset = preScanner.scan();
+
+            return detectedCharset;
+        } finally {
+            // Hand the stream back at its original position even if detection failed
+            input.reset();
+        }
+    }
+
+    /**
+     * @return how many bytes of the stream may be read for charset detection
+     */
+    public int getMarkLimit() {
+        return markLimit;
+    }
+
+    /**
+     * How far into the stream to read for charset detection.
+     * Default is 8192.
+     */
+    @Field
+    public void setMarkLimit(int markLimit) {
+        this.markLimit = markLimit;
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/charsets/ReplacementCharset.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/charsets/ReplacementCharset.java
new file mode 100644
index 0000000..32b96cf
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/charsets/ReplacementCharset.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html.charsetdetector.charsets;
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CoderResult;
+
+/**
+ * An implementation of the standard "replacement" charset defined by the W3C.
+ * See: https://encoding.spec.whatwg.org/#replacement
+ */
+public class ReplacementCharset extends Charset {
+
+    public ReplacementCharset() {
+        super("replacement", null);
+    }
+
+    /**
+     * @return true only for this charset itself
+     */
+    @Override
+    public boolean contains(Charset cs) {
+        return cs.equals(this);
+    }
+
+    /**
+     * @return false, as {@link #newEncoder()} always throws
+     */
+    @Override
+    public boolean canEncode() {
+        return false;
+    }
+
+    /**
+     * @return a decoder that reports the whole input as a single malformed sequence,
+     * then silently consumes any further input until it is reset
+     */
+    @Override
+    public CharsetDecoder newDecoder() {
+        return new CharsetDecoder(this, Float.MIN_VALUE, 1) {
+            private boolean replacementErrorReturned = false;
+
+            @Override
+            protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
+                if (in.hasRemaining() && !replacementErrorReturned) {
+                    // With the REPLACE action, the base class answers a malformed result with
+                    // OVERFLOW when the output has no room for the replacement character, and
+                    // calls back later. Ask for room first, so that the error is only marked
+                    // as returned once it can actually be acted upon.
+                    if (!out.hasRemaining()) return CoderResult.OVERFLOW;
+                    replacementErrorReturned = true;
+                    return CoderResult.malformedForLength(in.remaining());
+                }
+                // The error was already reported (or there is no input): drop everything else
+                in.position(in.limit());
+                return CoderResult.UNDERFLOW;
+            }
+
+            @Override
+            protected void implReset() {
+                replacementErrorReturned = false;
+            }
+        };
+    }
+
+    /**
+     * @throws UnsupportedOperationException always
+     */
+    @Override
+    public CharsetEncoder newEncoder() {
+        throw new UnsupportedOperationException("This charset does not support encoding");
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/charsets/XUserDefinedCharset.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/charsets/XUserDefinedCharset.java
new file mode 100644
index 0000000..650694a
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/charsetdetector/charsets/XUserDefinedCharset.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html.charsetdetector.charsets;
+
+import org.apache.commons.lang.NotImplementedException;
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.StandardCharsets;
+
+/**
+ * A decode-only implementation of the "x-user-defined" charset, which is not present in Java.
+ * ASCII bytes decode to themselves; the other bytes decode to U+F780 through U+F7FF.
+ */
+public class XUserDefinedCharset extends Charset {
+
+    public XUserDefinedCharset() {
+        super("x-user-defined", null);
+    }
+
+    /**
+     * @return true for this charset itself (every charset contains itself) and for US-ASCII
+     */
+    @Override
+    public boolean contains(Charset cs) {
+        return cs.equals(this) || cs.equals(StandardCharsets.US_ASCII);
+    }
+
+    /**
+     * @return false, as {@link #newEncoder()} always throws
+     */
+    @Override
+    public boolean canEncode() {
+        return false;
+    }
+
+    /**
+     * @return a decoder producing exactly one char per input byte
+     */
+    @Override
+    public CharsetDecoder newDecoder() {
+        return new CharsetDecoder(this, 1, 1) {
+            @Override
+            protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
+                while (true) {
+                    if (!in.hasRemaining()) return CoderResult.UNDERFLOW;
+                    if (!out.hasRemaining()) return CoderResult.OVERFLOW;
+                    byte b = in.get();
+                    // Bytes 0x00-0x7F (non-negative) map to themselves,
+                    // bytes 0x80-0xFF map to 0xF700 plus their unsigned value
+                    out.append((char) ((b >= 0) ? b : 0xF700 + (b & 0xFF)));
+                }
+            }
+        };
+    }
+
+    /**
+     * @throws NotImplementedException always
+     */
+    @Override
+    public CharsetEncoder newEncoder() {
+        throw new NotImplementedException("Encoding to x-user-defined is not implemented");
+    }
+}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/html/StrictHtmlEncodingDetectorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java
similarity index 77%
rename from tika-parsers/src/test/java/org/apache/tika/parser/html/StrictHtmlEncodingDetectorTest.java
rename to tika-parsers/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java
index 1c0da8d..4311887 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/html/StrictHtmlEncodingDetectorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/html/StandardHtmlEncodingDetectorTest.java
@@ -19,17 +19,23 @@ package org.apache.tika.parser.html;
 
 
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector;
+import org.apache.tika.parser.html.charsetdetector.charsets.ReplacementCharset;
 import org.junit.Before;
-import org.junit.Ignore;
 import org.junit.Test;
 
-import java.io.*;
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.SequenceInputStream;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 
+import static org.junit.Assert.assertArrayEquals;
 import static org.junit.Assert.assertEquals;
 
-public class StrictHtmlEncodingDetectorTest {
+public class StandardHtmlEncodingDetectorTest {
     private Metadata metadata = new Metadata();
 
     @Before
@@ -39,6 +45,11 @@ public class StrictHtmlEncodingDetectorTest {
 
     @Test
     public void basic() throws IOException {
+        assertWindows1252("<meta charset=WINDOWS-1252>");
+    }
+
+    @Test
+    public void quoted() throws IOException {
         assertWindows1252("<meta charset='WINDOWS-1252'>");
     }
 
@@ -49,6 +60,22 @@ public class StrictHtmlEncodingDetectorTest {
     }
 
     @Test
+    public void duplicateAttribute() throws IOException {
+        assertWindows1252("<meta charset='WINDOWS-1252' charset='UTF-8'>");
+    }
+
+    @Test
+    public void invalidThenValid() throws IOException {
+        assertCharset("<meta charset=blah>" +
+                "<meta charset=WINDOWS-1252>", null);
+    }
+
+    @Test
+    public void spacesInAttributes() throws IOException {
+        assertWindows1252("<meta charset\u000C=  \t  WINDOWS-1252>");
+    }
+
+    @Test
     public void httpEquiv() throws IOException {
         assertWindows1252("<meta " +
                 "http-equiv='content-type' " +
@@ -59,6 +86,11 @@ public class StrictHtmlEncodingDetectorTest {
     }
 
     @Test
+    public void emptyAttributeEnd() throws IOException {
+        assertWindows1252("<meta charset=WINDOWS-1252 a>");
+    }
+
+    @Test
     public void httpEquivDuplicateCharset() throws IOException {
         assertWindows1252("<meta " +
                 "http-equiv='content-type' " +
@@ -72,19 +104,50 @@ public class StrictHtmlEncodingDetectorTest {
     }
 
     @Test
-    public void verBadHtml() throws IOException {
+    public void veryBadHtml() throws IOException {
         // check that the parser is not confused by garbage before the declaration
         assertWindows1252("<< l \" == / '=x\n >" +
                 "<!--> " +
                 "< <x'/ <=> " +
                 "<meta/>" +
+                "<meta>" +
                 "<a x/>" +
                 "<meta charset='WINDOWS-1252'>");
     }
 
     @Test
+    public void specialTag() throws IOException {
+        // special tags cannot have arguments, any '>' ends them
+        assertWindows1252("<? x='><meta charset='WINDOWS-1252'>");
+    }
+
+    @Test
+    public void longHtml() throws IOException {
+        StringBuilder sb = new StringBuilder("<!doctype html>\n" +
+                "<html>\n" +
+                "<head>\n" +
+                "<title>Hello world</title>\n");
+        String repeated = "<meta x='y' />\n";
+        String charsetMeta = "<meta charset='windows-1252'>";
+
+        while (sb.length() + repeated.length() + charsetMeta.length() < 1024) sb.append(repeated);
+
+        sb.append(charsetMeta);
+
+        assertWindows1252(sb.toString());
+    }
+
+    @Test
+    public void tooLong() throws IOException {
+        // Create a string with 1Mb of '\0' followed by a meta
+        String padded = new String(new byte[1000000]) + "<meta charset='windows-1252'>";
+        // Only the first bytes should be prescanned, so the algorithm should stop before the meta tag
+        assertCharset(padded, null);
+    }
+
+    @Test
     public void incompleteMeta() throws IOException {
-        assertWindows1252("<meta charset='WINDOWS-1252'"); // missing '>' at the end
+        assertCharset("<meta charset='WINDOWS-1252'", null); // missing '>' at the end
     }
 
     @Test
@@ -110,6 +173,13 @@ public class StrictHtmlEncodingDetectorTest {
     }
 
     @Test
+    public void replacement() throws IOException {
+        // Several dangerous charsets are aliases of 'replacement' in the spec
+        String inString = "<meta charset='iso-2022-cn'>";
+        assertCharset(new ByteArrayInputStream(inString.getBytes()), new ReplacementCharset());
+    }
+
+    @Test
     public void iso88591() throws IOException {
         // In the spec, iso-8859-1 is an alias for WINDOWS-1252
         assertWindows1252("<meta charset='iso-8859-1'>");
@@ -226,8 +296,8 @@ public class StrictHtmlEncodingDetectorTest {
     }
 
     @Test
-    public void withUserProvidedCharset() throws IOException {
-        metadata.set(Metadata.CONTENT_ENCODING, "ISO-8859-1");
+    public void withCharsetInContentType() throws IOException {
+        metadata.set(Metadata.CONTENT_TYPE, "text/html; Charset=ISO-8859-1");
         // ISO-8859-1 is an alias for WINDOWS-1252, even if it's set at the transport layer level
         assertWindows1252("");
         assertWindows1252("<meta charset='UTF-8'>");
@@ -262,6 +332,18 @@ public class StrictHtmlEncodingDetectorTest {
         assertCharset(throwAfter("<!doctype html><html attr='x"), null);
     }
 
+    @Test
+    public void streamReset() throws IOException {
+        // The stream should be reset after detection
+        byte[] inBytes = {0,1,2,3,4};
+        byte[] outBytes = new byte[5];
+        InputStream inStream = new ByteArrayInputStream(inBytes);
+        detectCharset(inStream);
+        // The stream should still be readable from the beginning after detection
+        inStream.read(outBytes);
+        assertArrayEquals(inBytes, outBytes);
+    }
+
     private void assertWindows1252(String html) throws IOException {
         assertCharset(html, Charset.forName("WINDOWS-1252"));
     }
@@ -283,7 +365,7 @@ public class StrictHtmlEncodingDetectorTest {
     }
 
     private Charset detectCharset(InputStream inStream) throws IOException {
-        return new StrictHtmlEncodingDetector().detect(inStream, metadata);
+        return new StandardHtmlEncodingDetector().detect(inStream, metadata);
     }
 
     private InputStream throwAfter(String html) {
@@ -295,6 +377,6 @@ public class StrictHtmlEncodingDetectorTest {
                 throw new IOException("test exception");
             }
         };
-        return new SequenceInputStream(contentsInStream, errorThrowing);
+        return new BufferedInputStream(new SequenceInputStream(contentsInStream, errorThrowing));
     }
 }