You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@tika.apache.org by ta...@apache.org on 2018/07/06 15:53:34 UTC

[tika] branch master updated (c9a81a4 -> 790c124)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from c9a81a4  TIKA-2675 -- OpenDocumentParser should fail on invalid zip via Sebastian Nagel and PR-240.
     new 66417f6  improve htmlparser
     new 790c124  TIKA-2673 -- add StrictHtmlEncodingDetector, contributed by Gerard Bouchar

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../java/org/apache/tika/io/TikaInputStream.java   |  11 +-
 .../org/apache/tika/parser/html/HtmlParser.java    |  25 ++
 .../parser/html/StrictHtmlEncodingDetector.java    | 491 +++++++++++++++++++++
 .../tika/parser/html/whatwg-encoding-labels.tsv    | 234 ++++++++++
 .../html/StrictHtmlEncodingDetectorTest.java       | 300 +++++++++++++
 5 files changed, 1060 insertions(+), 1 deletion(-)
 create mode 100644 tika-parsers/src/main/java/org/apache/tika/parser/html/StrictHtmlEncodingDetector.java
 create mode 100644 tika-parsers/src/main/resources/org/apache/tika/parser/html/whatwg-encoding-labels.tsv
 create mode 100644 tika-parsers/src/test/java/org/apache/tika/parser/html/StrictHtmlEncodingDetectorTest.java

[tika] 02/02: TIKA-2673 -- add StrictHtmlEncodingDetector, contributed by Gerard Bouchar

Posted by ta...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 790c1248207371e6cb2a3e7a1ec3a021503ec7a4
Author: TALLISON <ta...@apache.org>
AuthorDate: Fri Jul 6 11:53:24 2018 -0400

    TIKA-2673 -- add StrictHtmlEncodingDetector, contributed by Gerard Bouchar
---
 .../parser/html/StrictHtmlEncodingDetector.java    | 491 +++++++++++++++++++++
 .../tika/parser/html/whatwg-encoding-labels.tsv    | 234 ++++++++++
 .../html/StrictHtmlEncodingDetectorTest.java       | 300 +++++++++++++
 3 files changed, 1025 insertions(+)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/StrictHtmlEncodingDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/StrictHtmlEncodingDetector.java
new file mode 100644
index 0000000..487f747
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/StrictHtmlEncodingDetector.java
@@ -0,0 +1,491 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.metadata.Metadata;
+
+import java.io.*;
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.StandardCharsets;
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import static java.nio.charset.StandardCharsets.*;
+import static org.apache.tika.parser.html.StrictHtmlEncodingDetector.SequenceMatcher.caseInsensitive;
+import static org.apache.tika.parser.html.StrictHtmlEncodingDetector.SingleByteMatcher.matchers;
+
+/**
+ * This is a strict html encoding detector that enforces the standard
+ * far more strictly than the HtmlEncodingDetector.
+ */
+public class StrictHtmlEncodingDetector implements EncodingDetector {
+    /** Name of the resource, located in this class's package, that maps encoding labels to charsets. */
+    private static final String CHARSET_LABEL_FILE = "whatwg-encoding-labels.tsv";
+    /** Label to charset lookup table, loaded once when the class is initialized and never reassigned. */
+    private static final Map<String, Charset> CHARSET_LABELS = getCharsetLabels();
+
+    /**
+     * Loads the label to charset table from the {@code CHARSET_LABEL_FILE} resource stored in
+     * the same package as this class.
+     * <p>
+     * Lines starting with {@code #} and lines with fewer than two tab-separated columns are
+     * skipped. For every remaining line the first column becomes the key and the whole line is
+     * passed to {@code charsetFromStandard} to compute the value.
+     *
+     * @return the label to charset mapping
+     * @throws NullPointerException if the resource is not on the classpath
+     * @throws UncheckedIOException if an {@link IOException} is raised while handling the resource
+     */
+    private static Map<String, Charset> getCharsetLabels() {
+        String packagePath = StrictHtmlEncodingDetector.class.getPackage().getName().replace('.', '/');
+        String resource = '/' + packagePath + '/' + CHARSET_LABEL_FILE;
+        InputStream labelStream = StrictHtmlEncodingDetector.class.getResourceAsStream(resource);
+        Objects.requireNonNull(labelStream, "Missing charset label mapping file : " + resource);
+        try (BufferedReader reader = new BufferedReader(
+                new InputStreamReader(labelStream, StandardCharsets.US_ASCII))) {
+            return reader.lines()
+                    .filter(line -> !line.startsWith("#"))
+                    .map(line -> line.split("\t"))
+                    .filter(columns -> columns.length >= 2)
+                    .collect(Collectors.toMap(
+                            columns -> columns[0],
+                            columns -> charsetFromStandard(columns)
+                    ));
+        } catch (IOException ioFailure) {
+            throw new UncheckedIOException("Unable to read the charset label mapping", ioFailure);
+        }
+    }
+
+    /**
+     * Resolves the charset for one line of the label table.
+     *
+     * @param names the tab-separated columns of the line: {@code names[0]} is the label,
+     *              {@code names[1]} the encoding name, and any further columns are fallback
+     *              charset names that are tried in order
+     * @return the first charset among {@code names[1]}, {@code names[2]}, ... that this JVM
+     *         supports, or ISO-8859-1 when none of them is available
+     */
+    private static Charset charsetFromStandard(String[] names) {
+        for (int i = 1; i < names.length; i++) {
+            try {
+                // Index with i (not a fixed 1) so that the fallback columns are really tried.
+                return Charset.forName(names[i]);
+            } catch (IllegalCharsetNameException | UnsupportedCharsetException ignored) {
+                // This name is not usable on this JVM: try the next column.
+            }
+        }
+        // ISO-8859-1 is a single-byte charset that must be present on every Java platform
+        return StandardCharsets.ISO_8859_1;
+    }
+
+    /**
+     * Looks a charset up by label. The label is trimmed and lower-cased (using
+     * {@link Locale#US}) before the lookup.
+     *
+     * @param label the label to resolve, may be {@code null}
+     * @return the mapped charset, or {@code null} if the label is {@code null} or not in the table
+     */
+    private static Charset getCharsetByLabel(String label) {
+        if (label != null) {
+            String normalized = label.trim().toLowerCase(Locale.US);
+            return CHARSET_LABELS.get(normalized);
+        }
+        return null;
+    }
+
+    /**
+     * Determines the charset of an HTML byte stream. Sources are consulted in this order:
+     * a byte order mark at the start of the stream, the {@code Metadata.CONTENT_ENCODING}
+     * entry of the metadata (when it is a known label), and finally the meta tags found by
+     * the pre-scanner.
+     * <p>
+     * NOTE(review): bytes are read from {@code input} without any mark/reset here, so the
+     * stream is left advanced -- confirm this is what callers expect.
+     *
+     * @param input    the stream to inspect
+     * @param metadata metadata that may carry a charset declared by the user or the transport
+     * @return the detected charset, or {@code null} when none could be determined
+     */
+    @Override
+    public Charset detect(InputStream input, Metadata metadata) throws IOException {
+        PreScanner scanner = new PreScanner(input);
+
+        // A BOM settles the question: nothing else is looked at
+        Charset fromBom = scanner.detectBOM();
+        if (fromBom != null) {
+            return fromBom;
+        }
+
+        // A charset given by the end user or the transport level is assumed to be in the metadata;
+        // a missing entry resolves to null and falls through to the scan
+        Charset fromMetadata = getCharsetByLabel(metadata.get(Metadata.CONTENT_ENCODING));
+        if (fromMetadata != null) {
+            return fromMetadata;
+        }
+
+        return scanner.scan();
+    }
+
+    /**
+     * Byte-level scanner that looks for a byte order mark or a charset declaration in a
+     * {@code <meta>} tag. The input is wrapped in a {@link PushbackInputStream} so that the
+     * matchers can put back bytes they read but did not accept.
+     */
+    static class PreScanner {
+
+        // Finds "charset = value" inside a content attribute; the value may be wrapped in
+        // matching single or double quotes and ends at a quote, whitespace or ';'
+        private static final Pattern META_CHARSET_PATTERN = Pattern.compile("charset\\s*=\\s*([\"']?)([^\"'\\s;]+)\\1");
+        private static ByteMatcher COMMENT_START = new SequenceMatcher("<!--");
+        private static ByteMatcher COMMENT_END = new SequenceMatcher("-->");
+        private static ByteMatcher LETTER = new OrMatcher(
+                new RangeMatcher((byte) 'a', (byte) 'z'),
+                new RangeMatcher((byte) 'A', (byte) 'Z')
+        );
+        // tab, line feed, form feed, carriage return, space
+        private static ByteMatcher SPACE = new OrMatcher(matchers(0x09, 0x0A, 0x0C, 0x0D, 0x20));
+        private static ByteMatcher SLASH = new SingleByteMatcher((byte) '/');
+        private static ByteMatcher EQUAL = new SingleByteMatcher((byte) '=');
+        private static ByteMatcher TAG_END = new SingleByteMatcher((byte) '>');
+        private static ByteMatcher SINGLE_QUOTE = new SingleByteMatcher((byte) '\'');
+        private static ByteMatcher DOUBLE_QUOTE = new SingleByteMatcher((byte) '"');
+        private static ByteMatcher QUOTE = new OrMatcher(SINGLE_QUOTE, DOUBLE_QUOTE);
+        private static ByteMatcher TAG_END_OR_SLASH = new OrMatcher(SLASH, TAG_END);
+        private static ByteMatcher SPACE_OR_SLASH = new OrMatcher(SPACE, SLASH);
+        private static ByteMatcher SPACE_OR_TAG_END = new OrMatcher(SPACE, TAG_END);
+        // "<meta" in any letter case, followed by a space character or a slash
+        private static ByteMatcher META_START = new SequenceMatcher(caseInsensitive("<meta"), SPACE_OR_SLASH);
+        // '<' followed by a slash or an ASCII letter
+        private static ByteMatcher TAG_START = new SequenceMatcher(
+                new SingleByteMatcher((byte) '<'),
+                new OrMatcher(SLASH, LETTER)
+        );
+        // any byte that is neither a space character nor '>'
+        private static ByteMatcher TAG_BODY = new NegativeMatcher(new OrMatcher(SPACE, TAG_END));
+        // '<' followed by '!', '/' or '?'
+        private static ByteMatcher SPECIAL_TAG_START = new SequenceMatcher(
+                new SingleByteMatcher((byte) '<'),
+                new OrMatcher(matchers("!/?"))
+        );
+        private static ByteMatcher UTF8_BOM = new SequenceMatcher(matchers(0xEF, 0xBB, 0xBF));
+        private static ByteMatcher UTF16_BE_BOM = new SequenceMatcher(matchers(0xFE, 0xFF));
+        private static ByteMatcher UTF16_LE_BOM = new SequenceMatcher(matchers(0xFF, 0xFE));
+
+
+        // The scanned input, with room to push back up to 32 bytes
+        PushbackInputStream stream;
+        // Replaced by processMeta() once a meta tag with a usable charset declaration is seen
+        private CharsetDetectionResult detectedCharset = new CharsetDetectionResult();
+
+        /**
+         * @param inputStream the stream to scan; it is wrapped, not copied
+         */
+        public PreScanner(InputStream inputStream) {
+            this.stream = new PushbackInputStream(inputStream, 32);
+        }
+
+        /**
+         * Consumes the stream until a meta tag declares a charset.
+         *
+         * @return the charset of the first accepted declaration (which is {@code null} if its
+         * label is unknown), or {@code null} if the stream ends or fails before one is found
+         */
+        public Charset scan() {
+            while (processAtLeastOneByte()) {
+                if (detectedCharset.isFound()) {
+                    return detectedCharset.getCharset();
+                }
+            }
+            return null;
+        }
+
+        /**
+         * Checks whether the next bytes are a UTF-8, UTF-16BE or UTF-16LE byte order mark.
+         * A matched BOM is consumed.
+         *
+         * @return the corresponding charset, or {@code null} if there is no BOM or reading failed
+         */
+        private Charset detectBOM() {
+            try {
+                if (UTF8_BOM.matches(stream)) return StandardCharsets.UTF_8;
+                else if (UTF16_BE_BOM.matches(stream)) return StandardCharsets.UTF_16BE;
+                else if (UTF16_LE_BOM.matches(stream)) return StandardCharsets.UTF_16LE;
+            } catch (IOException e) { /* stream could not be read, also return null */ }
+            return null;
+        }
+
+        /**
+         * Tries each construct in turn (comment, meta tag, other tag, special tag) and, if none
+         * applies, consumes a single byte.
+         *
+         * @return {@code false} when nothing could be processed (the plain one-byte read hit
+         * the end of the stream) or an {@link IOException} occurred, {@code true} otherwise
+         */
+        private boolean processAtLeastOneByte() {
+            try {
+                // Order matters: the meta check must come before the generic tag check
+                return processComment() ||
+                        processMeta() ||
+                        processTag() ||
+                        processSpecialTag() ||
+                        processAny();
+            } catch (IOException e) {
+                return false;
+            }
+        }
+
+        /** Consumes one byte; returns {@code false} at the end of the stream. */
+        private boolean processAny() throws IOException {
+            int read = stream.read();
+            return read != -1;
+        }
+
+        /** Tells whether the stream has at least one more byte, without consuming it. */
+        private boolean hasBytes() throws IOException {
+            int read = stream.read();
+            if (read != -1) stream.unread(read);
+            return read != -1;
+        }
+
+        /**
+         * If the stream is at {@code <!--}, skips forward to just after the next {@code -->}.
+         *
+         * @return {@code true} if a comment start was found and its end was reached,
+         * {@code false} if there is no comment here or the stream ended inside it
+         */
+        private boolean processComment() throws IOException {
+            if (COMMENT_START.matches(stream)) {
+                // The two '-' in the '-->' sequence can be the same as those in the '<!--' sequence.
+                stream.unread("--".getBytes(StandardCharsets.US_ASCII));
+                return COMMENT_END.advanceUntilMatches(stream);
+            }
+            return false;
+        }
+
+        /**
+         * If the stream is at the start of a tag, skips the tag name and all of its attributes.
+         * The closing {@code >} itself is left in the stream.
+         *
+         * @return {@code true} if a tag was skipped
+         */
+        private boolean processTag() throws IOException {
+            if (TAG_START.matches(stream)) {
+                TAG_BODY.skipAll(stream);
+                while (getAttribute() != null) {/*ignore the attribute*/}
+                return true;
+            }
+            return false;
+        }
+
+        /**
+         * If the stream is at {@code <!}, {@code </} or {@code <?}, skips up to and including
+         * the next {@code >}.
+         *
+         * @return {@code true} if such a construct was found and its {@code >} was reached
+         */
+        private boolean processSpecialTag() throws IOException {
+            if (SPECIAL_TAG_START.matches(stream)) {
+                TAG_BODY.skipAll(stream);
+                return TAG_END.advanceUntilMatches(stream);
+            }
+            return false;
+        }
+
+        /**
+         * If the stream is at a meta tag, reads its attributes and looks for a charset
+         * declaration, either in a {@code charset} attribute or in a {@code content} attribute
+         * accompanied by {@code http-equiv="content-type"}.
+         *
+         * @return {@code true} if the tag carried an accepted declaration, in which case
+         * {@code detectedCharset} has been updated; {@code false} otherwise (bytes of the tag
+         * may still have been consumed)
+         */
+        private boolean processMeta() throws IOException {
+            if (META_START.matches(stream)) {
+                Set<String> attributeNames = new HashSet<>();
+                boolean gotPragma = false;
+                // null: no charset information seen; true: it came from a content attribute and
+                // is only valid with the http-equiv pragma; false: it came from a charset attribute
+                Boolean needPragma = null;
+                CharsetDetectionResult charset = new CharsetDetectionResult();
+                while (hasBytes()) {
+                    Attribute attribute = getAttribute();
+                    if (attribute == null) break;
+                    // Only the first occurrence of an attribute name is taken into account
+                    if (attributeNames.contains(attribute.getName())) continue;
+                    attributeNames.add(attribute.getName());
+                    switch (attribute.getName()) {
+                        case "http-equiv":
+                            if (attribute.getValue().equals("content-type"))
+                                gotPragma = true;
+                            break;
+                        case "content":
+                            String charsetName = getEncodingFromMeta(attribute.getValue());
+                            if (!charset.isFound() && charsetName != null) {
+                                charset.find(charsetName);
+                                needPragma = true;
+                            }
+                            break;
+                        case "charset":
+                            charset.find(attribute.getValue());
+                            needPragma = false;
+                            break;
+                        default: // Ignore non-charset related attributes
+                    }
+                }
+                // Accept unless nothing was declared, or the pragma was required but missing
+                if (needPragma != null && !(needPragma && !gotPragma)) {
+                    detectedCharset = charset;
+                    return true;
+                }
+            }
+            return false;
+        }
+
+        /**
+         * Extracts the charset name from the value of a content attribute.
+         *
+         * @return the name following {@code charset=}, or {@code null} if there is none
+         */
+        private String getEncodingFromMeta(String attributeValue) {
+            Matcher matcher = META_CHARSET_PATTERN.matcher(attributeValue);
+            if (!matcher.find()) return null;
+            return matcher.group(2);
+        }
+
+        /**
+         * Reads the next attribute of the current tag. ASCII upper-case letters in the name and
+         * the value are converted to lower case.
+         *
+         * @return the attribute (with an empty value if it has none), or {@code null} if the
+         * next significant byte is the closing {@code >}, which is left in the stream
+         * @throws IOException if the stream ends in the middle of the attribute
+         */
+        private Attribute getAttribute() throws IOException {
+            SPACE_OR_SLASH.skipAll(stream);
+            if (TAG_END.peekMatches(stream)) return null;
+            StringBuilder name = new StringBuilder();
+            // An '=' only ends the name if the name is not empty
+            while (!EQUAL.peekMatches(stream) || name.length() == 0) {
+                if (TAG_END_OR_SLASH.peekMatches(stream)) {
+                    break;
+                } else if (SPACE.peekMatches(stream)) {
+                    SPACE.skipAll(stream);
+                    break;
+                } else {
+                    name.append(getLowerCaseChar());
+                }
+            }
+
+            // No '=' after the name: the attribute has no value
+            if (!EQUAL.matches(stream)) return new Attribute(name.toString(), "");
+            SPACE.skipAll(stream);
+
+            StringBuilder value = new StringBuilder();
+            byte[] quoteMatched = QUOTE.match(stream);
+            if (quoteMatched != null) {
+                // Quoted value: read up to the matching quote, which is consumed but not kept
+                char quote = (char) quoteMatched[0];
+                int nextChar = -1;
+                while (nextChar != quote) {
+                    if (nextChar != -1) value.append((char) nextChar);
+                    nextChar = getLowerCaseChar();
+                }
+            } else {
+                // Unquoted value: read up to the next space character or '>'
+                while (!SPACE_OR_TAG_END.peekMatches(stream)) {
+                    value.append(getLowerCaseChar());
+                }
+            }
+            return new Attribute(name.toString(), value.toString());
+        }
+
+        /**
+         * Reads one byte and returns it as a char, with 'A'-'Z' mapped to 'a'-'z'.
+         *
+         * @throws IOException if the end of the stream has been reached
+         */
+        private char getLowerCaseChar() throws IOException {
+            int nextPoint = stream.read();
+            if (nextPoint == -1) throw new IOException();
+            if (nextPoint >= 'A' && nextPoint <= 'Z') nextPoint += 0x20; // lowercase
+            return (char) nextPoint;
+        }
+    }
+
+    /** A name/value pair read from a tag; both parts are stored exactly as given. */
+    static class Attribute {
+        String name;
+        String value;
+
+        /**
+         * @param name  the attribute name
+         * @param value the attribute value
+         */
+        public Attribute(String name, String value) {
+            this.value = value;
+            this.name = name;
+        }
+
+        /** @return the attribute value */
+        public String getValue() {
+            return this.value;
+        }
+
+        /** @return the attribute name */
+        public String getName() {
+            return this.name;
+        }
+    }
+
+    /**
+     * A detection may either not find a charset, find an invalid charset, or find a valid charset
+     */
+    static class CharsetDetectionResult {
+        private boolean found = false;
+        private Charset charset = null;
+
+        public CharsetDetectionResult() { /* default result: not found */}
+
+        public boolean isFound() {
+            return found;
+        }
+
+        public void find(String charsetName) {
+            this.found = true;
+            charsetName = charsetName.trim();
+            if ("x-user-defined".equals(charsetName)) charsetName = "windows-1252";
+            this.charset = getCharsetByLabel(charsetName);
+            // The specification states: If charset is a UTF-16 encoding, then set charset to UTF-8.
+            if (UTF_16LE.equals(charset) || UTF_16BE.equals(charset)) charset = UTF_8;
+        }
+
+        public Charset getCharset() {
+            // the result may be null even if found is true, in the case there is a charset specified,
+            // but it is invalid
+            return charset;
+        }
+    }
+
+    /**
+     * Base class of the byte matchers. A matcher reads from a {@link PushbackInputStream} and
+     * either accepts and consumes some bytes, or reports a failure.
+     */
+    static abstract class ByteMatcher {
+
+        /**
+         * Tries to match at the current position of the stream.
+         *
+         * @return the bytes that were consumed, or {@code null} if the matcher does not match
+         */
+        abstract byte[] match(PushbackInputStream pushbackInputStream) throws IOException;
+
+        /** Same as {@link #match(PushbackInputStream)}, reduced to a success flag. */
+        boolean matches(PushbackInputStream pushbackInputStream) throws IOException {
+            byte[] consumed = this.match(pushbackInputStream);
+            return consumed != null;
+        }
+
+        /**
+         * Drops bytes one at a time until this matcher matches; the matched bytes are consumed.
+         *
+         * @return {@code false} if the end of the stream is reached without a match
+         */
+        boolean advanceUntilMatches(PushbackInputStream pushbackInputStream) throws IOException {
+            for (;;) {
+                if (this.matches(pushbackInputStream)) return true;
+                if (pushbackInputStream.read() == -1) return false;
+            }
+        }
+
+        /** Consumes as many consecutive matches as the stream offers. */
+        void skipAll(PushbackInputStream pushbackInputStream) throws IOException {
+            boolean consumed;
+            do {
+                consumed = matches(pushbackInputStream);
+            } while (consumed);
+        }
+
+        /** Tests for a match and pushes the matched bytes back, so that nothing is consumed. */
+        public boolean peekMatches(PushbackInputStream pushbackInputStream) throws IOException {
+            byte[] consumed = this.match(pushbackInputStream);
+            if (consumed == null) return false;
+            pushbackInputStream.unread(consumed);
+            return true;
+        }
+    }
+
+    /** Matches exactly one byte with a given value. */
+    static class SingleByteMatcher extends ByteMatcher {
+        private final byte b;
+
+        /**
+         * @param b the byte value to match
+         */
+        public SingleByteMatcher(byte b) {
+            this.b = b;
+        }
+
+        /** Creates one matcher per char of {@code s}; each char is narrowed to a byte. */
+        public static ByteMatcher[] matchers(String s) {
+            return matchers(s.chars());
+        }
+
+        /** Creates one matcher per given value; each value is narrowed to a byte. */
+        public static ByteMatcher[] matchers(int... bytes) {
+            return matchers(IntStream.of(bytes));
+        }
+
+        /** Creates one matcher per element of the stream; each element is narrowed to a byte. */
+        public static ByteMatcher[] matchers(IntStream byteStream) {
+            return byteStream
+                    .mapToObj(i -> new SingleByteMatcher((byte) i))
+                    .toArray(ByteMatcher[]::new);
+        }
+
+        /**
+         * Consumes the next byte if it equals the expected value.
+         *
+         * @return a one-byte array holding the matched byte, or {@code null} (with nothing
+         * consumed) if the next byte differs or the end of the stream has been reached
+         */
+        @Override
+        byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
+            int read = pushbackInputStream.read();
+            // read() reports the end of the stream as -1, and (byte) -1 equals (byte) 0xFF.
+            // Rule EOF out first, otherwise a matcher for 0xFF would "match" the end of the
+            // stream and callers could push a byte back that was never in the input.
+            if (read == -1) return null;
+            if ((byte) read == b) return new byte[]{b};
+            pushbackInputStream.unread(read);
+            return null;
+        }
+    }
+
+    /** Matches when all of its matchers match one after the other. */
+    static class SequenceMatcher extends ByteMatcher {
+        private ByteMatcher[] matchers;
+
+        /**
+         * @param matchers the matchers to apply, in order
+         */
+        public SequenceMatcher(ByteMatcher... matchers) {
+            this.matchers = matchers;
+        }
+
+        /** Builds a matcher for the exact sequence of chars of {@code s}. */
+        public SequenceMatcher(String s) {
+            this(matchers(s));
+        }
+
+        /**
+         * Builds a matcher for {@code s} where every position accepts either the lower-case or
+         * the upper-case form (as computed with {@link Locale#US}) of the char.
+         */
+        public static SequenceMatcher caseInsensitive(String s) {
+            ByteMatcher[] lower = matchers(s.toLowerCase(Locale.US));
+            ByteMatcher[] upper = matchers(s.toUpperCase(Locale.US));
+            int length = Math.min(lower.length, upper.length);
+            OrMatcher[] eitherCase = new OrMatcher[length];
+            for (int i = 0; i < length; i++) {
+                eitherCase[i] = new OrMatcher(lower[i], upper[i]);
+            }
+            return new SequenceMatcher(eitherCase);
+        }
+
+        /**
+         * Applies the matchers in order.
+         *
+         * @return all the bytes consumed by the matchers, or {@code null} if one of them fails,
+         * in which case the bytes consumed so far are pushed back
+         */
+        @Override
+        byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
+            ByteArrayOutputStream consumed = new ByteArrayOutputStream();
+            for (int i = 0; i < matchers.length; i++) {
+                byte[] part = matchers[i].match(pushbackInputStream);
+                if (part != null) {
+                    consumed.write(part);
+                    continue;
+                }
+                // Failure: restore what the previous matchers have taken
+                pushbackInputStream.unread(consumed.toByteArray());
+                return null;
+            }
+            return consumed.toByteArray();
+        }
+    }
+
+    /** Matches when at least one of its matchers matches; they are tried in order. */
+    static class OrMatcher extends ByteMatcher {
+        private ByteMatcher[] matchers;
+
+        /**
+         * @param matchers the alternatives, in the order in which they are tried
+         */
+        public OrMatcher(ByteMatcher... matchers) {
+            this.matchers = matchers;
+        }
+
+        /**
+         * @return the bytes consumed by the first alternative that matches, or {@code null} if
+         * none does
+         */
+        @Override
+        byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
+            byte[] result = null;
+            for (int i = 0; result == null && i < matchers.length; i++) {
+                result = matchers[i].match(pushbackInputStream);
+            }
+            return result;
+        }
+    }
+
+    /** Matches a single byte at any position where the wrapped matcher does not match. */
+    static class NegativeMatcher extends ByteMatcher {
+        private ByteMatcher matcher;
+
+        /**
+         * @param matcher the matcher whose result is negated
+         */
+        public NegativeMatcher(ByteMatcher matcher) {
+            this.matcher = matcher;
+        }
+
+        /**
+         * @return the next byte if the wrapped matcher fails here; {@code null} if the wrapped
+         * matcher succeeds (its bytes are pushed back) or the end of the stream is reached
+         */
+        @Override
+        byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
+            byte[] inner = matcher.match(pushbackInputStream);
+            if (inner != null) {
+                pushbackInputStream.unread(inner);
+                return null;
+            }
+            int next = pushbackInputStream.read();
+            return next == -1 ? null : new byte[]{(byte) next};
+        }
+    }
+
+    /** Matches one byte whose unsigned value lies in an inclusive range. */
+    static class RangeMatcher extends ByteMatcher {
+        private final byte low;
+        private final byte high;
+
+        /**
+         * @param low  the lowest accepted byte value (inclusive)
+         * @param high the highest accepted byte value (inclusive)
+         */
+        public RangeMatcher(byte low, byte high) {
+            this.low = low;
+            this.high = high;
+        }
+
+
+        /**
+         * Consumes the next byte if it is inside the range.
+         *
+         * @return a one-byte array holding the matched byte, or {@code null} (with nothing
+         * consumed) if the next byte is outside the range or the end of the stream is reached
+         */
+        @Override
+        byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
+            int read = pushbackInputStream.read();
+            // -1 means end of stream and must never be taken for a byte of the range
+            if (read == -1) return null;
+            // read() yields 0..255 whereas the byte bounds are signed: compare against their
+            // unsigned values so that bounds of 0x80 and above behave as expected
+            if (read >= (low & 0xFF) && read <= (high & 0xFF)) return new byte[]{(byte) read};
+            pushbackInputStream.unread(read);
+            return null;
+        }
+    }
+}
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/html/whatwg-encoding-labels.tsv b/tika-parsers/src/main/resources/org/apache/tika/parser/html/whatwg-encoding-labels.tsv
new file mode 100644
index 0000000..92ddecb
--- /dev/null
+++ b/tika-parsers/src/main/resources/org/apache/tika/parser/html/whatwg-encoding-labels.tsv
@@ -0,0 +1,234 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+unicode-1-1-utf-8	UTF-8
+#
+# label	encoding	fallback
+utf-8	UTF-8
+utf8	UTF-8
+866	IBM866
+cp866	IBM866
+csibm866	IBM866
+ibm866	IBM866
+csisolatin2	ISO-8859-2
+iso-8859-2	ISO-8859-2
+iso-ir-101	ISO-8859-2
+iso8859-2	ISO-8859-2
+iso88592	ISO-8859-2
+iso_8859-2	ISO-8859-2
+iso_8859-2:1987	ISO-8859-2
+l2	ISO-8859-2
+latin2	ISO-8859-2
+csisolatin3	ISO-8859-3
+iso-8859-3	ISO-8859-3
+iso-ir-109	ISO-8859-3
+iso8859-3	ISO-8859-3
+iso88593	ISO-8859-3
+iso_8859-3	ISO-8859-3
+iso_8859-3:1988	ISO-8859-3
+l3	ISO-8859-3
+latin3	ISO-8859-3
+csisolatin4	ISO-8859-4
+iso-8859-4	ISO-8859-4
+iso-ir-110	ISO-8859-4
+iso8859-4	ISO-8859-4
+iso88594	ISO-8859-4
+iso_8859-4	ISO-8859-4
+iso_8859-4:1988	ISO-8859-4
+l4	ISO-8859-4
+latin4	ISO-8859-4
+csisolatincyrillic	ISO-8859-5
+cyrillic	ISO-8859-5
+iso-8859-5	ISO-8859-5
+iso-ir-144	ISO-8859-5
+iso8859-5	ISO-8859-5
+iso88595	ISO-8859-5
+iso_8859-5	ISO-8859-5
+iso_8859-5:1988	ISO-8859-5
+arabic	ISO-8859-6
+asmo-708	ISO-8859-6
+csiso88596e	ISO-8859-6
+csiso88596i	ISO-8859-6
+csisolatinarabic	ISO-8859-6
+ecma-114	ISO-8859-6
+iso-8859-6	ISO-8859-6
+iso-8859-6-e	ISO-8859-6
+iso-8859-6-i	ISO-8859-6
+iso-ir-127	ISO-8859-6
+iso8859-6	ISO-8859-6
+iso88596	ISO-8859-6
+iso_8859-6	ISO-8859-6
+iso_8859-6:1987	ISO-8859-6
+csisolatingreek	ISO-8859-7
+ecma-118	ISO-8859-7
+elot_928	ISO-8859-7
+greek	ISO-8859-7
+greek8	ISO-8859-7
+iso-8859-7	ISO-8859-7
+iso-ir-126	ISO-8859-7
+iso8859-7	ISO-8859-7
+iso88597	ISO-8859-7
+iso_8859-7	ISO-8859-7
+iso_8859-7:1987	ISO-8859-7
+sun_eu_greek	ISO-8859-7
+csiso88598e	ISO-8859-8
+csisolatinhebrew	ISO-8859-8
+hebrew	ISO-8859-8
+iso-8859-8	ISO-8859-8
+iso-8859-8-e	ISO-8859-8
+iso-ir-138	ISO-8859-8
+iso8859-8	ISO-8859-8
+iso88598	ISO-8859-8
+iso_8859-8	ISO-8859-8
+iso_8859-8:1988	ISO-8859-8
+visual	ISO-8859-8
+csiso88598i	ISO-8859-8-I	ISO-8859-8
+iso-8859-8-i	ISO-8859-8-I	ISO-8859-8
+logical	ISO-8859-8-I	ISO-8859-8
+csisolatin6	ISO-8859-10	ISO-8859-4
+iso-8859-10	ISO-8859-10	ISO-8859-4
+iso-ir-157	ISO-8859-10	ISO-8859-4
+iso8859-10	ISO-8859-10	ISO-8859-4
+iso885910	ISO-8859-10	ISO-8859-4
+l6	ISO-8859-10	ISO-8859-4
+latin6	ISO-8859-10	ISO-8859-4
+iso-8859-13	ISO-8859-13
+iso8859-13	ISO-8859-13
+iso885913	ISO-8859-13
+iso-8859-14	ISO-8859-14	ISO-8859-1
+iso8859-14	ISO-8859-14	ISO-8859-1
+iso885914	ISO-8859-14	ISO-8859-1
+csisolatin9	ISO-8859-15
+iso-8859-15	ISO-8859-15
+iso8859-15	ISO-8859-15
+iso885915	ISO-8859-15
+iso_8859-15	ISO-8859-15
+l9	ISO-8859-15
+iso-8859-16	ISO-8859-16	ISO-8859-1
+cskoi8r	KOI8-R
+koi	KOI8-R
+koi8	KOI8-R
+koi8-r	KOI8-R
+koi8_r	KOI8-R
+koi8-ru	KOI8-U
+koi8-u	KOI8-U
+csmacintosh	x-MacRoman
+mac	x-MacRoman
+macintosh	x-MacRoman
+x-mac-roman	x-MacRoman
+dos-874	windows-874
+iso-8859-11	windows-874
+iso8859-11	windows-874
+iso885911	windows-874
+tis-620	windows-874
+windows-874	windows-874
+cp1250	windows-1250
+windows-1250	windows-1250
+x-cp1250	windows-1250
+cp1251	windows-1251
+windows-1251	windows-1251
+x-cp1251	windows-1251
+ansi_x3.4-1968	windows-1252
+ascii	windows-1252
+cp1252	windows-1252
+cp819	windows-1252
+csisolatin1	windows-1252
+ibm819	windows-1252
+iso-8859-1	windows-1252
+iso-ir-100	windows-1252
+iso8859-1	windows-1252
+iso88591	windows-1252
+iso_8859-1	windows-1252
+iso_8859-1:1987	windows-1252
+l1	windows-1252
+latin1	windows-1252
+us-ascii	windows-1252
+windows-1252	windows-1252
+x-cp1252	windows-1252
+cp1253	windows-1253
+windows-1253	windows-1253
+x-cp1253	windows-1253
+cp1254	windows-1254
+csisolatin5	windows-1254
+iso-8859-9	windows-1254
+iso-ir-148	windows-1254
+iso8859-9	windows-1254
+iso88599	windows-1254
+iso_8859-9	windows-1254
+iso_8859-9:1989	windows-1254
+l5	windows-1254
+latin5	windows-1254
+windows-1254	windows-1254
+x-cp1254	windows-1254
+cp1255	windows-1255
+windows-1255	windows-1255
+x-cp1255	windows-1255
+cp1256	windows-1256
+windows-1256	windows-1256
+x-cp1256	windows-1256
+cp1257	windows-1257
+windows-1257	windows-1257
+x-cp1257	windows-1257
+cp1258	windows-1258
+windows-1258	windows-1258
+x-cp1258	windows-1258
+x-mac-cyrillic	x-MacCyrillic
+x-mac-ukrainian	x-MacCyrillic
+chinese	GBK
+csgb2312	GBK
+csiso58gb231280	GBK
+gb2312	GBK
+gb_2312	GBK
+gb_2312-80	GBK
+gbk	GBK
+iso-ir-58	GBK
+x-gbk	GBK
+gb18030	gb18030
+big5	Big5
+big5-hkscs	Big5
+cn-big5	Big5
+csbig5	Big5
+x-x-big5	Big5
+cseucpkdfmtjapanese	EUC-JP
+euc-jp	EUC-JP
+x-euc-jp	EUC-JP
+csiso2022jp	ISO-2022-JP
+iso-2022-jp	ISO-2022-JP
+csshiftjis	Shift_JIS
+ms932	Shift_JIS
+ms_kanji	Shift_JIS
+shift-jis	Shift_JIS
+shift_jis	Shift_JIS
+sjis	Shift_JIS
+windows-31j	Shift_JIS
+x-sjis	Shift_JIS
+cseuckr	EUC-KR
+csksc56011987	EUC-KR
+euc-kr	EUC-KR
+iso-ir-149	EUC-KR
+korean	EUC-KR
+ks_c_5601-1987	EUC-KR
+ks_c_5601-1989	EUC-KR
+ksc5601	EUC-KR
+ksc_5601	EUC-KR
+windows-949	EUC-KR
+csiso2022kr	replacement
+hz-gb-2312	replacement
+iso-2022-cn	replacement
+iso-2022-cn-ext	replacement
+iso-2022-kr	replacement
+replacement	replacement
+utf-16be	UTF-16BE
+utf-16	UTF-16LE
+utf-16le	UTF-16LE
+x-user-defined	x-user-defined
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/html/StrictHtmlEncodingDetectorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/html/StrictHtmlEncodingDetectorTest.java
new file mode 100644
index 0000000..1c0da8d
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/html/StrictHtmlEncodingDetectorTest.java
@@ -0,0 +1,300 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.html;
+
+
+import org.apache.tika.metadata.Metadata;
+import org.junit.Before;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import java.io.*;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+import static org.junit.Assert.assertEquals;
+
+public class StrictHtmlEncodingDetectorTest { // feeds small HTML snippets to StrictHtmlEncodingDetector.detect()
+    private Metadata metadata = new Metadata(); // handed to every detect() call; replaced in setUp()
+
+    @Before
+    public void setUp() { // each test starts from a new Metadata instance
+        this.metadata = new Metadata();
+    }
+
+    @Test
+    public void basic() throws IOException { // a lone <meta charset> declaration
+        assertWindows1252("<meta charset='WINDOWS-1252'>");
+    }
+
+    @Test
+    public void duplicateMeta() throws IOException { // the first of two declarations is the expected one
+        assertWindows1252("<meta charset='WINDOWS-1252'>" +
+                "<meta charset='UTF-8'>");
+    }
+
+    @Test
+    public void httpEquiv() throws IOException { // http-equiv/content form, attributes in either order
+        assertWindows1252("<meta " +
+                "http-equiv='content-type' " +
+                "content='text/html; charset=\"WINDOWS-1252\"'>"); // quotes around the charset are allowed
+        assertWindows1252("<meta " +
+                "content=' charset  =  WINDOWS-1252' " + // The charset may be anywhere in the content attribute
+                "http-equiv='content-type' >");
+    }
+
+    @Test
+    public void httpEquivDuplicateCharset() throws IOException { // two charset= entries in one content attribute
+        assertWindows1252("<meta " +
+                "http-equiv='content-type' " +
+                "content='charset=WINDOWS-1252;" + // The detection should stop after the semicolon
+                "charset=UTF-8'>");
+    }
+
+    @Test
+    public void htmlFragment() throws IOException { // declaration after a doctype and unclosed html/head start tags
+        assertWindows1252("<!doctype html><html class=nojs><head><meta charset='WINDOWS-1252'>");
+    }
+
+    @Test
+    public void verBadHtml() throws IOException {
+        // check that the parser is not confused by garbage before the declaration
+        assertWindows1252("<< l \" == / '=x\n >" +
+                "<!--> " +
+                "< <x'/ <=> " +
+                "<meta/>" +
+                "<a x/>" +
+                "<meta charset='WINDOWS-1252'>");
+    }
+
+    @Test
+    public void incompleteMeta() throws IOException { // input ends inside the meta tag
+        assertWindows1252("<meta charset='WINDOWS-1252'"); // missing '>' at the end
+    }
+
+    @Test
+    public void charsetWithWhiteSpaces() throws IOException { // spaces, tabs and newlines around the quoted label
+        assertWindows1252("<meta charset='   \t\n  WINDOWS-1252 \t\n'>");
+    }
+
+    @Test
+    public void mixedCase() throws IOException { // tag name, attribute name and label in mixed case
+        assertWindows1252("<mEtA chArsEt='WInDOWs-1252'>");
+    }
+
+    @Test
+    public void utf16() throws IOException {
+        // According to the specification 'If charset is a UTF-16 encoding, then set charset to UTF-8.'
+        assertCharset("<meta charset='UTF-16BE'>", StandardCharsets.UTF_8);
+    }
+
+    @Test
+    public void xUserDefined() throws IOException {
+        // According to the specification 'If charset is x-user-defined, then set charset to windows-1252.'
+        assertWindows1252("<meta charset='x-user-defined'>");
+    }
+
+    @Test
+    public void iso88591() throws IOException {
+        // In the spec, iso-8859-1 is an alias for WINDOWS-1252
+        assertWindows1252("<meta charset='iso-8859-1'>");
+    }
+
+    @Test
+    public void macintoshEncoding() throws IOException {
+        // The mac roman encoding exists in java, but under the name x-MacRoman
+        assertCharset("<meta charset='macintosh'>", Charset.forName("x-MacRoman"));
+    }
+
+    @Test
+    public void bom() throws IOException { // the snippet is encoded with the expected charset, BOM included
+        // A BOM should have precedence over the meta
+        assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_8);
+        assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16LE);
+        assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16BE);
+    }
+
+    @Test
+    public void withSlash() throws IOException { // '/' instead of whitespace between tag name and attribute
+        assertWindows1252("<meta/charset='WINDOWS-1252'>");
+    }
+
+    @Test
+    public void insideDescription() throws IOException { // "charset=" text inside a description meta is not a declaration
+        assertWindows1252("<meta name='description'" +
+                "content='If I write charset=UTF-8 here, it doesnt mean the page is in UTF-8'/>" +
+                "<meta charset='WINDOWS-1252'>");
+    }
+
+    @Test
+    public void insideTag() throws IOException { // meta-looking text inside another tag must be skipped
+        assertWindows1252("<tag " +
+                "attribute=\"<meta charset='UTF-8'>\" " + // inside attribute
+                "<meta charset='UTF-8' " + // still inside tag
+                "/>" + // tag end
+                "<meta charset='WINDOWS-1252'>");
+    }
+
+    @Test
+    public void missingAttribute() throws IOException { // content= alone, without http-equiv, is skipped
+        assertWindows1252(
+                "<meta content='charset=UTF-8'>" + // missing http-equiv attribute
+                        "<meta charset='WINDOWS-1252'>" // valid declaration
+        );
+    }
+
+    @Test
+    public void insideSpecialTag() throws IOException {
+        // Content inside <?, <!, and </ should be ignored
+        for (byte b : "?!/".getBytes(StandardCharsets.US_ASCII))
+            assertWindows1252(
+                    "<" + (char) b + // opens a <?, <! or </ construct, one per iteration
+                            "<meta charset='UTF-8'>" + // inside special tag
+                            "<meta charset='WINDOWS-1252'>" // real charset declaration
+            );
+    }
+
+    @Test
+    public void spaceBeforeTag() throws IOException { // "< meta" with a space is not a meta tag
+        assertWindows1252(
+                "< meta charset='UTF-8'>" + // invalid charset declaration
+                        "<meta charset='WINDOWS-1252'>" // real charset declaration
+        );
+    }
+
+    @Test
+    public void invalidAttribute() throws IOException { // an attribute merely ending in "charset" is skipped
+        assertWindows1252(
+                "<meta " +
+                        "badcharset='UTF-8' " + // invalid charset declaration
+                        "charset='WINDOWS-1252'>" // real charset declaration
+        );
+    }
+
+    @Test
+    public void unmatchedQuote() throws IOException { // opening double quote inside content= is never closed
+        assertWindows1252(
+                "<meta http-equiv='content-type' content='charset=\"UTF-8'>" + // invalid charset declaration
+                        "<meta charset='WINDOWS-1252'>" // real charset declaration
+        );
+    }
+
+    @Test
+    public void realWorld() throws IOException { // page head cut off mid-tag; declaration follows a script, title and metas
+        assertWindows1252("<!DOCTYPE html>\n" +
+                "<html lang=\"fr\">\n" +
+                "<head>\n" +
+                "<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':\n" +
+                "\t\t\tnew Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],\n" +
+                "\t\t\tj=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=\n" +
+                "\t\t\t'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);\n" +
+                "\t\t\t})(window,document,'script','dataLayer','GTM-PNX8H8X');</script>\n" +
+                "<title>Horaires Transilien 2018 - Lignes A B C D E H J K L N P R U</title>\n" +
+                "<meta name=\"description\" content=\"Consultez les horaires du Transilien en temps réel. Lignes A et B du RER. Lignes C D E H J K L N P R U du Transilien.\">\n" +
+                "<meta name=\"keywords\" content=\"horaires transilien\">\n" +
+                "<meta charset=\"windows-1252\">\n" +
+                "<meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n" +
+                "<meta name=\"robots\" content=\"follow, index\">\n" +
+                "<base hr");
+    }
+
+    @Test
+    public void withCompactComment() throws IOException {
+        // <!--> is a valid comment
+        assertWindows1252(
+                "<!--" + // start comment
+                        "<meta charset='UTF-8'>" + // inside comment
+                        "-->" + // end comment
+                        "<!-->" + // compact comment
+                        "<meta charset='WINDOWS-1252'>" // outside comment, charset declaration
+        );
+    }
+
+    @Test
+    public void withUserProvidedCharset() throws IOException { // Metadata.CONTENT_ENCODING set before detection
+        metadata.set(Metadata.CONTENT_ENCODING, "ISO-8859-1");
+        // ISO-8859-1 is an alias for WINDOWS-1252, even if it's set at the transport layer level
+        assertWindows1252("");
+        assertWindows1252("<meta charset='UTF-8'>");
+        assertWindows1252("<meta http-equiv='content-type' content='charset=utf-8'>");
+        // if a BOM is present, it has precedence over transport layer information
+        assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_8);
+        assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16LE);
+        assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16BE);
+    }
+
+    @Test
+    public void throwResistance() throws IOException { // uses throwAfter(): the stream throws once the snippet is consumed
+        // The preprocessing should return right after having found the charset
+        // So if an error is thrown in the stream AFTER the declaration,
+        // it shouldn't see it
+        assertWindows1252(throwAfter("<meta charset='WINDOWS-1252'>"));
+        assertWindows1252(throwAfter("<meta charset='WINDOWS-1252'><some other tag"));
+
+        // But if an error is thrown before the end of the meta tag, it should see it
+        // and return unsuccessfully
+        assertCharset(throwAfter("<meta charset='WINDOWS-1252'"), null);
+
+        // If there is no meta, but an error is thrown, the detector simply returns
+        // unsuccessfully (it should not throw runtime errors)
+        assertCharset(throwAfter("<"), null);
+        assertCharset(throwAfter("<!"), null);
+        assertCharset(throwAfter("<!doctype"), null);
+        assertCharset(throwAfter("<!doctype html><html"), null);
+        assertCharset(throwAfter("<!doctype html><html attr"), null);
+        assertCharset(throwAfter("<!doctype html><html attr="), null);
+        assertCharset(throwAfter("<!doctype html><html attr=x"), null);
+        assertCharset(throwAfter("<!doctype html><html attr='x"), null);
+    }
+
+    private void assertWindows1252(String html) throws IOException { // shorthand: expect windows-1252 for this snippet
+        assertCharset(html, Charset.forName("WINDOWS-1252"));
+    }
+
+    private void assertWindows1252(InputStream inStream) throws IOException { // shorthand: expect windows-1252 for this stream
+        assertCharset(inStream, Charset.forName("WINDOWS-1252"));
+    }
+
+    private void assertCharset(String html, Charset charset) throws IOException { // charset == null means "nothing detected"
+        final Charset contentsCharset = (charset == null) ? StandardCharsets.UTF_8 : charset; // bytes are still needed: fall back to UTF-8
+        InputStream inStream = new ByteArrayInputStream(html.getBytes(contentsCharset)); // snippet encoded with the expected charset
+        final Charset detected = detectCharset(inStream);
+        assertEquals(html + " should be detected as " + charset, charset, detected);
+    }
+
+    private void assertCharset(InputStream inStream, Charset charset) throws IOException { // stream variant, no failure message
+        final Charset detected = detectCharset(inStream);
+        assertEquals(charset, detected);
+    }
+
+    private Charset detectCharset(InputStream inStream) throws IOException { // new detector per call, this test's metadata
+        return new StrictHtmlEncodingDetector().detect(inStream, metadata);
+    }
+
+    private InputStream throwAfter(String html) { // UTF-8 bytes of html, followed by a stream whose read() throws
+        byte[] contents = html.getBytes(StandardCharsets.UTF_8);
+        InputStream contentsInStream = new ByteArrayInputStream(contents);
+        InputStream errorThrowing = new InputStream() {
+            @Override
+            public int read() throws IOException {
+                throw new IOException("test exception");
+            }
+        };
+        return new SequenceInputStream(contentsInStream, errorThrowing); // reads contents first, then hits errorThrowing
+    }
+}

[tika] 01/02: improve htmlparser

Posted by ta...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 66417f619ba430f8c9e8ab0d903ebfecf936b071
Author: TALLISON <ta...@apache.org>
AuthorDate: Fri Jul 6 11:09:27 2018 -0400

    improve htmlparser
---
 .../java/org/apache/tika/io/TikaInputStream.java   | 11 +++++++++-
 .../org/apache/tika/parser/html/HtmlParser.java    | 25 ++++++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
index 785acc7..96f922f 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
@@ -502,6 +502,8 @@ public class TikaInputStream extends TaggedInputStream {
      */
     private Object openContainer;
 
+    private int consecutiveEOFs = 0; // -1 results counted by afterRead(), which throws once this exceeds 1000
+
     /**
      * Creates a TikaInputStream instance. This private constructor is used
      * by the static factory methods based on the available information.
@@ -718,6 +720,7 @@ public class TikaInputStream extends TaggedInputStream {
         super.reset();
         position = mark;
         mark = -1;
+        consecutiveEOFs = 0;
     }
 
     @Override
@@ -735,9 +738,15 @@ public class TikaInputStream extends TaggedInputStream {
     }
 
     @Override
-    protected void afterRead(int n) {
+    protected void afterRead(int n) throws IOException { // n == -1 is treated as an EOF result, anything else advances position
         if (n != -1) {
             position += n;
+        } else {
+            consecutiveEOFs++; // NOTE(review): a non-EOF read does not clear this, despite the "consecutive" name
+            if (consecutiveEOFs > 1000) { // a caller that keeps reading after this many EOFs is presumably looping
+                throw new IOException("Read too many -1 (EOFs); there could be an infinite loop. " +
+                        "If you think your file is not corrupt, please open an issue on Tika's JIRA");
+            }
         }
     }
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
index a1ef0da..adf591a 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
@@ -29,6 +29,8 @@ import org.apache.tika.config.Field;
 import org.apache.tika.detect.AutoDetectReader;
 import org.apache.tika.detect.EncodingDetector;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractEncodingDetectorParser;
@@ -40,6 +42,8 @@ import org.slf4j.LoggerFactory;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
+import javax.swing.text.AbstractDocument;
+
 /**
  * HTML parser. Uses TagSoup to turn the input document to HTML SAX events,
  * and post-processes the events to produce XHTML and metadata expected by
@@ -90,6 +94,27 @@ public class HtmlParser extends AbstractEncodingDetectorParser {
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
 
+        TemporaryResources tmp = null;
+        try {
+            if (!TikaInputStream.isTikaInputStream(stream)) {
+                tmp = new TemporaryResources();
+                stream = TikaInputStream.get(stream, tmp);
+            }
+            //AutoDetectReader can throw exceptions during
+            //initialization.  If we just created a
+            //TemporaryResources, we need to make sure to close it.
+            parseImpl(stream, handler, metadata, context);
+        } finally {
+            if (tmp != null) {
+                tmp.close();
+            }
+        }
+
+    }
+
+
+    private void parseImpl(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
         // Automatically detect the character encoding
         try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream),
                 metadata, getEncodingDetector(context))) {