You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/07/06 15:53:34 UTC
[tika] branch master updated (c9a81a4 -> 790c124)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git.
from c9a81a4 TIKA-2675 -- OpenDocumentParser should fail on invalid zip via Sebastian Nagel and PR-240.
new 66417f6 improve htmlparser
new 790c124 TIKA-2673 -- add StrictHtmlEncodingDetector, contributed by Gerard Bouchar
The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails. The revisions
listed as "add" were already present in the repository and have only
been added to this reference.
Summary of changes:
.../java/org/apache/tika/io/TikaInputStream.java | 11 +-
.../org/apache/tika/parser/html/HtmlParser.java | 25 ++
.../parser/html/StrictHtmlEncodingDetector.java | 491 +++++++++++++++++++++
.../tika/parser/html/whatwg-encoding-labels.tsv | 234 ++++++++++
.../html/StrictHtmlEncodingDetectorTest.java | 300 +++++++++++++
5 files changed, 1060 insertions(+), 1 deletion(-)
create mode 100644 tika-parsers/src/main/java/org/apache/tika/parser/html/StrictHtmlEncodingDetector.java
create mode 100644 tika-parsers/src/main/resources/org/apache/tika/parser/html/whatwg-encoding-labels.tsv
create mode 100644 tika-parsers/src/test/java/org/apache/tika/parser/html/StrictHtmlEncodingDetectorTest.java
[tika] 02/02: TIKA-2673 -- add StrictHtmlEncodingDetector,
contributed by Gerard Bouchar
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 790c1248207371e6cb2a3e7a1ec3a021503ec7a4
Author: TALLISON <ta...@apache.org>
AuthorDate: Fri Jul 6 11:53:24 2018 -0400
TIKA-2673 -- add StrictHtmlEncodingDetector, contributed by Gerard Bouchar
---
.../parser/html/StrictHtmlEncodingDetector.java | 491 +++++++++++++++++++++
.../tika/parser/html/whatwg-encoding-labels.tsv | 234 ++++++++++
.../html/StrictHtmlEncodingDetectorTest.java | 300 +++++++++++++
3 files changed, 1025 insertions(+)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/StrictHtmlEncodingDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/StrictHtmlEncodingDetector.java
new file mode 100644
index 0000000..487f747
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/StrictHtmlEncodingDetector.java
@@ -0,0 +1,491 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.metadata.Metadata;
+
+import java.io.*;
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.StandardCharsets;
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import static java.nio.charset.StandardCharsets.*;
+import static org.apache.tika.parser.html.StrictHtmlEncodingDetector.SequenceMatcher.caseInsensitive;
+import static org.apache.tika.parser.html.StrictHtmlEncodingDetector.SingleByteMatcher.matchers;
+
+/**
+ * This is a strict html encoding detector that enforces the standard
+ * far more strictly than the HtmlEncodingDetector.
+ */
+public class StrictHtmlEncodingDetector implements EncodingDetector {
+ private static final String CHARSET_LABEL_FILE = "whatwg-encoding-labels.tsv";
+ private static Map<String, Charset> CHARSET_LABELS = getCharsetLabels();
+
+ /**
+  * Loads the label-to-charset table from the {@code whatwg-encoding-labels.tsv}
+  * resource stored next to this class.
+  * <p>
+  * Lines starting with {@code #} and lines with fewer than two tab-separated
+  * columns are skipped. The first column is the map key; the remaining columns
+  * are resolved to a charset by {@link #charsetFromStandard(String[])}.
+  *
+  * @return a map from label to charset
+  * @throws NullPointerException if the resource cannot be found
+  * @throws IllegalStateException if the same label appears twice (raised by {@code Collectors.toMap})
+  * @throws UncheckedIOException if the resource cannot be read or closed
+  */
+ private static Map<String, Charset> getCharsetLabels() {
+     Class<?> anchor = StrictHtmlEncodingDetector.class;
+     String packageDir = anchor.getPackage().getName().replace('.', '/');
+     String resource = "/" + packageDir + "/" + CHARSET_LABEL_FILE;
+     InputStream labels = Objects.requireNonNull(
+             anchor.getResourceAsStream(resource),
+             "Missing charset label mapping file : " + resource);
+     try (BufferedReader reader = new BufferedReader(new InputStreamReader(labels, StandardCharsets.US_ASCII))) {
+         return reader.lines()
+                 .filter(line -> !line.startsWith("#"))
+                 .map(line -> line.split("\t"))
+                 .filter(columns -> columns.length >= 2)
+                 .collect(Collectors.toMap(
+                         columns -> columns[0],
+                         columns -> charsetFromStandard(columns)
+                 ));
+     } catch (IOException e) {
+         throw new UncheckedIOException("Unable to read the charset label mapping", e);
+     }
+ }
+
+ /**
+  * Resolves one row of the label table to a charset available on this JVM.
+  *
+  * @param names the tab-separated columns of a row: index 0 is the label, index 1 the
+  *              encoding name, and any further columns are fallback names tried in order
+  * @return the first candidate that {@link Charset#forName(String)} can resolve,
+  *         or ISO-8859-1 when none of them can
+  */
+ private static Charset charsetFromStandard(String[] names) {
+     for (int i = 1; i < names.length; i++) {
+         try {
+             // Use the i-th candidate: always asking for names[1] would never reach the fallbacks.
+             return Charset.forName(names[i]);
+         } catch (IllegalCharsetNameException | UnsupportedCharsetException ignored) {
+             // This candidate is not usable on this JVM; try the next column.
+         }
+     }
+     // ISO-8859-1 is the only extended single-byte charset that every Java platform must provide.
+     return StandardCharsets.ISO_8859_1;
+ }
+
+ /**
+  * Looks up a charset by label, ignoring surrounding whitespace and letter case.
+  *
+  * @param label the label to look up, may be {@code null}
+  * @return the mapped charset, or {@code null} if the label is {@code null} or not in the table
+  */
+ private static Charset getCharsetByLabel(String label) {
+     if (label == null) {
+         return null;
+     }
+     String normalized = label.trim().toLowerCase(Locale.US);
+     return CHARSET_LABELS.get(normalized);
+ }
+
+ /**
+  * Determines the charset of an HTML stream, trying in order:
+  * a byte order mark, the {@code Content-Encoding} metadata entry, and finally
+  * a scan of the document for a {@code <meta>} charset declaration.
+  * <p>
+  * NOTE(review): the bytes read from {@code input} are consumed and not restored
+  * here; confirm whether callers expect the stream to be marked and reset.
+  *
+  * @param input    the document bytes
+  * @param metadata metadata that may carry a charset supplied by the user or the transport layer
+  * @return the detected charset, or {@code null} if none could be determined
+  * @throws IOException declared by the interface
+  */
+ @Override
+ public Charset detect(InputStream input, Metadata metadata) throws IOException {
+     PreScanner scanner = new PreScanner(input);
+
+     // A BOM at the very beginning wins over every other source of information.
+     Charset fromBom = scanner.detectBOM();
+     if (fromBom != null) {
+         return fromBom;
+     }
+
+     // A charset given by the end user or the transport level is expected to be in the metadata.
+     Charset declared = getCharsetByLabel(metadata.get(Metadata.CONTENT_ENCODING));
+     if (declared != null) {
+         return declared;
+     }
+
+     return scanner.scan();
+ }
+
+ /**
+  * Byte-level scanner that looks for a byte order mark or a {@code <meta>} charset
+  * declaration in an HTML stream. All reads go through a {@link PushbackInputStream}
+  * so that matchers can look ahead and push bytes back.
+  */
+ static class PreScanner {
+
+ // Finds "charset = value" in a content attribute; the value may be wrapped in matching quotes
+ private static final Pattern META_CHARSET_PATTERN = Pattern.compile("charset\\s*=\\s*([\"']?)([^\"'\\s;]+)\\1");
+ private static ByteMatcher COMMENT_START = new SequenceMatcher("<!--");
+ private static ByteMatcher COMMENT_END = new SequenceMatcher("-->");
+ private static ByteMatcher LETTER = new OrMatcher(
+ new RangeMatcher((byte) 'a', (byte) 'z'),
+ new RangeMatcher((byte) 'A', (byte) 'Z')
+ );
+ // Tab, line feed, form feed, carriage return and space
+ private static ByteMatcher SPACE = new OrMatcher(matchers(0x09, 0x0A, 0x0C, 0x0D, 0x20));
+ private static ByteMatcher SLASH = new SingleByteMatcher((byte) '/');
+ private static ByteMatcher EQUAL = new SingleByteMatcher((byte) '=');
+ private static ByteMatcher TAG_END = new SingleByteMatcher((byte) '>');
+ private static ByteMatcher SINGLE_QUOTE = new SingleByteMatcher((byte) '\'');
+ private static ByteMatcher DOUBLE_QUOTE = new SingleByteMatcher((byte) '"');
+ private static ByteMatcher QUOTE = new OrMatcher(SINGLE_QUOTE, DOUBLE_QUOTE);
+ private static ByteMatcher TAG_END_OR_SLASH = new OrMatcher(SLASH, TAG_END);
+ private static ByteMatcher SPACE_OR_SLASH = new OrMatcher(SPACE, SLASH);
+ private static ByteMatcher SPACE_OR_TAG_END = new OrMatcher(SPACE, TAG_END);
+ // "<meta" in any letter case, followed by a space or a '/'
+ private static ByteMatcher META_START = new SequenceMatcher(caseInsensitive("<meta"), SPACE_OR_SLASH);
+ // '<' followed by '/' or an ASCII letter
+ private static ByteMatcher TAG_START = new SequenceMatcher(
+ new SingleByteMatcher((byte) '<'),
+ new OrMatcher(SLASH, LETTER)
+ );
+ // Any single byte that is neither a space nor '>'
+ private static ByteMatcher TAG_BODY = new NegativeMatcher(new OrMatcher(SPACE, TAG_END));
+ // '<' followed by '!', '/' or '?'
+ private static ByteMatcher SPECIAL_TAG_START = new SequenceMatcher(
+ new SingleByteMatcher((byte) '<'),
+ new OrMatcher(matchers("!/?"))
+ );
+ private static ByteMatcher UTF8_BOM = new SequenceMatcher(matchers(0xEF, 0xBB, 0xBF));
+ private static ByteMatcher UTF16_BE_BOM = new SequenceMatcher(matchers(0xFE, 0xFF));
+ private static ByteMatcher UTF16_LE_BOM = new SequenceMatcher(matchers(0xFF, 0xFE));
+
+
+ PushbackInputStream stream;
+ // Result of the most recent successful <meta> declaration; starts as "not found"
+ private CharsetDetectionResult detectedCharset = new CharsetDetectionResult();
+
+ /**
+  * Wraps the given stream in a pushback stream with a 32-byte pushback buffer.
+  */
+ public PreScanner(InputStream inputStream) {
+ this.stream = new PushbackInputStream(inputStream, 32);
+ }
+
+ /**
+  * Consumes the stream until a charset declaration is found or nothing more can be processed.
+  *
+  * @return the declared charset, or null if no declaration was found or the
+  *         declared label is not a known charset
+  */
+ public Charset scan() {
+ while (processAtLeastOneByte()) {
+ if (detectedCharset.isFound()) {
+ return detectedCharset.getCharset();
+ }
+ }
+ return null;
+ }
+
+ /**
+  * Checks for a UTF-8, UTF-16BE or UTF-16LE byte order mark at the current position.
+  * A matched BOM is consumed from the stream.
+  *
+  * @return the charset indicated by the BOM, or null if there is none or the stream cannot be read
+  */
+ private Charset detectBOM() {
+ try {
+ if (UTF8_BOM.matches(stream)) return StandardCharsets.UTF_8;
+ else if (UTF16_BE_BOM.matches(stream)) return StandardCharsets.UTF_16BE;
+ else if (UTF16_LE_BOM.matches(stream)) return StandardCharsets.UTF_16LE;
+ } catch (IOException e) { /* stream could not be read, also return null */ }
+ return null;
+ }
+
+ /**
+  * Tries each handler in turn (comment, meta, tag, special tag, any byte) and stops at
+  * the first one that succeeds.
+  *
+  * @return false when no handler succeeded or when reading the stream failed
+  */
+ private boolean processAtLeastOneByte() {
+ try {
+ return processComment() ||
+ processMeta() ||
+ processTag() ||
+ processSpecialTag() ||
+ processAny();
+ } catch (IOException e) {
+ return false;
+ }
+ }
+
+ /**
+  * Consumes a single byte, whatever it is.
+  *
+  * @return false at the end of the stream
+  */
+ private boolean processAny() throws IOException {
+ int read = stream.read();
+ return read != -1;
+ }
+
+ /**
+  * Tells whether at least one more byte is available, without consuming it.
+  */
+ private boolean hasBytes() throws IOException {
+ int read = stream.read();
+ if (read != -1) stream.unread(read);
+ return read != -1;
+ }
+
+ /**
+  * Skips a comment if one starts at the current position.
+  *
+  * @return true if a comment start was found and its end was reached
+  */
+ private boolean processComment() throws IOException {
+ if (COMMENT_START.matches(stream)) {
+ // The two '-' in the '-->' sequence can be the same as those in the '<!--' sequence.
+ stream.unread("--".getBytes(StandardCharsets.US_ASCII));
+ return COMMENT_END.advanceUntilMatches(stream);
+ }
+ return false;
+ }
+
+ /**
+  * Skips a tag name and its attributes if a tag starts at the current position.
+  * The attributes are parsed so that their content cannot be mistaken for markup.
+  *
+  * @return true if a tag start was found
+  */
+ private boolean processTag() throws IOException {
+ if (TAG_START.matches(stream)) {
+ TAG_BODY.skipAll(stream);
+ while (getAttribute() != null) {/*ignore the attribute*/}
+ return true;
+ }
+ return false;
+ }
+
+ /**
+  * Skips everything up to and including the next '>' if the current position
+  * starts with "<!", "</" or "<?".
+  *
+  * @return true if such a start was found and a '>' was reached
+  */
+ private boolean processSpecialTag() throws IOException {
+ if (SPECIAL_TAG_START.matches(stream)) {
+ TAG_BODY.skipAll(stream);
+ return TAG_END.advanceUntilMatches(stream);
+ }
+ return false;
+ }
+
+ /**
+  * Examines a meta tag if one starts at the current position and records its charset
+  * declaration in detectedCharset when it is usable.
+  * A declaration is usable when it comes from a charset attribute, or from a content
+  * attribute accompanied by http-equiv="content-type".
+  * Bytes consumed while examining the tag are not restored, even when false is returned.
+  *
+  * @return true if a usable declaration was recorded
+  */
+ private boolean processMeta() throws IOException {
+ if (META_START.matches(stream)) {
+ Set<String> attributeNames = new HashSet<>();
+ boolean gotPragma = false;
+ // null: no charset candidate yet; true: the candidate came from a content attribute
+ Boolean needPragma = null;
+ CharsetDetectionResult charset = new CharsetDetectionResult();
+ while (hasBytes()) {
+ Attribute attribute = getAttribute();
+ if (attribute == null) break;
+ // Only the first occurrence of an attribute name is taken into account
+ if (attributeNames.contains(attribute.getName())) continue;
+ attributeNames.add(attribute.getName());
+ switch (attribute.getName()) {
+ case "http-equiv":
+ if (attribute.getValue().equals("content-type"))
+ gotPragma = true;
+ break;
+ case "content":
+ String charsetName = getEncodingFromMeta(attribute.getValue());
+ if (!charset.isFound() && charsetName != null) {
+ charset.find(charsetName);
+ needPragma = true;
+ }
+ break;
+ case "charset":
+ charset.find(attribute.getValue());
+ needPragma = false;
+ break;
+ default: // Ignore non-charset related attributes
+ }
+ }
+ // Reject the tag if there is no candidate, or if a pragma is needed but missing
+ if (needPragma != null && !(needPragma && !gotPragma)) {
+ detectedCharset = charset;
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+  * Extracts the charset name from the value of a content attribute.
+  *
+  * @return the first charset name found in the value, or null if there is none
+  */
+ private String getEncodingFromMeta(String attributeValue) {
+ Matcher matcher = META_CHARSET_PATTERN.matcher(attributeValue);
+ if (!matcher.find()) return null;
+ return matcher.group(2);
+ }
+
+ /**
+  * Reads the next attribute of the tag at the current position.
+  * ASCII upper-case letters in the name and the value are lower-cased.
+  *
+  * @return the attribute (with an empty value if no '=' follows the name),
+  *         or null when the next byte after spaces and slashes is '>'
+  * @throws IOException if the stream ends while a name or a value is being read
+  */
+ private Attribute getAttribute() throws IOException {
+ SPACE_OR_SLASH.skipAll(stream);
+ if (TAG_END.peekMatches(stream)) return null;
+ StringBuilder name = new StringBuilder();
+ // An '=' only ends the name if the name is not empty
+ while (!EQUAL.peekMatches(stream) || name.length() == 0) {
+ if (TAG_END_OR_SLASH.peekMatches(stream)) {
+ break;
+ } else if (SPACE.peekMatches(stream)) {
+ SPACE.skipAll(stream);
+ break;
+ } else {
+ name.append(getLowerCaseChar());
+ }
+ }
+
+ if (!EQUAL.matches(stream)) return new Attribute(name.toString(), "");
+ SPACE.skipAll(stream);
+
+ StringBuilder value = new StringBuilder();
+ byte[] quoteMatched = QUOTE.match(stream);
+ if (quoteMatched != null) {
+ // Quoted value: read up to the matching closing quote, which is consumed but not kept
+ char quote = (char) quoteMatched[0];
+ int nextChar = -1;
+ while (nextChar != quote) {
+ if (nextChar != -1) value.append((char) nextChar);
+ nextChar = getLowerCaseChar();
+ }
+ } else {
+ // Unquoted value: read up to the next space or '>', which is left in the stream
+ while (!SPACE_OR_TAG_END.peekMatches(stream)) {
+ value.append(getLowerCaseChar());
+ }
+ }
+ return new Attribute(name.toString(), value.toString());
+ }
+
+ /**
+  * Reads one byte and returns it as a char, with 'A'-'Z' converted to lower case.
+  *
+  * @throws IOException if the end of the stream has been reached
+  */
+ private char getLowerCaseChar() throws IOException {
+ int nextPoint = stream.read();
+ if (nextPoint == -1) throw new IOException();
+ if (nextPoint >= 'A' && nextPoint <= 'Z') nextPoint += 0x20; // lowercase
+ return (char) nextPoint;
+ }
+ }
+
+ /**
+  * Name and value of one attribute read from a tag.
+  */
+ static class Attribute {
+     String name;
+     String value;
+
+     /**
+      * @param name  the attribute name
+      * @param value the attribute value
+      */
+     public Attribute(String name, String value) {
+         this.value = value;
+         this.name = name;
+     }
+
+     /** @return the attribute name given at construction */
+     public String getName() {
+         return this.name;
+     }
+
+     /** @return the attribute value given at construction */
+     public String getValue() {
+         return this.value;
+     }
+ }
+
+ /**
+ * A detection may either not find a charset, find an invalid charset, or find a valid charset
+ */
+ static class CharsetDetectionResult {
+ private boolean found = false;
+ private Charset charset = null;
+
+ public CharsetDetectionResult() { /* default result: not found */}
+
+ public boolean isFound() {
+ return found;
+ }
+
+ public void find(String charsetName) {
+ this.found = true;
+ charsetName = charsetName.trim();
+ if ("x-user-defined".equals(charsetName)) charsetName = "windows-1252";
+ this.charset = getCharsetByLabel(charsetName);
+ // The specification states: If charset is a UTF-16 encoding, then set charset to UTF-8.
+ if (UTF_16LE.equals(charset) || UTF_16BE.equals(charset)) charset = UTF_8;
+ }
+
+ public Charset getCharset() {
+ // the result may be null even if found is true, in the case there is a charset specified,
+ // but it is invalid
+ return charset;
+ }
+ }
+
+ /**
+  * Base class of the matchers that recognise bytes at the current position of a
+  * {@link PushbackInputStream}.
+  */
+ static abstract class ByteMatcher {
+
+     /**
+      * Tries to match at the current position.
+      *
+      * @return the bytes that were matched, or null if there was no match
+      */
+     abstract byte[] match(PushbackInputStream pushbackInputStream) throws IOException;
+
+     /**
+      * Same as {@link #match(PushbackInputStream)}, reporting only whether there was a match.
+      */
+     boolean matches(PushbackInputStream pushbackInputStream) throws IOException {
+         byte[] consumed = match(pushbackInputStream);
+         return consumed != null;
+     }
+
+     /**
+      * Drops bytes one at a time until this matcher matches.
+      *
+      * @return true once a match happened, false if the stream ended first
+      */
+     boolean advanceUntilMatches(PushbackInputStream pushbackInputStream) throws IOException {
+         while (true) {
+             if (matches(pushbackInputStream)) {
+                 return true;
+             }
+             if (pushbackInputStream.read() == -1) {
+                 return false;
+             }
+         }
+     }
+
+     /**
+      * Applies this matcher repeatedly until it stops matching.
+      */
+     void skipAll(PushbackInputStream pushbackInputStream) throws IOException {
+         boolean matchedAgain;
+         do {
+             matchedAgain = matches(pushbackInputStream);
+         } while (matchedAgain);
+     }
+
+     /**
+      * Tests for a match and pushes the matched bytes back, so the position is unchanged.
+      */
+     public boolean peekMatches(PushbackInputStream pushbackInputStream) throws IOException {
+         byte[] consumed = match(pushbackInputStream);
+         if (consumed == null) {
+             return false;
+         }
+         pushbackInputStream.unread(consumed);
+         return true;
+     }
+ }
+
+ /**
+  * Matches exactly one given byte value.
+  */
+ static class SingleByteMatcher extends ByteMatcher {
+     private byte b;
+
+     public SingleByteMatcher(byte b) {
+         this.b = b;
+     }
+
+     /**
+      * Builds one matcher per character of the string; each character is narrowed to a byte.
+      */
+     public static ByteMatcher[] matchers(String s) {
+         return matchers(s.chars());
+     }
+
+     /**
+      * Builds one matcher per given value; each value is narrowed to a byte.
+      */
+     public static ByteMatcher[] matchers(int... bytes) {
+         return matchers(IntStream.of(bytes));
+     }
+
+     /**
+      * Builds one matcher per element of the stream; each element is narrowed to a byte.
+      */
+     public static ByteMatcher[] matchers(IntStream byteStream) {
+         return byteStream
+                 .mapToObj(i -> new SingleByteMatcher((byte) i))
+                 .toArray(ByteMatcher[]::new);
+     }
+
+     /**
+      * Consumes the next byte if it equals the expected one.
+      *
+      * @return a one-element array holding the byte, or null if the next byte differs
+      *         or the stream has ended
+      */
+     @Override
+     byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
+         int read = pushbackInputStream.read();
+         // End of stream must be tested before narrowing: (byte) -1 equals (byte) 0xFF,
+         // so an expected 0xFF byte would otherwise "match" where there is no byte at all.
+         if (read == -1) {
+             return null;
+         }
+         if ((byte) read == b) {
+             return new byte[]{b};
+         }
+         pushbackInputStream.unread(read);
+         return null;
+     }
+ }
+
+ /**
+  * Matches when all of its matchers match one after the other.
+  */
+ static class SequenceMatcher extends ByteMatcher {
+     private ByteMatcher[] matchers;
+
+     public SequenceMatcher(ByteMatcher... matchers) {
+         this.matchers = matchers;
+     }
+
+     /**
+      * Matches the given string, one byte per character.
+      */
+     public SequenceMatcher(String s) {
+         this(SingleByteMatcher.matchers(s));
+     }
+
+     /**
+      * Builds a matcher that accepts, at each position, either the lower-case or the
+      * upper-case form (in the US locale) of the corresponding character of the string.
+      */
+     public static SequenceMatcher caseInsensitive(String s) {
+         ByteMatcher[] lower = SingleByteMatcher.matchers(s.toLowerCase(Locale.US));
+         ByteMatcher[] upper = SingleByteMatcher.matchers(s.toUpperCase(Locale.US));
+         int length = Math.min(lower.length, upper.length);
+         OrMatcher[] eitherCase = new OrMatcher[length];
+         for (int i = 0; i < length; i++) {
+             eitherCase[i] = new OrMatcher(lower[i], upper[i]);
+         }
+         return new SequenceMatcher(eitherCase);
+     }
+
+     /**
+      * Applies the matchers in order. If one of them fails, everything matched so far
+      * is pushed back and null is returned.
+      *
+      * @return all matched bytes in order, or null if the sequence did not match
+      */
+     @Override
+     byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
+         ByteArrayOutputStream consumed = new ByteArrayOutputStream();
+         for (ByteMatcher step : matchers) {
+             byte[] part = step.match(pushbackInputStream);
+             if (part == null) {
+                 pushbackInputStream.unread(consumed.toByteArray());
+                 return null;
+             }
+             consumed.write(part);
+         }
+         return consumed.toByteArray();
+     }
+ }
+
+ /**
+  * Matches when any one of its matchers matches; the matchers are tried in the given order.
+  */
+ static class OrMatcher extends ByteMatcher {
+     private ByteMatcher[] matchers;
+
+     public OrMatcher(ByteMatcher... matchers) {
+         this.matchers = matchers;
+     }
+
+     /**
+      * @return the bytes matched by the first matcher that succeeds, or null if none does
+      */
+     @Override
+     byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
+         for (int i = 0; i < matchers.length; i++) {
+             byte[] alternative = matchers[i].match(pushbackInputStream);
+             if (alternative != null) {
+                 return alternative;
+             }
+         }
+         return null;
+     }
+ }
+
+ /**
+  * Matches a single byte at positions where the wrapped matcher does not match.
+  */
+ static class NegativeMatcher extends ByteMatcher {
+     private ByteMatcher matcher;
+
+     public NegativeMatcher(ByteMatcher matcher) {
+         this.matcher = matcher;
+     }
+
+     /**
+      * @return a one-element array with the next byte if the wrapped matcher fails here;
+      *         null if the wrapped matcher succeeds (its bytes are pushed back) or the stream has ended
+      */
+     @Override
+     byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
+         byte[] rejected = matcher.match(pushbackInputStream);
+         if (rejected != null) {
+             pushbackInputStream.unread(rejected);
+             return null;
+         }
+         int next = pushbackInputStream.read();
+         return next == -1 ? null : new byte[]{(byte) next};
+     }
+ }
+
+ /**
+  * Matches one byte whose unsigned value lies within an inclusive range.
+  */
+ static class RangeMatcher extends ByteMatcher {
+     private byte low;
+     private byte high;
+
+     /**
+      * @param low  lowest accepted byte
+      * @param high highest accepted byte
+      */
+     public RangeMatcher(byte low, byte high) {
+         this.low = low;
+         this.high = high;
+     }
+
+     /**
+      * Consumes the next byte if it lies within the range.
+      *
+      * @return a one-element array holding the byte, or null if it is out of range
+      *         or the stream has ended
+      */
+     @Override
+     byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
+         int read = pushbackInputStream.read();
+         if (read == -1) {
+             return null;
+         }
+         // read() yields 0..255, so the signed bounds are widened to the same unsigned scale;
+         // comparing them as signed bytes would break any range that reaches 0x80 or above.
+         if (read >= (low & 0xFF) && read <= (high & 0xFF)) {
+             return new byte[]{(byte) read};
+         }
+         pushbackInputStream.unread(read);
+         return null;
+     }
+ }
+}
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/html/whatwg-encoding-labels.tsv b/tika-parsers/src/main/resources/org/apache/tika/parser/html/whatwg-encoding-labels.tsv
new file mode 100644
index 0000000..92ddecb
--- /dev/null
+++ b/tika-parsers/src/main/resources/org/apache/tika/parser/html/whatwg-encoding-labels.tsv
@@ -0,0 +1,234 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+unicode-1-1-utf-8	UTF-8
+#
+# label encoding fallback
+utf-8 UTF-8
+utf8 UTF-8
+866 IBM866
+cp866 IBM866
+csibm866 IBM866
+ibm866 IBM866
+csisolatin2 ISO-8859-2
+iso-8859-2 ISO-8859-2
+iso-ir-101 ISO-8859-2
+iso8859-2 ISO-8859-2
+iso88592 ISO-8859-2
+iso_8859-2 ISO-8859-2
+iso_8859-2:1987 ISO-8859-2
+l2 ISO-8859-2
+latin2 ISO-8859-2
+csisolatin3 ISO-8859-3
+iso-8859-3 ISO-8859-3
+iso-ir-109 ISO-8859-3
+iso8859-3 ISO-8859-3
+iso88593 ISO-8859-3
+iso_8859-3 ISO-8859-3
+iso_8859-3:1988 ISO-8859-3
+l3 ISO-8859-3
+latin3 ISO-8859-3
+csisolatin4 ISO-8859-4
+iso-8859-4 ISO-8859-4
+iso-ir-110 ISO-8859-4
+iso8859-4 ISO-8859-4
+iso88594 ISO-8859-4
+iso_8859-4 ISO-8859-4
+iso_8859-4:1988 ISO-8859-4
+l4 ISO-8859-4
+latin4 ISO-8859-4
+csisolatincyrillic ISO-8859-5
+cyrillic ISO-8859-5
+iso-8859-5 ISO-8859-5
+iso-ir-144 ISO-8859-5
+iso8859-5 ISO-8859-5
+iso88595 ISO-8859-5
+iso_8859-5 ISO-8859-5
+iso_8859-5:1988 ISO-8859-5
+arabic ISO-8859-6
+asmo-708 ISO-8859-6
+csiso88596e ISO-8859-6
+csiso88596i ISO-8859-6
+csisolatinarabic ISO-8859-6
+ecma-114 ISO-8859-6
+iso-8859-6 ISO-8859-6
+iso-8859-6-e ISO-8859-6
+iso-8859-6-i ISO-8859-6
+iso-ir-127 ISO-8859-6
+iso8859-6 ISO-8859-6
+iso88596 ISO-8859-6
+iso_8859-6 ISO-8859-6
+iso_8859-6:1987 ISO-8859-6
+csisolatingreek ISO-8859-7
+ecma-118 ISO-8859-7
+elot_928 ISO-8859-7
+greek ISO-8859-7
+greek8 ISO-8859-7
+iso-8859-7 ISO-8859-7
+iso-ir-126 ISO-8859-7
+iso8859-7 ISO-8859-7
+iso88597 ISO-8859-7
+iso_8859-7 ISO-8859-7
+iso_8859-7:1987 ISO-8859-7
+sun_eu_greek ISO-8859-7
+csiso88598e ISO-8859-8
+csisolatinhebrew ISO-8859-8
+hebrew ISO-8859-8
+iso-8859-8 ISO-8859-8
+iso-8859-8-e ISO-8859-8
+iso-ir-138 ISO-8859-8
+iso8859-8 ISO-8859-8
+iso88598 ISO-8859-8
+iso_8859-8 ISO-8859-8
+iso_8859-8:1988 ISO-8859-8
+visual ISO-8859-8
+csiso88598i ISO-8859-8-I ISO-8859-8
+iso-8859-8-i ISO-8859-8-I ISO-8859-8
+logical ISO-8859-8-I ISO-8859-8
+csisolatin6 ISO-8859-10 ISO-8859-4
+iso-8859-10 ISO-8859-10 ISO-8859-4
+iso-ir-157 ISO-8859-10 ISO-8859-4
+iso8859-10 ISO-8859-10 ISO-8859-4
+iso885910 ISO-8859-10 ISO-8859-4
+l6 ISO-8859-10 ISO-8859-4
+latin6 ISO-8859-10 ISO-8859-4
+iso-8859-13 ISO-8859-13
+iso8859-13 ISO-8859-13
+iso885913 ISO-8859-13
+iso-8859-14 ISO-8859-14 ISO-8859-1
+iso8859-14 ISO-8859-14 ISO-8859-1
+iso885914 ISO-8859-14 ISO-8859-1
+csisolatin9 ISO-8859-15
+iso-8859-15 ISO-8859-15
+iso8859-15 ISO-8859-15
+iso885915 ISO-8859-15
+iso_8859-15 ISO-8859-15
+l9 ISO-8859-15
+iso-8859-16 ISO-8859-16 ISO-8859-1
+cskoi8r KOI8-R
+koi KOI8-R
+koi8 KOI8-R
+koi8-r KOI8-R
+koi8_r KOI8-R
+koi8-ru KOI8-U
+koi8-u KOI8-U
+csmacintosh x-MacRoman
+mac x-MacRoman
+macintosh x-MacRoman
+x-mac-roman x-MacRoman
+dos-874 windows-874
+iso-8859-11 windows-874
+iso8859-11 windows-874
+iso885911 windows-874
+tis-620 windows-874
+windows-874 windows-874
+cp1250 windows-1250
+windows-1250 windows-1250
+x-cp1250 windows-1250
+cp1251 windows-1251
+windows-1251 windows-1251
+x-cp1251 windows-1251
+ansi_x3.4-1968 windows-1252
+ascii windows-1252
+cp1252 windows-1252
+cp819 windows-1252
+csisolatin1 windows-1252
+ibm819 windows-1252
+iso-8859-1 windows-1252
+iso-ir-100 windows-1252
+iso8859-1 windows-1252
+iso88591 windows-1252
+iso_8859-1 windows-1252
+iso_8859-1:1987 windows-1252
+l1 windows-1252
+latin1 windows-1252
+us-ascii windows-1252
+windows-1252 windows-1252
+x-cp1252 windows-1252
+cp1253 windows-1253
+windows-1253 windows-1253
+x-cp1253 windows-1253
+cp1254 windows-1254
+csisolatin5 windows-1254
+iso-8859-9 windows-1254
+iso-ir-148 windows-1254
+iso8859-9 windows-1254
+iso88599 windows-1254
+iso_8859-9 windows-1254
+iso_8859-9:1989 windows-1254
+l5 windows-1254
+latin5 windows-1254
+windows-1254 windows-1254
+x-cp1254 windows-1254
+cp1255 windows-1255
+windows-1255 windows-1255
+x-cp1255 windows-1255
+cp1256 windows-1256
+windows-1256 windows-1256
+x-cp1256 windows-1256
+cp1257 windows-1257
+windows-1257 windows-1257
+x-cp1257 windows-1257
+cp1258 windows-1258
+windows-1258 windows-1258
+x-cp1258 windows-1258
+x-mac-cyrillic x-MacCyrillic
+x-mac-ukrainian x-MacCyrillic
+chinese GBK
+csgb2312 GBK
+csiso58gb231280 GBK
+gb2312 GBK
+gb_2312 GBK
+gb_2312-80 GBK
+gbk GBK
+iso-ir-58 GBK
+x-gbk GBK
+gb18030 gb18030
+big5 Big5
+big5-hkscs Big5
+cn-big5 Big5
+csbig5 Big5
+x-x-big5 Big5
+cseucpkdfmtjapanese EUC-JP
+euc-jp EUC-JP
+x-euc-jp EUC-JP
+csiso2022jp ISO-2022-JP
+iso-2022-jp ISO-2022-JP
+csshiftjis Shift_JIS
+ms932 Shift_JIS
+ms_kanji Shift_JIS
+shift-jis Shift_JIS
+shift_jis Shift_JIS
+sjis Shift_JIS
+windows-31j Shift_JIS
+x-sjis Shift_JIS
+cseuckr EUC-KR
+csksc56011987 EUC-KR
+euc-kr EUC-KR
+iso-ir-149 EUC-KR
+korean EUC-KR
+ks_c_5601-1987 EUC-KR
+ks_c_5601-1989 EUC-KR
+ksc5601 EUC-KR
+ksc_5601 EUC-KR
+windows-949 EUC-KR
+csiso2022kr replacement
+hz-gb-2312 replacement
+iso-2022-cn replacement
+iso-2022-cn-ext replacement
+iso-2022-kr replacement
+replacement replacement
+utf-16be UTF-16BE
+utf-16 UTF-16LE
+utf-16le UTF-16LE
+x-user-defined x-user-defined
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/html/StrictHtmlEncodingDetectorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/html/StrictHtmlEncodingDetectorTest.java
new file mode 100644
index 0000000..1c0da8d
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/html/StrictHtmlEncodingDetectorTest.java
@@ -0,0 +1,300 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.html;
+
+
+import org.apache.tika.metadata.Metadata;
+import org.junit.Before;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import java.io.*;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+import static org.junit.Assert.assertEquals;
+
+public class StrictHtmlEncodingDetectorTest {
+ private Metadata metadata = new Metadata();
+
+ @Before
+ public void setUp() {
+ this.metadata = new Metadata();
+ }
+
+ @Test
+ public void basic() throws IOException {
+ assertWindows1252("<meta charset='WINDOWS-1252'>");
+ }
+
+ @Test
+ public void duplicateMeta() throws IOException {
+ assertWindows1252("<meta charset='WINDOWS-1252'>" +
+ "<meta charset='UTF-8'>");
+ }
+
+ @Test
+ public void httpEquiv() throws IOException {
+ assertWindows1252("<meta " +
+ "http-equiv='content-type' " +
+ "content='text/html; charset=\"WINDOWS-1252\"'>"); // quotes around the charset are allowed
+ assertWindows1252("<meta " +
+ "content=' charset = WINDOWS-1252' " + // The charset may be anywhere in the content attribute
+ "http-equiv='content-type' >");
+ }
+
+ @Test
+ public void httpEquivDuplicateCharset() throws IOException {
+ assertWindows1252("<meta " +
+ "http-equiv='content-type' " +
+ "content='charset=WINDOWS-1252;" + // The detection should stop after the semicolon
+ "charset=UTF-8'>");
+ }
+
+ @Test
+ public void htmlFragment() throws IOException {
+ assertWindows1252("<!doctype html><html class=nojs><head><meta charset='WINDOWS-1252'>");
+ }
+
+ @Test
+ public void verBadHtml() throws IOException {
+ // check that the parser is not confused by garbage before the declaration
+ assertWindows1252("<< l \" == / '=x\n >" +
+ "<!--> " +
+ "< <x'/ <=> " +
+ "<meta/>" +
+ "<a x/>" +
+ "<meta charset='WINDOWS-1252'>");
+ }
+
+ @Test
+ public void incompleteMeta() throws IOException {
+ assertWindows1252("<meta charset='WINDOWS-1252'"); // missing '>' at the end
+ }
+
+ @Test
+ public void charsetWithWhiteSpaces() throws IOException {
+ assertWindows1252("<meta charset=' \t\n WINDOWS-1252 \t\n'>");
+ }
+
+ @Test
+ public void mixedCase() throws IOException {
+ assertWindows1252("<mEtA chArsEt='WInDOWs-1252'>");
+ }
+
+ @Test
+ public void utf16() throws IOException {
+ // According to the specification 'If charset is a UTF-16 encoding, then set charset to UTF-8.'
+ assertCharset("<meta charset='UTF-16BE'>", StandardCharsets.UTF_8);
+ }
+
+ @Test
+ public void xUserDefined() throws IOException {
+ // According to the specification 'If charset is x-user-defined, then set charset to windows-1252.'
+ assertWindows1252("<meta charset='x-user-defined'>");
+ }
+
+ @Test
+ public void iso88591() throws IOException {
+ // In the spec, iso-8859-1 is an alias for WINDOWS-1252
+ assertWindows1252("<meta charset='iso-8859-1'>");
+ }
+
+ @Test
+ public void macintoshEncoding() throws IOException {
+ // The mac roman encoding exists in java, but under the name x-MacRoman
+ assertCharset("<meta charset='macintosh'>", Charset.forName("x-MacRoman"));
+ }
+
+ @Test
+ public void bom() throws IOException {
+ // A BOM should have precedence over the meta
+ assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_8);
+ assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16LE);
+ assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16BE);
+ }
+
+ @Test
+ public void withSlash() throws IOException {
+ assertWindows1252("<meta/charset='WINDOWS-1252'>");
+ }
+
+ @Test
+ public void insideDescription() throws IOException {
+ assertWindows1252("<meta name='description'" +
+ "content='If I write charset=UTF-8 here, it doesnt mean the page is in UTF-8'/>" +
+ "<meta charset='WINDOWS-1252'>");
+ }
+
+ @Test
+ public void insideTag() throws IOException {
+ assertWindows1252("<tag " +
+ "attribute=\"<meta charset='UTF-8'>\" " + // inside attribute
+ "<meta charset='UTF-8' " + // still inside tag
+ "/>" + // tag end
+ "<meta charset='WINDOWS-1252'>");
+ }
+
+ @Test
+ public void missingAttribute() throws IOException {
+ assertWindows1252(
+ "<meta content='charset=UTF-8'>" + // missing http-equiv attribute
+ "<meta charset='WINDOWS-1252'>" // valid declaration
+ );
+ }
+
+ @Test
+ public void insideSpecialTag() throws IOException {
+ // Content inside <?, <!, and </ should be ignored
+ for (byte b : "?!/".getBytes(StandardCharsets.US_ASCII))
+ assertWindows1252(
+ "<" + (char) b + // start comment
+ "<meta charset='UTF-8'>" + // inside special tag
+ "<meta charset='WINDOWS-1252'>" // real charset declaration
+ );
+ }
+
+ @Test
+ public void spaceBeforeTag() throws IOException {
+ assertWindows1252(
+ "< meta charset='UTF-8'>" + // invalid charset declaration
+ "<meta charset='WINDOWS-1252'>" // real charset declaration
+ );
+ }
+
+ @Test
+ public void invalidAttribute() throws IOException {
+ assertWindows1252(
+ "<meta " +
+ "badcharset='UTF-8' " + // invalid charset declaration
+ "charset='WINDOWS-1252'>" // real charset declaration
+ );
+ }
+
+ @Test
+ public void unmatchedQuote() throws IOException {
+ assertWindows1252(
+ "<meta http-equiv='content-type' content='charset=\"UTF-8'>" + // invalid charset declaration
+ "<meta charset='WINDOWS-1252'>" // real charset declaration
+ );
+ }
+
+ @Test
+ public void realWorld() throws IOException {
+     // Beginning of a real page: the charset declaration comes after a script,
+     // a title and other meta tags, and the input is cut off in the middle of
+     // a <base> tag.
+     final String pageStart = "<!DOCTYPE html>\n"
+         + "<html lang=\"fr\">\n"
+         + "<head>\n"
+         + "<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':\n"
+         + "\t\t\tnew Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],\n"
+         + "\t\t\tj=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=\n"
+         + "\t\t\t'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);\n"
+         + "\t\t\t})(window,document,'script','dataLayer','GTM-PNX8H8X');</script>\n"
+         + "<title>Horaires Transilien 2018 - Lignes A B C D E H J K L N P R U</title>\n"
+         + "<meta name=\"description\" content=\"Consultez les horaires du Transilien en temps réel. Lignes A et B du RER. Lignes C D E H J K L N P R U du Transilien.\">\n"
+         + "<meta name=\"keywords\" content=\"horaires transilien\">\n"
+         + "<meta charset=\"windows-1252\">\n"
+         + "<meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n"
+         + "<meta name=\"robots\" content=\"follow, index\">\n"
+         + "<base hr";
+     assertWindows1252(pageStart);
+ }
+
+ @Test
+ public void withCompactComment() throws IOException {
+     // <!--> is a valid comment
+     final String regularComment =
+         "<!--" // start comment
+         + "<meta charset='UTF-8'>" // inside comment
+         + "-->"; // end comment
+     final String compactComment = "<!-->"; // compact comment
+     final String declaration =
+         "<meta charset='WINDOWS-1252'>"; // outside comment, charset declaration
+     assertWindows1252(regularComment + compactComment + declaration);
+ }
+
+ @Test
+ public void withUserProvidedCharset() throws IOException {
+     metadata.set(Metadata.CONTENT_ENCODING, "ISO-8859-1");
+     // ISO-8859-1 is an alias for WINDOWS-1252, even if it's set at the transport layer level
+     final String[] documentsWithoutBom = {
+         "",
+         "<meta charset='UTF-8'>",
+         "<meta http-equiv='content-type' content='charset=utf-8'>"
+     };
+     for (String html : documentsWithoutBom) {
+         assertWindows1252(html);
+     }
+     // if a BOM is present, it has precedence over transport layer information
+     final String documentWithBom = "\ufeff<meta charset='WINDOWS-1252'>";
+     final Charset[] bomCharsets = {
+         StandardCharsets.UTF_8, StandardCharsets.UTF_16LE, StandardCharsets.UTF_16BE
+     };
+     for (Charset bomCharset : bomCharsets) {
+         assertCharset(documentWithBom, bomCharset);
+     }
+ }
+
+ @Test
+ public void throwResistance() throws IOException {
+     // The preprocessing is expected to return as soon as the charset has been
+     // found, so an error raised by the stream AFTER the declaration must go
+     // unnoticed.
+     final String[] completeDeclarations = {
+         "<meta charset='WINDOWS-1252'>",
+         "<meta charset='WINDOWS-1252'><some other tag"
+     };
+     for (String html : completeDeclarations) {
+         assertWindows1252(throwAfter(html));
+     }
+
+     // First entry: the error is raised before the end of the meta tag, so it is
+     // seen and detection returns unsuccessfully.
+     // Remaining entries: there is no meta at all; when the error is raised the
+     // detector should simply return unsuccessfully (no runtime errors).
+     final String[] interruptedInputs = {
+         "<meta charset='WINDOWS-1252'",
+         "<",
+         "<!",
+         "<!doctype",
+         "<!doctype html><html",
+         "<!doctype html><html attr",
+         "<!doctype html><html attr=",
+         "<!doctype html><html attr=x",
+         "<!doctype html><html attr='x"
+     };
+     for (String html : interruptedInputs) {
+         assertCharset(throwAfter(html), null);
+     }
+ }
+
+ /** Asserts that the given markup is detected as WINDOWS-1252. */
+ private void assertWindows1252(String html) throws IOException {
+     final Charset windows1252 = Charset.forName("WINDOWS-1252");
+     assertCharset(html, windows1252);
+ }
+
+ /** Asserts that the content of the given stream is detected as WINDOWS-1252. */
+ private void assertWindows1252(InputStream inStream) throws IOException {
+     final Charset windows1252 = Charset.forName("WINDOWS-1252");
+     assertCharset(inStream, windows1252);
+ }
+
+ /**
+  * Encodes {@code html} and asserts that detection yields {@code charset}.
+  * The markup is encoded with {@code charset} itself, or with UTF-8 when the
+  * expected result is {@code null}.
+  */
+ private void assertCharset(String html, Charset charset) throws IOException {
+     Charset encodeWith = charset;
+     if (encodeWith == null) {
+         encodeWith = StandardCharsets.UTF_8;
+     }
+     final byte[] encodedHtml = html.getBytes(encodeWith);
+     final Charset detected = detectCharset(new ByteArrayInputStream(encodedHtml));
+     assertEquals(html + " should be detected as " + charset, charset, detected);
+ }
+
+ /** Asserts that detection on {@code inStream} yields {@code charset} (possibly {@code null}). */
+ private void assertCharset(InputStream inStream, Charset charset) throws IOException {
+     assertEquals(charset, detectCharset(inStream));
+ }
+
+ /** Runs a fresh StrictHtmlEncodingDetector on the stream, using this test's metadata. */
+ private Charset detectCharset(InputStream inStream) throws IOException {
+     final StrictHtmlEncodingDetector detector = new StrictHtmlEncodingDetector();
+     return detector.detect(inStream, metadata);
+ }
+
+ /**
+  * Builds a stream that first serves the UTF-8 bytes of {@code html} and then
+  * continues with a stream whose {@code read()} always throws an IOException.
+  */
+ private InputStream throwAfter(String html) {
+     final InputStream failing = new InputStream() {
+         @Override
+         public int read() throws IOException {
+             throw new IOException("test exception");
+         }
+     };
+     final InputStream content = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8));
+     return new SequenceInputStream(content, failing);
+ }
+}
[tika] 01/02: improve htmlparser
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 66417f619ba430f8c9e8ab0d903ebfecf936b071
Author: TALLISON <ta...@apache.org>
AuthorDate: Fri Jul 6 11:09:27 2018 -0400
improve htmlparser
---
.../java/org/apache/tika/io/TikaInputStream.java | 11 +++++++++-
.../org/apache/tika/parser/html/HtmlParser.java | 25 ++++++++++++++++++++++
2 files changed, 35 insertions(+), 1 deletion(-)
diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
index 785acc7..96f922f 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
@@ -502,6 +502,8 @@ public class TikaInputStream extends TaggedInputStream {
*/
private Object openContainer;
+ private int consecutiveEOFs = 0;
+
/**
* Creates a TikaInputStream instance. This private constructor is used
* by the static factory methods based on the available information.
@@ -718,6 +720,7 @@ public class TikaInputStream extends TaggedInputStream {
super.reset();
position = mark;
mark = -1;
+ consecutiveEOFs = 0;
}
@Override
@@ -735,9 +738,15 @@ public class TikaInputStream extends TaggedInputStream {
}
@Override
- protected void afterRead(int n) {
+ protected void afterRead(int n) throws IOException {
if (n != -1) {
position += n;
+ } else { // n == -1: this read reported end of stream
+ consecutiveEOFs++;
+ if (consecutiveEOFs > 1000) { // too many EOF reads: assume the caller is looping and fail
+ throw new IOException("Read too many -1 (EOFs); there could be an infinite loop. " +
+ "If you think your file is not corrupt, please open an issue on Tika's JIRA");
+ }
}
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
index a1ef0da..adf591a 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
@@ -29,6 +29,8 @@ import org.apache.tika.config.Field;
import org.apache.tika.detect.AutoDetectReader;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractEncodingDetectorParser;
@@ -40,6 +42,8 @@ import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import javax.swing.text.AbstractDocument;
+
/**
* HTML parser. Uses TagSoup to turn the input document to HTML SAX events,
* and post-processes the events to produce XHTML and metadata expected by
@@ -90,6 +94,27 @@ public class HtmlParser extends AbstractEncodingDetectorParser {
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
+ TemporaryResources tmp = null;
+ try {
+ if (!TikaInputStream.isTikaInputStream(stream)) {
+ tmp = new TemporaryResources();
+ stream = TikaInputStream.get(stream, tmp);
+ }
+ //AutoDetectReader can throw exceptions during
+ //initialization. If we just created a
+ //TemporaryResources, we need to make sure to close it.
+ parseImpl(stream, handler, metadata, context);
+ } finally {
+ if (tmp != null) {
+ tmp.close();
+ }
+ }
+
+ }
+
+
+ private void parseImpl(InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
// Automatically detect the character encoding
try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream),
metadata, getEncodingDetector(context))) {