You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/12 18:11:53 UTC
(tika) branch main updated: TIKA-4195 -- jsoup parser shouldn't conceal backoff to default encoding (#1591)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 455409bf8 TIKA-4195 -- jsoup parser shouldn't conceal backoff to default encoding (#1591)
455409bf8 is described below
commit 455409bf80801152e7c855ddc994fedc32c4cfcf
Author: Tim Allison <ta...@apache.org>
AuthorDate: Mon Feb 12 13:11:47 2024 -0500
TIKA-4195 -- jsoup parser shouldn't conceal backoff to default encoding (#1591)
* TIKA-4195 -- jsoup parser conceals backoff to default encoding
---
.../org/apache/tika/detect/AutoDetectReader.java | 38 ++++++++++++----------
.../tika/detect/CompositeEncodingDetector.java | 7 ++++
.../apache/tika/metadata/TikaCoreProperties.java | 16 +++++++++
.../apache/tika/parser/html/HtmlParserTest.java | 2 +-
.../org/apache/tika/parser/txt/TXTParserTest.java | 2 ++
.../tika/parser/RecursiveParserWrapperTest.java | 5 +--
6 files changed, 49 insertions(+), 21 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
index 5cb920aae..bd7d4f2a9 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
@@ -22,8 +22,6 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
-import java.util.Collections;
-import java.util.List;
import org.xml.sax.InputSource;
@@ -31,6 +29,7 @@ import org.apache.tika.config.LoadErrorHandler;
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.utils.CharsetUtils;
@@ -68,26 +67,27 @@ public class AutoDetectReader extends BufferedReader {
/**
* @param stream stream from which to read -- make sure that it supports mark!
* @param metadata
- * @param detectors
+ * @param detector
* @param handler
* @throws IOException
* @throws TikaException
*/
private AutoDetectReader(InputStream stream, Metadata metadata,
- List<EncodingDetector> detectors, LoadErrorHandler handler)
+ EncodingDetector detector, LoadErrorHandler handler)
throws IOException, TikaException {
- this(stream, detect(stream, metadata, detectors, handler));
+ this(stream, detect(stream, metadata, detector, handler));
}
public AutoDetectReader(InputStream stream, Metadata metadata,
EncodingDetector encodingDetector) throws IOException, TikaException {
- this(getBuffered(stream), metadata, Collections.singletonList(encodingDetector),
+ this(getBuffered(stream), metadata, encodingDetector,
DEFAULT_LOADER.getLoadErrorHandler());
}
public AutoDetectReader(InputStream stream, Metadata metadata, ServiceLoader loader)
throws IOException, TikaException {
- this(getBuffered(stream), metadata, loader.loadServiceProviders(EncodingDetector.class),
+ this(getBuffered(stream), metadata,
+ new CompositeEncodingDetector(loader.loadServiceProviders(EncodingDetector.class)),
loader.getLoadErrorHandler());
}
@@ -101,19 +101,17 @@ public class AutoDetectReader extends BufferedReader {
}
private static Charset detect(InputStream input, Metadata metadata,
- List<EncodingDetector> detectors, LoadErrorHandler handler)
+ EncodingDetector detector, LoadErrorHandler handler)
throws IOException, TikaException {
// Ask all given detectors for the character encoding
- for (EncodingDetector detector : detectors) {
- try {
- Charset charset = detector.detect(input, metadata);
- if (charset != null) {
- return charset;
- }
- } catch (NoClassDefFoundError e) {
- // TIKA-1041: Detector dependencies not present.
- handler.handleLoadError(detector.getClass().getName(), e);
+ try {
+ Charset charset = detector.detect(input, metadata);
+ if (charset != null) {
+ return charset;
}
+ } catch (NoClassDefFoundError e) {
+ // TIKA-1041: Detector dependencies not present.
+ handler.handleLoadError(detector.getClass().getName(), e);
}
// Try determining the encoding based on hints in document metadata
@@ -122,7 +120,11 @@ public class AutoDetectReader extends BufferedReader {
String charset = type.getParameters().get("charset");
if (charset != null) {
try {
- return CharsetUtils.forName(charset);
+ Charset cs = CharsetUtils.forName(charset);
+ metadata.set(TikaCoreProperties.DETECTED_ENCODING, cs.name());
+ metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
+ "AutoDetectReader-charset-metadata-fallback");
+ return cs;
} catch (IllegalArgumentException e) {
// ignore
}
diff --git a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
index a50b7e4bc..7db79ccc7 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
@@ -26,6 +26,7 @@ import java.util.LinkedList;
import java.util.List;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
public class CompositeEncodingDetector implements EncodingDetector, Serializable {
@@ -64,6 +65,12 @@ public class CompositeEncodingDetector implements EncodingDetector, Serializable
for (EncodingDetector detector : getDetectors()) {
Charset detected = detector.detect(input, metadata);
if (detected != null) {
+ metadata.set(TikaCoreProperties.DETECTED_ENCODING, detected.name());
+ //if this has been set by a leaf detector, do not overwrite
+ if (! detector.getClass().getSimpleName().equals("CompositeEncodingDetector")) {
+ metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
+ detector.getClass().getSimpleName());
+ }
return detected;
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index a75eb8acf..6ff02c1cf 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -322,6 +322,22 @@ public interface TikaCoreProperties {
//is the file encrypted
Property IS_ENCRYPTED = Property.internalBoolean(TIKA_META_PREFIX + "encrypted");
+ /**
+ * When an EncodingDetector detects an encoding, the encoding should be stored in this field.
+ * This is different from {@link Metadata#CONTENT_ENCODING} because that is what a parser
+ * chooses to use for processing a file. If an EncodingDetector returns "null", a parser
+ * may choose to use a default encoding. We want to differentiate between a parser using a
+ * default encoding and the output of an EncodingDetector.
+ */
+ Property DETECTED_ENCODING = Property.externalText(TIKA_META_PREFIX + "detectedEncoding");
+
+
+ /**
+ * This should be the simple class name of the EncodingDetector whose detected encoding
+ * was used in the parse.
+ */
+ Property ENCODING_DETECTOR = Property.externalText(TIKA_META_PREFIX + "encodingDetector");
+
/**
* General metadata key for the count of non-final versions available within a file. This
* was added initially to support generalizing incremental updates in PDF.
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index 279040bcf..502911fd3 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -1055,7 +1055,7 @@ public class HtmlParserTest extends TikaTest {
}
assertEquals(1, (int) tagFrequencies.get("title"));
- assertEquals(9, (int) tagFrequencies.get("meta"));
+ assertEquals(11, (int) tagFrequencies.get("meta"));
assertEquals(12, (int) tagFrequencies.get("link"));
assertEquals(6, (int) tagFrequencies.get("script"));
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
index 576d6f6d9..f39be16c4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
@@ -106,6 +106,8 @@ public class TXTParserTest extends TikaTest {
parser.parse(new ByteArrayInputStream(windows.getBytes("ISO-8859-15")),
new DefaultHandler(), metadata, new ParseContext());
assertEquals("text/plain; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("UniversalEncodingDetector", metadata.get(TikaCoreProperties.ENCODING_DETECTOR));
+ assertEquals("windows-1252", metadata.get(TikaCoreProperties.DETECTED_ENCODING));
metadata = new Metadata();
parser.parse(new ByteArrayInputStream(unix.getBytes("ISO-8859-15")), new DefaultHandler(),
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 61eeab14d..17b18646a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -148,8 +148,9 @@ public class RecursiveParserWrapperTest extends TikaTest {
assertEquals("true", list.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
- assertContains("dissolve the political", list.get(6).get(TikaCoreProperties.TIKA_CONTENT));
- assertNotContained("them to the separation",
+ assertContains("necessary for one people",
+ list.get(6).get(TikaCoreProperties.TIKA_CONTENT));
+ assertNotContained("dissolve the political",
list.get(6).get(TikaCoreProperties.TIKA_CONTENT));
}