You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/12 17:33:28 UTC

(tika) 01/01: TIKA-4195 -- jsoup parser conceals backoff to default encoding

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4195
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 7480734379928d7d32a13aa0b56b9b00183a7773
Author: tallison <ta...@apache.org>
AuthorDate: Mon Feb 12 12:33:13 2024 -0500

    TIKA-4195 -- jsoup parser conceals backoff to default encoding
---
 .../org/apache/tika/detect/AutoDetectReader.java   | 38 ++++++++++++----------
 .../tika/detect/CompositeEncodingDetector.java     |  7 ++++
 .../apache/tika/metadata/TikaCoreProperties.java   | 16 +++++++++
 .../apache/tika/parser/html/HtmlParserTest.java    |  2 +-
 .../org/apache/tika/parser/txt/TXTParserTest.java  |  2 ++
 5 files changed, 46 insertions(+), 19 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
index 5cb920aae..bd7d4f2a9 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
@@ -22,8 +22,6 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.nio.charset.Charset;
-import java.util.Collections;
-import java.util.List;
 
 import org.xml.sax.InputSource;
 
@@ -31,6 +29,7 @@ import org.apache.tika.config.LoadErrorHandler;
 import org.apache.tika.config.ServiceLoader;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.utils.CharsetUtils;
 
@@ -68,26 +67,27 @@ public class AutoDetectReader extends BufferedReader {
     /**
      * @param stream    stream from which to read -- make sure that it supports mark!
      * @param metadata
-     * @param detectors
+     * @param detector
      * @param handler
      * @throws IOException
      * @throws TikaException
      */
     private AutoDetectReader(InputStream stream, Metadata metadata,
-                             List<EncodingDetector> detectors, LoadErrorHandler handler)
+                             EncodingDetector detector, LoadErrorHandler handler)
             throws IOException, TikaException {
-        this(stream, detect(stream, metadata, detectors, handler));
+        this(stream, detect(stream, metadata, detector, handler));
     }
 
     public AutoDetectReader(InputStream stream, Metadata metadata,
                             EncodingDetector encodingDetector) throws IOException, TikaException {
-        this(getBuffered(stream), metadata, Collections.singletonList(encodingDetector),
+        this(getBuffered(stream), metadata, encodingDetector,
                 DEFAULT_LOADER.getLoadErrorHandler());
     }
 
     public AutoDetectReader(InputStream stream, Metadata metadata, ServiceLoader loader)
             throws IOException, TikaException {
-        this(getBuffered(stream), metadata, loader.loadServiceProviders(EncodingDetector.class),
+        this(getBuffered(stream), metadata,
+                new CompositeEncodingDetector(loader.loadServiceProviders(EncodingDetector.class)),
                 loader.getLoadErrorHandler());
     }
 
@@ -101,19 +101,17 @@ public class AutoDetectReader extends BufferedReader {
     }
 
     private static Charset detect(InputStream input, Metadata metadata,
-                                  List<EncodingDetector> detectors, LoadErrorHandler handler)
+                                  EncodingDetector detector, LoadErrorHandler handler)
             throws IOException, TikaException {
         // Ask all given detectors for the character encoding
-        for (EncodingDetector detector : detectors) {
-            try {
-                Charset charset = detector.detect(input, metadata);
-                if (charset != null) {
-                    return charset;
-                }
-            } catch (NoClassDefFoundError e) {
-                // TIKA-1041: Detector dependencies not present.
-                handler.handleLoadError(detector.getClass().getName(), e);
+        try {
+            Charset charset = detector.detect(input, metadata);
+            if (charset != null) {
+                return charset;
             }
+        } catch (NoClassDefFoundError e) {
+            // TIKA-1041: Detector dependencies not present.
+            handler.handleLoadError(detector.getClass().getName(), e);
         }
 
         // Try determining the encoding based on hints in document metadata
@@ -122,7 +120,11 @@ public class AutoDetectReader extends BufferedReader {
             String charset = type.getParameters().get("charset");
             if (charset != null) {
                 try {
-                    return CharsetUtils.forName(charset);
+                    Charset cs = CharsetUtils.forName(charset);
+                    metadata.set(TikaCoreProperties.DETECTED_ENCODING, cs.name());
+                    metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
+                            "AutoDetectReader-charset-metadata-fallback");
+                    return cs;
                 } catch (IllegalArgumentException e) {
                     // ignore
                 }
diff --git a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
index a50b7e4bc..7db79ccc7 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
@@ -26,6 +26,7 @@ import java.util.LinkedList;
 import java.util.List;
 
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 
 public class CompositeEncodingDetector implements EncodingDetector, Serializable {
 
@@ -64,6 +65,12 @@ public class CompositeEncodingDetector implements EncodingDetector, Serializable
         for (EncodingDetector detector : getDetectors()) {
             Charset detected = detector.detect(input, metadata);
             if (detected != null) {
+                metadata.set(TikaCoreProperties.DETECTED_ENCODING, detected.name());
+                //a nested composite already recorded its leaf detector; do not overwrite
+                if (! detector.getClass().getSimpleName().equals("CompositeEncodingDetector")) {
+                    metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
+                            detector.getClass().getSimpleName());
+                }
                 return detected;
             }
         }
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index a75eb8acf..6ff02c1cf 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -322,6 +322,22 @@ public interface TikaCoreProperties {
     //is the file encrypted
     Property IS_ENCRYPTED = Property.internalBoolean(TIKA_META_PREFIX + "encrypted");
 
+    /**
+     * When an EncodingDetector detects an encoding, the encoding should be stored in this field.
+     * This is different from {@link Metadata#CONTENT_ENCODING} because that is what a parser
+     * chooses to use for processing a file. If an EncodingDetector returns {@code null}, a parser
+     * may choose to use a default encoding. We want to differentiate between a parser using a
+     * default encoding and the output of an EncodingDetector.
+     */
+    Property DETECTED_ENCODING = Property.externalText(TIKA_META_PREFIX + "detectedEncoding");
+
+
+    /**
+     * This should be the simple class name of the EncodingDetector whose detected encoding
+     * was used in the parse.
+     */
+    Property ENCODING_DETECTOR = Property.externalText(TIKA_META_PREFIX + "encodingDetector");
+
     /**
      * General metadata key for the count of non-final versions available within a file.  This
      * was added initially to support generalizing incremental updates in PDF.
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index 279040bcf..502911fd3 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -1055,7 +1055,7 @@ public class HtmlParserTest extends TikaTest {
         }
 
         assertEquals(1, (int) tagFrequencies.get("title"));
-        assertEquals(9, (int) tagFrequencies.get("meta"));
+        assertEquals(11, (int) tagFrequencies.get("meta"));
         assertEquals(12, (int) tagFrequencies.get("link"));
         assertEquals(6, (int) tagFrequencies.get("script"));
     }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
index 576d6f6d9..f39be16c4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
@@ -106,6 +106,8 @@ public class TXTParserTest extends TikaTest {
         parser.parse(new ByteArrayInputStream(windows.getBytes("ISO-8859-15")),
                 new DefaultHandler(), metadata, new ParseContext());
         assertEquals("text/plain; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("UniversalEncodingDetector", metadata.get(TikaCoreProperties.ENCODING_DETECTOR));
+        assertEquals("windows-1252", metadata.get(TikaCoreProperties.DETECTED_ENCODING));
 
         metadata = new Metadata();
         parser.parse(new ByteArrayInputStream(unix.getBytes("ISO-8859-15")), new DefaultHandler(),