You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/12 17:33:27 UTC

(tika) branch TIKA-4195 created (now 748073437)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4195
in repository https://gitbox.apache.org/repos/asf/tika.git


      at 748073437 TIKA-4195 -- jsoup parser conceals backoff to default encoding

This branch includes the following new commits:

     new 748073437 TIKA-4195 -- jsoup parser conceals backoff to default encoding

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



(tika) 01/01: TIKA-4195 -- jsoup parser conceals backoff to default encoding

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4195
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 7480734379928d7d32a13aa0b56b9b00183a7773
Author: tallison <ta...@apache.org>
AuthorDate: Mon Feb 12 12:33:13 2024 -0500

    TIKA-4195 -- jsoup parser conceals backoff to default encoding
---
 .../org/apache/tika/detect/AutoDetectReader.java   | 38 ++++++++++++----------
 .../tika/detect/CompositeEncodingDetector.java     |  7 ++++
 .../apache/tika/metadata/TikaCoreProperties.java   | 16 +++++++++
 .../apache/tika/parser/html/HtmlParserTest.java    |  2 +-
 .../org/apache/tika/parser/txt/TXTParserTest.java  |  2 ++
 5 files changed, 46 insertions(+), 19 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
index 5cb920aae..bd7d4f2a9 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
@@ -22,8 +22,6 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.nio.charset.Charset;
-import java.util.Collections;
-import java.util.List;
 
 import org.xml.sax.InputSource;
 
@@ -31,6 +29,7 @@ import org.apache.tika.config.LoadErrorHandler;
 import org.apache.tika.config.ServiceLoader;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.utils.CharsetUtils;
 
@@ -68,26 +67,27 @@ public class AutoDetectReader extends BufferedReader {
     /**
      * @param stream    stream from which to read -- make sure that it supports mark!
      * @param metadata
-     * @param detectors
+     * @param detector
      * @param handler
      * @throws IOException
      * @throws TikaException
      */
     private AutoDetectReader(InputStream stream, Metadata metadata,
-                             List<EncodingDetector> detectors, LoadErrorHandler handler)
+                             EncodingDetector detector, LoadErrorHandler handler)
             throws IOException, TikaException {
-        this(stream, detect(stream, metadata, detectors, handler));
+        this(stream, detect(stream, metadata, detector, handler));
     }
 
     public AutoDetectReader(InputStream stream, Metadata metadata,
                             EncodingDetector encodingDetector) throws IOException, TikaException {
-        this(getBuffered(stream), metadata, Collections.singletonList(encodingDetector),
+        this(getBuffered(stream), metadata, encodingDetector,
                 DEFAULT_LOADER.getLoadErrorHandler());
     }
 
     public AutoDetectReader(InputStream stream, Metadata metadata, ServiceLoader loader)
             throws IOException, TikaException {
-        this(getBuffered(stream), metadata, loader.loadServiceProviders(EncodingDetector.class),
+        this(getBuffered(stream), metadata,
+                new CompositeEncodingDetector(loader.loadServiceProviders(EncodingDetector.class)),
                 loader.getLoadErrorHandler());
     }
 
@@ -101,19 +101,17 @@ public class AutoDetectReader extends BufferedReader {
     }
 
     private static Charset detect(InputStream input, Metadata metadata,
-                                  List<EncodingDetector> detectors, LoadErrorHandler handler)
+                                  EncodingDetector detector, LoadErrorHandler handler)
             throws IOException, TikaException {
         // Ask all given detectors for the character encoding
-        for (EncodingDetector detector : detectors) {
-            try {
-                Charset charset = detector.detect(input, metadata);
-                if (charset != null) {
-                    return charset;
-                }
-            } catch (NoClassDefFoundError e) {
-                // TIKA-1041: Detector dependencies not present.
-                handler.handleLoadError(detector.getClass().getName(), e);
+        try {
+            Charset charset = detector.detect(input, metadata);
+            if (charset != null) {
+                return charset;
             }
+        } catch (NoClassDefFoundError e) {
+            // TIKA-1041: Detector dependencies not present.
+            handler.handleLoadError(detector.getClass().getName(), e);
         }
 
         // Try determining the encoding based on hints in document metadata
@@ -122,7 +120,11 @@ public class AutoDetectReader extends BufferedReader {
             String charset = type.getParameters().get("charset");
             if (charset != null) {
                 try {
-                    return CharsetUtils.forName(charset);
+                    Charset cs = CharsetUtils.forName(charset);
+                    metadata.set(TikaCoreProperties.DETECTED_ENCODING, cs.name());
+                    metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
+                            "AutoDetectReader-charset-metadata-fallback");
+                    return cs;
                 } catch (IllegalArgumentException e) {
                     // ignore
                 }
diff --git a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
index a50b7e4bc..7db79ccc7 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
@@ -26,6 +26,7 @@ import java.util.LinkedList;
 import java.util.List;
 
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 
 public class CompositeEncodingDetector implements EncodingDetector, Serializable {
 
@@ -64,6 +65,12 @@ public class CompositeEncodingDetector implements EncodingDetector, Serializable
         for (EncodingDetector detector : getDetectors()) {
             Charset detected = detector.detect(input, metadata);
             if (detected != null) {
+                metadata.set(TikaCoreProperties.DETECTED_ENCODING, detected.name());
+                //if this has been set by a leaf detector, do not overwrite
+                if (! detector.getClass().getSimpleName().equals("CompositeEncodingDetector")) {
+                    metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
+                            detector.getClass().getSimpleName());
+                }
                 return detected;
             }
         }
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index a75eb8acf..6ff02c1cf 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -322,6 +322,22 @@ public interface TikaCoreProperties {
     //is the file encrypted
     Property IS_ENCRYPTED = Property.internalBoolean(TIKA_META_PREFIX + "encrypted");
 
+    /**
+     * When an EncodingDetector detects an encoding, the encoding should be stored in this field.
+     * This is different from {@link Metadata#CONTENT_ENCODING} because that is what a parser
+     * chooses to use for processing a file. If an EncodingDetector returns "null", a parser
+     * may choose to use a default encoding. We want to differentiate between a parser using a
+     * default encoding and the output of an EncodingDetector.
+     */
+    Property DETECTED_ENCODING = Property.externalText(TIKA_META_PREFIX + "detectedEncoding");
+
+
+    /**
+     * This should be the simple class name of the EncodingDetector whose detected encoding
+     * was used in the parse.
+     */
+    Property ENCODING_DETECTOR = Property.externalText(TIKA_META_PREFIX + "encodingDetector");
+
     /**
      * General metadata key for the count of non-final versions available within a file.  This
      * was added initially to support generalizing incremental updates in PDF.
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index 279040bcf..502911fd3 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -1055,7 +1055,7 @@ public class HtmlParserTest extends TikaTest {
         }
 
         assertEquals(1, (int) tagFrequencies.get("title"));
-        assertEquals(9, (int) tagFrequencies.get("meta"));
+        assertEquals(11, (int) tagFrequencies.get("meta"));
         assertEquals(12, (int) tagFrequencies.get("link"));
         assertEquals(6, (int) tagFrequencies.get("script"));
     }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
index 576d6f6d9..f39be16c4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
@@ -106,6 +106,8 @@ public class TXTParserTest extends TikaTest {
         parser.parse(new ByteArrayInputStream(windows.getBytes("ISO-8859-15")),
                 new DefaultHandler(), metadata, new ParseContext());
         assertEquals("text/plain; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("UniversalEncodingDetector", metadata.get(TikaCoreProperties.ENCODING_DETECTOR));
+        assertEquals("windows-1252", metadata.get(TikaCoreProperties.DETECTED_ENCODING));
 
         metadata = new Metadata();
         parser.parse(new ByteArrayInputStream(unix.getBytes("ISO-8859-15")), new DefaultHandler(),