You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/04/30 20:56:27 UTC

[tika] 01/02: TIKA-3374 -- allow users to turn off charset detection

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit e6f360e7ffc7a17bc75f6090dd4f1f9715924005
Author: tballison <ta...@apache.org>
AuthorDate: Fri Apr 30 16:47:04 2021 -0400

    TIKA-3374 -- allow users to turn off charset detection
---
 .../java/org/apache/tika/parser/pkg/PackageParser.java   | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index 38dde90..8113556 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -59,6 +59,7 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
+import org.apache.tika.config.Field;
 import org.apache.tika.detect.EncodingDetector;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
@@ -210,6 +211,8 @@ public class PackageParser extends AbstractEncodingDetectorParser {
         return entrydata;
     }
 
+    private boolean detectCharsetsInEntryNames = true;
+
     public PackageParser() {
         super();
     }
@@ -405,7 +408,7 @@ public class PackageParser extends AbstractEncodingDetectorParser {
         String name = entry.getName();
         
         //Try to detect charset of archive entry in case of non-unicode filename is used
-        if (entry instanceof ZipArchiveEntry) {
+        if (detectCharsetsInEntryNames && entry instanceof ZipArchiveEntry) {
             Charset candidate =
                     getEncodingDetector().detect(new ByteArrayInputStream(((ZipArchiveEntry) entry).getRawName()),
                         parentMetadata);
@@ -497,4 +500,15 @@ public class PackageParser extends AbstractEncodingDetectorParser {
             file.close();
         }
     }
+
+    /**
+     * Sets whether the encoding detector is run against the raw names of
+     * zip archive entries. The field is initialized to <code>true</code>.
+     *
+     * @param detectCharsetsInEntryNames <code>false</code> to skip that detection
+     */
+    @Field
+    public void setDetectCharsetsInEntryNames(boolean detectCharsetsInEntryNames) {
+        this.detectCharsetsInEntryNames = detectCharsetsInEntryNames;
+    }
 }