You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/04/30 20:56:27 UTC
[tika] 01/02: TIKA-3374 -- allow users to turn off charset detection
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
commit e6f360e7ffc7a17bc75f6090dd4f1f9715924005
Author: tballison <ta...@apache.org>
AuthorDate: Fri Apr 30 16:47:04 2021 -0400
TIKA-3374 -- allow users to turn off charset detection
---
.../java/org/apache/tika/parser/pkg/PackageParser.java | 16 +++++++++++++++-
1 file changed, 15 insertions(+), 1 deletion(-)
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index 38dde90..8113556 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -59,6 +59,7 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
+import org.apache.tika.config.Field;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
@@ -210,6 +211,8 @@ public class PackageParser extends AbstractEncodingDetectorParser {
return entrydata;
}
+ private boolean detectCharsetsInEntryNames = true;
+
public PackageParser() {
super();
}
@@ -405,7 +408,7 @@ public class PackageParser extends AbstractEncodingDetectorParser {
String name = entry.getName();
//Try to detect charset of archive entry in case of non-unicode filename is used
- if (entry instanceof ZipArchiveEntry) {
+ if (detectCharsetsInEntryNames && entry instanceof ZipArchiveEntry) {
Charset candidate =
getEncodingDetector().detect(new ByteArrayInputStream(((ZipArchiveEntry) entry).getRawName()),
parentMetadata);
@@ -497,4 +500,15 @@ public class PackageParser extends AbstractEncodingDetectorParser {
file.close();
}
}
+
+ /**
+ * Whether or not to run the default charset detector against entry
+ * names in ZipFiles. The default is <code>true</code>.
+ *
+ * @param detectCharsetsInEntryNames <code>true</code> to run charset detection on zip entry names, <code>false</code> to skip it
+ */
+ @Field
+ public void setDetectCharsetsInEntryNames(boolean detectCharsetsInEntryNames) {
+ this.detectCharsetsInEntryNames = detectCharsetsInEntryNames;
+ }
}