You are viewing a plain-text version of this content. The canonical link to the original was a hyperlink and is not preserved in this version.
Posted to commits@tika.apache.org by lf...@apache.org on 2022/05/24 02:24:23 UTC
[tika] branch main updated: TIKA-3774: fix ignoreCharsets param of Icu4jEncodingDetector
This is an automated email from the ASF dual-hosted git repository.
lfcnassif pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 768526160 TIKA-3774: fix ignoreCharsets param of Icu4jEncodingDetector
new d5b66db06 Merge branch 'main' of https://github.com/apache/tika into main
768526160 is described below
commit 768526160b3d12fc4df4671e093e101ccc44eb22
Author: Luis Nassif <lf...@gmail.com>
AuthorDate: Mon May 23 23:17:18 2022 -0300
TIKA-3774: fix ignoreCharsets param of Icu4jEncodingDetector
---
.../apache/tika/parser/txt/Icu4jEncodingDetector.java | 2 +-
.../org/apache/tika/parser/txt/CharsetDetectorTest.java | 10 ++++++++--
.../test-configs/tika-config-ignore-charset.xml | 1 +
.../resources/test-documents/test_ignore_IBM420.html | Bin 0 -> 1869 bytes
4 files changed, 10 insertions(+), 3 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
index ce9ee9fa4..f89b27c12 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
@@ -75,7 +75,7 @@ public class Icu4jEncodingDetector implements EncodingDetector {
try {
String n = match.getNormalizedName();
if (ignoreCharsets.contains(n)) {
- return null;
+ continue;
}
return CharsetUtils.forName(match.getNormalizedName());
} catch (IllegalArgumentException e) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
index d41e3498f..3e6594cb0 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
@@ -136,10 +136,16 @@ public class CharsetDetectorTest extends TikaTest {
TikaConfig tikaConfig = new TikaConfig(
getResourceAsStream("/test-configs/tika-config-ignore-charset.xml"));
- Metadata m = new Metadata();
+ AutoDetectParser parser = new AutoDetectParser(tikaConfig);
+ Metadata m = new Metadata();
m.set(TikaCoreProperties.RESOURCE_NAME_KEY, "texty-text.txt");
assertContains("ACTIVE AGE", getXML("testIgnoreCharset.txt",
- new AutoDetectParser(tikaConfig), m).xml);
+ parser, m).xml);
+
+ m = new Metadata();
+ m.set(TikaCoreProperties.RESOURCE_NAME_KEY, "texty-text.txt");
+ assertContains("Please check your email", getXML("test_ignore_IBM420.html",
+ parser, m).xml);
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-configs/tika-config-ignore-charset.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-configs/tika-config-ignore-charset.xml
index 0b61f20c9..2ca84a940 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-configs/tika-config-ignore-charset.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-configs/tika-config-ignore-charset.xml
@@ -23,6 +23,7 @@
<encodingDetector class="org.apache.tika.parser.txt.Icu4jEncodingDetector">
<params>
<param name="ignoreCharsets" type="list">
+ <string>IBM420</string>
<string>IBM424</string>
</param>
</params>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-documents/test_ignore_IBM420.html b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-documents/test_ignore_IBM420.html
new file mode 100644
index 000000000..2aecab221
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-documents/test_ignore_IBM420.html differ