You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by lf...@apache.org on 2022/05/24 02:24:23 UTC

[tika] branch main updated: TIKA-3774: fix ignoreCharsets param of Icu4jEncodingDetector

This is an automated email from the ASF dual-hosted git repository.

lfcnassif pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 768526160 TIKA-3774: fix ignoreCharsets param of Icu4jEncodingDetector
     new d5b66db06 Merge branch 'main' of https://github.com/apache/tika into main
768526160 is described below

commit 768526160b3d12fc4df4671e093e101ccc44eb22
Author: Luis Nassif <lf...@gmail.com>
AuthorDate: Mon May 23 23:17:18 2022 -0300

    TIKA-3774: fix ignoreCharsets param of Icu4jEncodingDetector
---
 .../apache/tika/parser/txt/Icu4jEncodingDetector.java    |   2 +-
 .../org/apache/tika/parser/txt/CharsetDetectorTest.java  |  10 ++++++++--
 .../test-configs/tika-config-ignore-charset.xml          |   1 +
 .../resources/test-documents/test_ignore_IBM420.html     | Bin 0 -> 1869 bytes
 4 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
index ce9ee9fa4..f89b27c12 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
@@ -75,7 +75,7 @@ public class Icu4jEncodingDetector implements EncodingDetector {
             try {
                 String n = match.getNormalizedName();
                 if (ignoreCharsets.contains(n)) {
-                    return null;
+                    continue;
                 }
                 return CharsetUtils.forName(match.getNormalizedName());
             } catch (IllegalArgumentException e) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
index d41e3498f..3e6594cb0 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
@@ -136,10 +136,16 @@ public class CharsetDetectorTest extends TikaTest {
         TikaConfig tikaConfig = new TikaConfig(
                 getResourceAsStream("/test-configs/tika-config-ignore-charset.xml"));
 
-        Metadata m = new Metadata();
+        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
 
+        Metadata m = new Metadata();
         m.set(TikaCoreProperties.RESOURCE_NAME_KEY, "texty-text.txt");
         assertContains("ACTIVE AGE", getXML("testIgnoreCharset.txt",
-                new AutoDetectParser(tikaConfig), m).xml);
+                parser, m).xml);
+
+        m = new Metadata();
+        m.set(TikaCoreProperties.RESOURCE_NAME_KEY, "texty-text.txt");
+        assertContains("Please check your email", getXML("test_ignore_IBM420.html",
+                parser, m).xml);
     }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-configs/tika-config-ignore-charset.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-configs/tika-config-ignore-charset.xml
index 0b61f20c9..2ca84a940 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-configs/tika-config-ignore-charset.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-configs/tika-config-ignore-charset.xml
@@ -23,6 +23,7 @@
         <encodingDetector class="org.apache.tika.parser.txt.Icu4jEncodingDetector">
             <params>
                 <param name="ignoreCharsets" type="list">
+                    <string>IBM420</string>
                     <string>IBM424</string>
                 </param>
             </params>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-documents/test_ignore_IBM420.html b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-documents/test_ignore_IBM420.html
new file mode 100644
index 000000000..2aecab221
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/resources/test-documents/test_ignore_IBM420.html differ