You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/11/19 13:20:54 UTC

[tika] branch main updated: Add more judge between Charset Windows-1252 and ISO-8859-1(5) (#336)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new eea92fb  Add more judge between Charset Windows-1252 and ISO-8859-1(5) (#336)
eea92fb is described below

commit eea92fb045759300825ed6a727484df17e81e60b
Author: Lee <55...@users.noreply.github.com>
AuthorDate: Thu Nov 19 21:16:19 2020 +0800

    Add more judge between Charset Windows-1252 and ISO-8859-1(5) (#336)
    
    Check whether the content contains byte values that are not defined in the Windows-1252 charset.
---
 .../apache/tika/parser/txt/UniversalEncodingListener.java | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java
index 179466d..c598552 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java
@@ -59,7 +59,8 @@ class UniversalEncodingListener implements CharsetListener {
             if (hint != null) {
                 // Use the encoding hint when available
                 name = hint;
-            } else if (statistics.count('\r') == 0) {
+            } else if (hasNonexistentHexInCharsetWindows1252() || statistics.count('\r') == 0) {
+                // If it has nonexistent hex value in charset windows-1252 or
                 // If there are no CR(LF)s, then the encoding is more
                 // likely to be ISO-8859-1(5) than windows-1252
                 if (statistics.count(0xa4) > 0) { // currency/euro sign
@@ -97,4 +98,16 @@ class UniversalEncodingListener implements CharsetListener {
         return charset;
     }
 
+    /**
+     * Checks for bytes that windows-1252 leaves unassigned (0x81, 0x8d, 0x8f,
+     * 0x90, 0x9d); returns true if statistics counted any of them at least once.
+     */
+    private boolean hasNonexistentHexInCharsetWindows1252() {
+        return statistics.count(0x81) > 0
+                || statistics.count(0x8d) > 0
+                || statistics.count(0x8f) > 0
+                || statistics.count(0x90) > 0
+                || statistics.count(0x9d) > 0;
+    }
+
 }
\ No newline at end of file