You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/11/19 13:20:54 UTC
[tika] branch main updated: Add more judge between Charset
Windows-1252 and ISO-8859-1(5) (#336)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new eea92fb Add more judge between Charset Windows-1252 and ISO-8859-1(5) (#336)
eea92fb is described below
commit eea92fb045759300825ed6a727484df17e81e60b
Author: Lee <55...@users.noreply.github.com>
AuthorDate: Thu Nov 19 21:16:19 2020 +0800
Add more judge between Charset Windows-1252 and ISO-8859-1(5) (#336)
Check whether the content contains byte values that are undefined in the Windows-1252 charset.
---
.../apache/tika/parser/txt/UniversalEncodingListener.java | 15 ++++++++++++++-
1 file changed, 14 insertions(+), 1 deletion(-)
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java
index 179466d..c598552 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java
@@ -59,7 +59,8 @@ class UniversalEncodingListener implements CharsetListener {
if (hint != null) {
// Use the encoding hint when available
name = hint;
- } else if (statistics.count('\r') == 0) {
+ } else if (hasNonexistentHexInCharsetWindows1252() || statistics.count('\r') == 0) {
+ // If the content contains byte values that are undefined in windows-1252, or
// If there are no CR(LF)s, then the encoding is more
// likely to be ISO-8859-1(5) than windows-1252
if (statistics.count(0xa4) > 0) { // currency/euro sign
@@ -97,4 +98,16 @@ class UniversalEncodingListener implements CharsetListener {
return charset;
}
+    /**
+     * Byte values 0x81, 0x8d, 0x8f, 0x90 and 0x9d are undefined in windows-1252.
+     * @return true if the statistics report a count above zero for any of them
+     */
+    private boolean hasNonexistentHexInCharsetWindows1252() {
+        return statistics.count(0x81) > 0
+                || statistics.count(0x8d) > 0
+                || statistics.count(0x8f) > 0
+                || statistics.count(0x90) > 0
+                || statistics.count(0x9d) > 0;
+    }
+
}
\ No newline at end of file