You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/20 19:24:57 UTC

tika git commit: TIKA-2219 make sure to transmit charset name in detectAll via Pascal Essiembre

Repository: tika
Updated Branches:
  refs/heads/2.x ee761ac00 -> 68f305864


TIKA-2219 make sure to transmit charset name in detectAll via Pascal Essiembre


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/68f30586
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/68f30586
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/68f30586

Branch: refs/heads/2.x
Commit: 68f3058643756d8e08f85903a585684f7d0f0b20
Parents: ee761ac
Author: tballison <ta...@mitre.org>
Authored: Tue Dec 20 14:24:51 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Dec 20 14:24:51 2016 -0500

----------------------------------------------------------------------
 .../java/org/apache/tika/parser/txt/CharsetDetector.java    | 4 +---
 .../org/apache/tika/parser/txt/CharsetDetectorTest.java     | 9 +++++++++
 .../src/test/resources/test-documents/testTXT_win-1252.txt  | 1 +
 3 files changed, 11 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/68f30586/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
index 17f0723..1d222b3 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
@@ -291,7 +291,6 @@ public class CharsetDetector {
         CharsetMatch charsetMatch;
         int confidence;
         ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>();
-
         //  Iterate over all possible charsets, remember all that
         //    give a match quality > 0.
         for (i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
@@ -308,8 +307,7 @@ public class CharsetDetector {
                         // Reduce lack of confidence (delta between "sure" and current) by 50%.
                         confidence += (MAX_CONFIDENCE - confidence) / 2;
                     }
-
-                    CharsetMatch m = new CharsetMatch(this, csr, confidence);
+                    CharsetMatch m = new CharsetMatch(this, csr, confidence, charsetMatch.getName(), charsetMatch.getLanguage());
                     matches.add(m);
                 }
             }

http://git-wip-us.apache.org/repos/asf/tika/blob/68f30586/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
index 9064597..645c6eb 100644
--- a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
+++ b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
@@ -62,4 +62,13 @@ public class CharsetDetectorTest extends TikaTest {
             assertTrue(reader.ready());
         }
     }
+
+    @Test
+    public void testWin125XHeuristics() throws Exception {
+        //TIKA-2219
+        CharsetDetector detector = new CharsetDetector();
+        detector.setText(getClass().getResourceAsStream("/test-documents/testTXT_win-1252.txt"));
+        CharsetMatch charset =  detector.detect();
+        assertEquals("windows-1252", charset.getName());
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/68f30586/tika-test-resources/src/test/resources/test-documents/testTXT_win-1252.txt
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testTXT_win-1252.txt b/tika-test-resources/src/test/resources/test-documents/testTXT_win-1252.txt
new file mode 100644
index 0000000..519c955
--- /dev/null
+++ b/tika-test-resources/src/test/resources/test-documents/testTXT_win-1252.txt
@@ -0,0 +1 @@
+These smart quotes are the trigger for CharsetRecog_sbcs to think this is a “windows” encoding
\ No newline at end of file