You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/20 19:24:57 UTC
tika git commit: TIKA-2219 make sure to transmit charset name in detectAll via Pascal Essiembre
Repository: tika
Updated Branches:
refs/heads/2.x ee761ac00 -> 68f305864
TIKA-2219 make sure to transmit charset name in detectAll via Pascal Essiembre
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/68f30586
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/68f30586
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/68f30586
Branch: refs/heads/2.x
Commit: 68f3058643756d8e08f85903a585684f7d0f0b20
Parents: ee761ac
Author: tballison <ta...@mitre.org>
Authored: Tue Dec 20 14:24:51 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Dec 20 14:24:51 2016 -0500
----------------------------------------------------------------------
.../java/org/apache/tika/parser/txt/CharsetDetector.java | 4 +---
.../org/apache/tika/parser/txt/CharsetDetectorTest.java | 9 +++++++++
.../src/test/resources/test-documents/testTXT_win-1252.txt | 1 +
3 files changed, 11 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/68f30586/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
index 17f0723..1d222b3 100644
--- a/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
+++ b/tika-parser-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
@@ -291,7 +291,6 @@ public class CharsetDetector {
CharsetMatch charsetMatch;
int confidence;
ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>();
-
// Iterate over all possible charsets, remember all that
// give a match quality > 0.
for (i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
@@ -308,8 +307,7 @@ public class CharsetDetector {
// Reduce lack of confidence (delta between "sure" and current) by 50%.
confidence += (MAX_CONFIDENCE - confidence) / 2;
}
-
- CharsetMatch m = new CharsetMatch(this, csr, confidence);
+ CharsetMatch m = new CharsetMatch(this, csr, confidence, charsetMatch.getName(), charsetMatch.getLanguage());
matches.add(m);
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/68f30586/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
index 9064597..645c6eb 100644
--- a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
+++ b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
@@ -62,4 +62,13 @@ public class CharsetDetectorTest extends TikaTest {
assertTrue(reader.ready());
}
}
+
+ @Test
+ public void testWin125XHeuristics() throws Exception {
+ //TIKA-2219
+ CharsetDetector detector = new CharsetDetector();
+ detector.setText(getClass().getResourceAsStream("/test-documents/testTXT_win-1252.txt"));
+ CharsetMatch charset = detector.detect();
+ assertEquals("windows-1252", charset.getName());
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/68f30586/tika-test-resources/src/test/resources/test-documents/testTXT_win-1252.txt
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testTXT_win-1252.txt b/tika-test-resources/src/test/resources/test-documents/testTXT_win-1252.txt
new file mode 100644
index 0000000..519c955
--- /dev/null
+++ b/tika-test-resources/src/test/resources/test-documents/testTXT_win-1252.txt
@@ -0,0 +1 @@
+These smart quotes are the trigger for CharsetRecog_sbcs to think this is a “windows” encoding
\ No newline at end of file