You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by rg...@apache.org on 2012/08/01 15:44:52 UTC
svn commit: r1367991 - in /tika/trunk:
tika-core/src/main/java/org/apache/tika/detect/
tika-parsers/src/test/java/org/apache/tika/parser/
tika-parsers/src/test/resources/test-documents/
Author: rgauss
Date: Wed Aug 1 13:44:52 2012
New Revision: 1367991
URL: http://svn.apache.org/viewvc?rev=1367991&view=rev
Log:
TIKA-965: Text Detection Fails on Mostly Non-ASCII UTF-8 Files
- Added looksLikeUTF8 method to TextStatistics
- Added check to TextDetector.detect for looksLikeUTF8
- Added testTextNonASCIIUTF8 to AutoDetectParserTest and testTXTNonASCIIUTF8.txt test resource
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testTXTNonASCIIUTF8.txt (with props)
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java?rev=1367991&r1=1367990&r2=1367991&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java Wed Aug 1 13:44:52 2012
@@ -127,7 +127,7 @@ public class TextDetector implements Det
m = input.read(buffer, 0, Math.min(bytesToTest - n, buffer.length));
}
- if (stats.isMostlyAscii()) {
+ if (stats.isMostlyAscii() || stats.looksLikeUTF8()) {
return MediaType.TEXT_PLAIN;
} else {
return MediaType.OCTET_STREAM;
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java?rev=1367991&r1=1367990&r2=1367991&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java Wed Aug 1 13:44:52 2012
@@ -53,6 +53,34 @@ public class TextStatistics {
}
/**
+ * Checks whether the observed byte stream looks like UTF-8 encoded text.
+ *
+ * @since Apache Tika 1.3
+ * @return <code>true</code> if the seen bytes look like UTF-8,
+ * <code>false</code> otherwise
+ */
+ public boolean looksLikeUTF8() {
+ int control = count(0, 0x20); // bytes 0x00-0x1f
+ int utf8 = count(0x20, 0x80); // bytes 0x20-0x7f (single-byte sequences); lead bytes are added below
+ int safe = countSafeControl();
+
+ int expectedContinuation = 0;
+ int[] leading = new int[] { // lead bytes of 2-, 3- and 4-byte sequences, in that order
+ count(0xc0, 0xe0), count(0xe0, 0xf0), count(0xf0, 0xf8) };
+ for (int i = 0; i < leading.length; i++) {
+ utf8 += leading[i];
+ expectedContinuation += (i + 1) * leading[i]; // leading[i] starts an (i+2)-byte sequence, i.e. i+1 continuation bytes
+ }
+
+ int continuation = count(0x80, 0xc0); // continuation bytes 0x80-0xbf
+ return utf8 > 0
+ && continuation <= expectedContinuation
+ && continuation >= expectedContinuation - 3 // presumably tolerates a sequence cut off at the end of the sampled bytes
+ && count(0xf8, 0x100) == 0 // bytes 0xf8-0xff are never valid in UTF-8 (was 0xf80: an empty range, so the check never fired)
+ && (control - safe) * 100 < utf8 * 2; // control bytes not counted by countSafeControl() must stay below 2% of utf8
+ }
+
+ /**
* Returns the total number of bytes seen so far.
*
* @return count of all bytes
@@ -117,7 +145,7 @@ public class TextStatistics {
}
private int count(int from, int to) {
- assert 0 <= from && to < counts.length;
+ assert 0 <= from && to <= counts.length;
int count = 0;
for (int i = from; i < to; i++) {
count += counts[i];
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java?rev=1367991&r1=1367990&r2=1367991&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java Wed Aug 1 13:44:52 2012
@@ -47,6 +47,7 @@ public class AutoDetectParserTest extend
private static final String NUMBERS = "application/vnd.apple.numbers";
private static final String RTF = "application/rtf";
private static final String PLAINTEXT = "text/plain; charset=ISO-8859-1";
+ private static final String UTF8TEXT = "text/plain; charset=UTF-8";
private static final String WORD = "application/msword";
private static final String XML = "application/xml";
private static final String RSS = "application/rss+xml";
@@ -195,6 +196,10 @@ public class AutoDetectParserTest extend
public void testText() throws Exception {
assertAutoDetect("testTXT.txt", PLAINTEXT, "indexation de Txt");
}
+
+ public void testTextNonASCIIUTF8() throws Exception {
+ assertAutoDetect("testTXTNonASCIIUTF8.txt", UTF8TEXT, "The quick brown fox jumps over the lazy dog"); // TIKA-965: a mostly non-ASCII UTF-8 file must auto-detect as UTF-8 plain text
+ }
public void testWord() throws Exception {
assertAutoDetect("testWORD.doc", WORD, "Sample Word Document");
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testTXTNonASCIIUTF8.txt
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testTXTNonASCIIUTF8.txt?rev=1367991&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testTXTNonASCIIUTF8.txt (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testTXTNonASCIIUTF8.txt Wed Aug 1 13:44:52 2012
@@ -0,0 +1,7 @@
+The quick brown fox jumps over the lazy dog
+
+Le renard brun rapide saute par-dessus le chien paresseux
+
+Der schnelle braune Fuchs springt über den faulen Hund
+
+براÙÙ ÙÙس اÙسرÙع ÙÙÙز ÙÙ٠اÙÙÙب ÙساÙÙ
\ No newline at end of file
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testTXTNonASCIIUTF8.txt
------------------------------------------------------------------------------
svn:eol-style = native