You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by rg...@apache.org on 2012/08/01 15:44:52 UTC

svn commit: r1367991 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/detect/ tika-parsers/src/test/java/org/apache/tika/parser/ tika-parsers/src/test/resources/test-documents/

Author: rgauss
Date: Wed Aug  1 13:44:52 2012
New Revision: 1367991

URL: http://svn.apache.org/viewvc?rev=1367991&view=rev
Log:
TIKA-965: Text Detection Fails on Mostly Non-ASCII UTF-8 Files
   - Added looksLikeUTF8 method to TextStatistics
   - Added check to TextDetector.detect for looksLikeUTF8
   - Added testTextNonASCIIUTF8 to AutoDetectParserTest and testTXTNonASCIIUTF8.txt test resource

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testTXTNonASCIIUTF8.txt   (with props)
Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java?rev=1367991&r1=1367990&r2=1367991&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java Wed Aug  1 13:44:52 2012
@@ -127,7 +127,7 @@ public class TextDetector implements Det
                 m = input.read(buffer, 0, Math.min(bytesToTest - n, buffer.length));
             }
 
-            if (stats.isMostlyAscii()) {
+            if (stats.isMostlyAscii() || stats.looksLikeUTF8()) {
                 return MediaType.TEXT_PLAIN;
             } else {
                 return MediaType.OCTET_STREAM;

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java?rev=1367991&r1=1367990&r2=1367991&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java Wed Aug  1 13:44:52 2012
@@ -53,6 +53,34 @@ public class TextStatistics {
     }
 
     /**
+     * Checks whether the observed byte stream looks like UTF-8 encoded text.
+     *
+     * @since Apache Tika 1.3
+     * @return <code>true</code> if the seen bytes look like UTF-8,
+     *         <code>false</code> otherwise
+     */
+    public boolean looksLikeUTF8() {
+        int control = count(0, 0x20);   // bytes 0x00-0x1F
+        int utf8 = count(0x20, 0x80);   // bytes 0x20-0x7F; lead bytes are added below
+        int safe = countSafeControl();
+
+        int expectedContinuation = 0;
+        int[] leading = new int[] {     // lead bytes of 2-, 3- and 4-byte sequences
+                count(0xc0, 0xe0), count(0xe0, 0xf0), count(0xf0, 0xf8) };
+        for (int i = 0; i < leading.length; i++) {
+            utf8 += leading[i];
+            expectedContinuation += (i + 1) * leading[i]; // leading[i] implies i + 1 continuation bytes each
+        }
+
+        int continuation = count(0x80, 0xc0);
+        return utf8 > 0
+                && continuation <= expectedContinuation
+                && continuation >= expectedContinuation - 3 // up to 3 may be missing (presumably a sequence cut off at the sample end)
+                && count(0xf8, 0x100) == 0 // 0xF8-0xFF never occur in UTF-8; was 0xf80, an empty range that always counted 0
+                && (control - safe) * 100 < utf8 * 2; // control bytes not counted as safe must stay under 2% of utf8
+    }
+
+    /**
      * Returns the total number of bytes seen so far.
      *
      * @return count of all bytes
@@ -117,7 +145,7 @@ public class TextStatistics {
     }
 
     private int count(int from, int to) {
-        assert 0 <= from && to < counts.length;
+        assert 0 <= from && to <= counts.length;
         int count = 0;
         for (int i = from; i < to; i++) {
             count += counts[i];

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java?rev=1367991&r1=1367990&r2=1367991&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java Wed Aug  1 13:44:52 2012
@@ -47,6 +47,7 @@ public class AutoDetectParserTest extend
     private static final String NUMBERS    = "application/vnd.apple.numbers";
     private static final String RTF        = "application/rtf";
     private static final String PLAINTEXT  = "text/plain; charset=ISO-8859-1";
+    private static final String UTF8TEXT   = "text/plain; charset=UTF-8";
     private static final String WORD       = "application/msword";
     private static final String XML        = "application/xml";
     private static final String RSS        = "application/rss+xml";
@@ -195,6 +196,10 @@ public class AutoDetectParserTest extend
     public void testText() throws Exception {
         assertAutoDetect("testTXT.txt", PLAINTEXT, "indexation de Txt");
     }
+    
+    public void testTextNonASCIIUTF8() throws Exception { // TIKA-965: UTF-8 text with non-ASCII content must auto-detect as UTF8TEXT
+        assertAutoDetect("testTXTNonASCIIUTF8.txt", UTF8TEXT, "The quick brown fox jumps over the lazy dog");
+    }
 
     public void testWord() throws Exception {
         assertAutoDetect("testWORD.doc", WORD, "Sample Word Document");

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testTXTNonASCIIUTF8.txt
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testTXTNonASCIIUTF8.txt?rev=1367991&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testTXTNonASCIIUTF8.txt (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testTXTNonASCIIUTF8.txt Wed Aug  1 13:44:52 2012
@@ -0,0 +1,7 @@
+The quick brown fox jumps over the lazy dog
+
+Le renard brun rapide saute par-dessus le chien paresseux
+
+Der schnelle braune Fuchs springt über den faulen Hund
+
+براون وكس السريع يقفز فوق الكلب كسالي
\ No newline at end of file

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testTXTNonASCIIUTF8.txt
------------------------------------------------------------------------------
    svn:eol-style = native