You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/09/17 13:48:52 UTC

svn commit: r1171952 - in /tika/trunk/tika-core/src: main/java/org/apache/tika/detect/TextDetector.java test/java/org/apache/tika/detect/TextDetectorTest.java

Author: jukka
Date: Sat Sep 17 11:48:51 2011
New Revision: 1171952

URL: http://svn.apache.org/viewvc?rev=1171952&view=rev
Log:
TIKA-688: Enhance content-type detector to recognize almost plain text

Allow up to 2% control chars and up to 10% non-ASCII chars in otherwise plain text

Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
    tika/trunk/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java?rev=1171952&r1=1171951&r2=1171952&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java Sat Sep 17 11:48:51 2011
@@ -93,20 +93,34 @@ public class TextDetector implements Det
 
         input.mark(NUMBER_OF_BYTES_TO_TEST);
         try {
-            for (int i = 0; i < NUMBER_OF_BYTES_TO_TEST; i++) {
-                int ch = input.read();
-                if (ch == -1) {
-                    if (i > 0) {
-                        return MediaType.TEXT_PLAIN;
-                    } else {
-                        // See https://issues.apache.org/jira/browse/TIKA-483
-                        return MediaType.OCTET_STREAM;
-                    }
-                } else if (ch < IS_CONTROL_BYTE.length && IS_CONTROL_BYTE[ch]) {
-                    return MediaType.OCTET_STREAM;
+            int chars = 0;
+            int controls = 0;
+            int asciis = 0;
+            int ch = input.read();
+            while (ch != -1 && chars < NUMBER_OF_BYTES_TO_TEST) {
+                if (ch < IS_CONTROL_BYTE.length && IS_CONTROL_BYTE[ch]) {
+                    controls++;
+                } else if (ch < 127) {
+                    asciis++;
                 }
+                ch = input.read();
+                chars++;
+            }
+            if (chars == 0) {
+                // Empty document, so treat it as binary
+                // See https://issues.apache.org/jira/browse/TIKA-483
+                return MediaType.OCTET_STREAM;
+            } else if (controls == 0) {
+                // No control characters, so treat it as text
+                return MediaType.TEXT_PLAIN;
+            } else if (controls < chars * 2 / 100
+                    && asciis > chars * 90 / 100) {
+                // Almost plain text (< 2% control, > 90% ASCII range)
+                // See https://issues.apache.org/jira/browse/TIKA-688
+                return MediaType.TEXT_PLAIN;
+            } else {
+                return MediaType.OCTET_STREAM;
             }
-            return MediaType.TEXT_PLAIN;
         } finally {
             input.reset();
         }

Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java?rev=1171952&r1=1171951&r2=1171952&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java Sat Sep 17 11:48:51 2011
@@ -58,12 +58,21 @@ public class TextDetectorTest extends Te
         byte[] data = new byte[512];
         Arrays.fill(data, (byte) '.');
         assertText(data);
+        Arrays.fill(data, 100, 109, (byte) 0x1f);
+        assertText(data); // almost text
+        Arrays.fill(data, 100, 110, (byte) 0x1f);
+        assertNotText(data); // no longer almost text, too many control chars
         Arrays.fill(data, (byte) 0x1f);
         assertNotText(data);
 
         data = new byte[513];
         Arrays.fill(data, (byte) '.');
+        data[0] = 0x1f;
         assertText(data);
+        Arrays.fill(data, 100, 150, (byte) 0x83);
+        assertText(data); // almost text
+        Arrays.fill(data, 100, 200, (byte) 0x83);
+        assertNotText(data); // no longer almost text, too many non-ASCII
         Arrays.fill(data, (byte) 0x1f);
         assertNotText(data);
     }