You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/09/17 13:48:52 UTC
svn commit: r1171952 - in /tika/trunk/tika-core/src:
main/java/org/apache/tika/detect/TextDetector.java
test/java/org/apache/tika/detect/TextDetectorTest.java
Author: jukka
Date: Sat Sep 17 11:48:51 2011
New Revision: 1171952
URL: http://svn.apache.org/viewvc?rev=1171952&view=rev
Log:
TIKA-688: Enhance content-type detector to recognize almost plain text
Treat content as plain text when fewer than 2% of the tested bytes are control chars and more than 90% are in the ASCII range
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
tika/trunk/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java?rev=1171952&r1=1171951&r2=1171952&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java Sat Sep 17 11:48:51 2011
@@ -93,20 +93,34 @@ public class TextDetector implements Det
input.mark(NUMBER_OF_BYTES_TO_TEST);
try {
- for (int i = 0; i < NUMBER_OF_BYTES_TO_TEST; i++) {
- int ch = input.read();
- if (ch == -1) {
- if (i > 0) {
- return MediaType.TEXT_PLAIN;
- } else {
- // See https://issues.apache.org/jira/browse/TIKA-483
- return MediaType.OCTET_STREAM;
- }
- } else if (ch < IS_CONTROL_BYTE.length && IS_CONTROL_BYTE[ch]) {
- return MediaType.OCTET_STREAM;
+ int chars = 0;
+ int controls = 0;
+ int asciis = 0;
+ int ch = input.read();
+ while (ch != -1 && chars < NUMBER_OF_BYTES_TO_TEST) {
+ if (ch < IS_CONTROL_BYTE.length && IS_CONTROL_BYTE[ch]) {
+ controls++;
+ } else if (ch < 127) {
+ asciis++;
}
+ ch = input.read();
+ chars++;
+ }
+ if (chars == 0) {
+ // Empty document, so treat it as binary
+ // See https://issues.apache.org/jira/browse/TIKA-483
+ return MediaType.OCTET_STREAM;
+ } else if (controls == 0) {
+ // No control characters, so treat it as text
+ return MediaType.TEXT_PLAIN;
+ } else if (controls < chars * 2 / 100
+ && asciis > chars * 90 / 100) {
+ // Almost plain text (< 2% control, > 90% ASCII range)
+ // See https://issues.apache.org/jira/browse/TIKA-688
+ return MediaType.TEXT_PLAIN;
+ } else {
+ return MediaType.OCTET_STREAM;
}
- return MediaType.TEXT_PLAIN;
} finally {
input.reset();
}
Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java?rev=1171952&r1=1171951&r2=1171952&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java Sat Sep 17 11:48:51 2011
@@ -58,12 +58,21 @@ public class TextDetectorTest extends Te
byte[] data = new byte[512];
Arrays.fill(data, (byte) '.');
assertText(data);
+ Arrays.fill(data, 100, 109, (byte) 0x1f);
+ assertText(data); // almost text
+ Arrays.fill(data, 100, 110, (byte) 0x1f);
+ assertNotText(data); // no longer almost text, too many control chars
Arrays.fill(data, (byte) 0x1f);
assertNotText(data);
data = new byte[513];
Arrays.fill(data, (byte) '.');
+ data[0] = 0x1f;
assertText(data);
+ Arrays.fill(data, 100, 150, (byte) 0x83);
+ assertText(data); // almost text
+ Arrays.fill(data, 100, 200, (byte) 0x83);
+ assertNotText(data); // no longer almost text, too many non-ASCII
Arrays.fill(data, (byte) 0x1f);
assertNotText(data);
}