You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/09/18 11:15:16 UTC
svn commit: r1172230 - in /tika/trunk/tika-core/src/main/java/org/apache/tika: detect/TextDetector.java mime/MimeTypes.java
Author: jukka
Date: Sun Sep 18 09:15:16 2011
New Revision: 1172230
URL: http://svn.apache.org/viewvc?rev=1172230&view=rev
Log:
TIKA-688: Enhance content-type detector to recognize almost plain text
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java (contents, props changed)
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java?rev=1172230&r1=1172229&r2=1172230&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java Sun Sep 18 09:15:16 2011
@@ -27,7 +27,9 @@ import org.apache.tika.mime.MediaType;
* Content type detection of plain text documents. This detector looks at the
* beginning of the document input stream and considers the document to be
* a text document if no ASCII (ISO-Latin-1, UTF-8, etc.) control bytes are
- * found.
+ * found. As a special case some control bytes (up to 2% of all characters)
+ * are also allowed in a text document if it also contains no or just a few
+ * (less than 10%) characters above the 7-bit ASCII range.
* <p>
* Note that text documents with a character encoding like UTF-16 are better
* detected with {@link MagicDetector} and an appropriate magic byte pattern.
@@ -36,6 +38,9 @@ import org.apache.tika.mime.MediaType;
*/
public class TextDetector implements Detector {
+ /** Serial version UID */
+ private static final long serialVersionUID = 4774601079503507765L;
+
/**
* The number of bytes from the beginning of the document stream
* to test for control bytes.
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=1172230&r1=1172229&r2=1172230&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java Sun Sep 18 09:15:16 2011
@@ -36,6 +36,7 @@ import javax.xml.namespace.QName;
import org.apache.tika.Tika;
import org.apache.tika.detect.Detector;
+import org.apache.tika.detect.TextDetector;
import org.apache.tika.detect.XmlRootExtractor;
import org.apache.tika.metadata.Metadata;
@@ -75,41 +76,6 @@ public final class MimeTypes implements
*/
public static final String XML = "application/xml";
-
-
- /**
- * Lookup table for all the ASCII/ISO-Latin/UTF-8/etc. control bytes
- * in the range below 0x20 (the space character). If an entry in this
- * table is <code>true</code> then that byte is very unlikely to occur
- * in a plain text document.
- * <p>
- * The contents of this lookup table are based on the following definition
- * from section 4 of the "Content-Type Processing Model" Internet-draft
- * (<a href="http://webblaze.cs.berkeley.edu/2009/mime-sniff/mime-sniff.txt"
- * >draft-abarth-mime-sniff-01</a>).
- * <pre>
- * +-------------------------+
- * | Binary data byte ranges |
- * +-------------------------+
- * | 0x00 -- 0x08 |
- * | 0x0B |
- * | 0x0E -- 0x1A |
- * | 0x1C -- 0x1F |
- * +-------------------------+
- * </pre>
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-154">TIKA-154</a>
- */
- private static final boolean[] IS_CONTROL_BYTE = new boolean[0x20];
- static {
- Arrays.fill(IS_CONTROL_BYTE, true);
- IS_CONTROL_BYTE[0x09] = false; // tabulator
- IS_CONTROL_BYTE[0x0A] = false; // new line
- IS_CONTROL_BYTE[0x0C] = false; // new page
- IS_CONTROL_BYTE[0x0D] = false; // carriage return
- IS_CONTROL_BYTE[0x1B] = false; // escape
- }
-
/**
* Root type, application/octet-stream.
*/
@@ -255,15 +221,14 @@ public final class MimeTypes implements
return result;
}
-
// Finally, assume plain text if no control bytes are found
- for (int i = 0; i < data.length; i++) {
- int b = data[i] & 0xFF; // prevent sign extension
- if (b < IS_CONTROL_BYTE.length && IS_CONTROL_BYTE[b]) {
- return rootMimeType;
- }
+ try {
+ TextDetector detector = new TextDetector();
+ ByteArrayInputStream stream = new ByteArrayInputStream(data);
+ return forName(detector.detect(stream, new Metadata()).toString());
+ } catch (Exception e) {
+ return rootMimeType;
}
- return textMimeType;
}
/**
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
------------------------------------------------------------------------------
svn:executable = *