You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/09/18 11:15:16 UTC

svn commit: r1172230 - in /tika/trunk/tika-core/src/main/java/org/apache/tika: detect/TextDetector.java mime/MimeTypes.java

Author: jukka
Date: Sun Sep 18 09:15:16 2011
New Revision: 1172230

URL: http://svn.apache.org/viewvc?rev=1172230&view=rev
Log:
TIKA-688: Enhance content-type detector to recognize almost plain text

Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java   (contents, props changed)

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java?rev=1172230&r1=1172229&r2=1172230&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java Sun Sep 18 09:15:16 2011
@@ -27,7 +27,9 @@ import org.apache.tika.mime.MediaType;
  * Content type detection of plain text documents. This detector looks at the
  * beginning of the document input stream and considers the document to be
  * a text document if no ASCII (ISO-Latin-1, UTF-8, etc.) control bytes are
- * found.
+ * found. As a special case, some control bytes (up to 2% of all characters)
+ * are also allowed in a text document, provided that it contains no or only
+ * a few (less than 10%) characters above the 7-bit ASCII range.
  * <p>
  * Note that text documents with a character encoding like UTF-16 are better
  * detected with {@link MagicDetector} and an appropriate magic byte pattern.
@@ -36,6 +38,9 @@ import org.apache.tika.mime.MediaType;
  */
 public class TextDetector implements Detector {
 
+    /** Serial version UID */
+    private static final long serialVersionUID = 4774601079503507765L;
+
     /**
      * The number of bytes from the beginning of the document stream
      * to test for control bytes.

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=1172230&r1=1172229&r2=1172230&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java Sun Sep 18 09:15:16 2011
@@ -36,6 +36,7 @@ import javax.xml.namespace.QName;
 
 import org.apache.tika.Tika;
 import org.apache.tika.detect.Detector;
+import org.apache.tika.detect.TextDetector;
 import org.apache.tika.detect.XmlRootExtractor;
 import org.apache.tika.metadata.Metadata;
 
@@ -75,41 +76,6 @@ public final class MimeTypes implements 
      */
     public static final String XML = "application/xml";
 
-
-    
-    /**
-     * Lookup table for all the ASCII/ISO-Latin/UTF-8/etc. control bytes
-     * in the range below 0x20 (the space character). If an entry in this
-     * table is <code>true</code> then that byte is very unlikely to occur
-     * in a plain text document.
-     * <p>
-     * The contents of this lookup table are based on the following definition
-     * from section 4 of the "Content-Type Processing Model" Internet-draft
-     * (<a href="http://webblaze.cs.berkeley.edu/2009/mime-sniff/mime-sniff.txt"
-     * >draft-abarth-mime-sniff-01</a>).
-     * <pre>
-     * +-------------------------+
-     * | Binary data byte ranges |
-     * +-------------------------+
-     * | 0x00 -- 0x08            |
-     * | 0x0B                    |
-     * | 0x0E -- 0x1A            |
-     * | 0x1C -- 0x1F            |
-     * +-------------------------+
-     * </pre>
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-154">TIKA-154</a>
-     */
-    private static final boolean[] IS_CONTROL_BYTE = new boolean[0x20];
-    static {
-        Arrays.fill(IS_CONTROL_BYTE, true);
-        IS_CONTROL_BYTE[0x09] = false; // tabulator
-        IS_CONTROL_BYTE[0x0A] = false; // new line
-        IS_CONTROL_BYTE[0x0C] = false; // new page
-        IS_CONTROL_BYTE[0x0D] = false; // carriage return
-        IS_CONTROL_BYTE[0x1B] = false; // escape
-    }
-
     /**
      * Root type, application/octet-stream.
      */
@@ -255,15 +221,14 @@ public final class MimeTypes implements 
             return result;
         }
 
-
         // Finally, assume plain text if no control bytes are found
-        for (int i = 0; i < data.length; i++) {
-            int b = data[i] & 0xFF; // prevent sign extension
-            if (b < IS_CONTROL_BYTE.length && IS_CONTROL_BYTE[b]) {
-                return rootMimeType;
-            }
+        try {
+            TextDetector detector = new TextDetector();
+            ByteArrayInputStream stream = new ByteArrayInputStream(data);
+            return forName(detector.detect(stream, new Metadata()).toString());
+        } catch (Exception e) {
+            return rootMimeType;
         }
-        return textMimeType;
     }
 
     /**

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
------------------------------------------------------------------------------
    svn:executable = *