You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/01/27 00:57:17 UTC
svn commit: r737942 - in /lucene/tika/trunk/src: main/java/org/apache/tika/detect/TextDetector.java test/java/org/apache/tika/detect/TextDetectorTest.java

Author: jukka
Date: Mon Jan 26 23:57:17 2009
New Revision: 737942

URL: http://svn.apache.org/viewvc?rev=737942&view=rev
Log:
TIKA-95: Pluggable magic header detectors

Added a TextDetector class for detecting plain text documents.

Added:
    lucene/tika/trunk/src/main/java/org/apache/tika/detect/TextDetector.java
    lucene/tika/trunk/src/test/java/org/apache/tika/detect/TextDetectorTest.java

Added: lucene/tika/trunk/src/main/java/org/apache/tika/detect/TextDetector.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/detect/TextDetector.java?rev=737942&view=auto
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/detect/TextDetector.java (added)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/detect/TextDetector.java Mon Jan 26 23:57:17 2009
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Content type detection of plain text documents. This detector looks at the
+ * beginning of the document input stream and considers the document to be
+ * a text document if no ASCII (ISO-Latin-1, UTF-8, etc.) control bytes are
+ * found.
+ * <p>
+ * Note that text documents with a character encoding like UTF-16 are better
+ * detected with {@link MagicDetector} and an appropriate magic byte pattern.
+ *
+ * @since Apache Tika 0.3
+ */
+public class TextDetector implements Detector {
+
+    /**
+     * The number of bytes from the beginning of the document stream
+     * to test for control bytes.
+     */
+    private static final int NUMBER_OF_BYTES_TO_TEST = 512;
+
+    /**
+     * Lookup table for all the ASCII/ISO-Latin/UTF-8/etc. control bytes
+     * in the range below 0x20 (the space character). If an entry in this
+     * table is <code>true</code> then that byte is very unlikely to occur
+     * in a plain text document.
+     * <p>
+     * The contents of this lookup table are based on the following definition
+     * from section 4 of the "Content-Type Processing Model" Internet-draft
+     * (<a href="http://webblaze.cs.berkeley.edu/2009/mime-sniff/mime-sniff.txt"
+     * >draft-abarth-mime-sniff-01</a>).
+     * <pre>
+     * +-------------------------+
+     * | Binary data byte ranges |
+     * +-------------------------+
+     * | 0x00 -- 0x08            |
+     * | 0x0B                    |
+     * | 0x0E -- 0x1A            |
+     * | 0x1C -- 0x1F            |
+     * +-------------------------+
+     * </pre>
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-154">TIKA-154</a>
+     */
+    private static final boolean[] IS_CONTROL_BYTE = new boolean[0x20];
+
+    static {
+        // Start with every byte below 0x20 flagged as binary, then clear
+        // the few that do appear in ordinary text (see the table above).
+        Arrays.fill(IS_CONTROL_BYTE, true);
+        IS_CONTROL_BYTE[0x09] = false; // tabulator
+        IS_CONTROL_BYTE[0x0A] = false; // new line
+        IS_CONTROL_BYTE[0x0C] = false; // new page
+        IS_CONTROL_BYTE[0x0D] = false; // carriage return
+        IS_CONTROL_BYTE[0x1B] = false; // escape
+    }
+
+    /**
+     * Looks at the beginning of the document input stream to determine
+     * whether the document is text or not. At most
+     * {@link #NUMBER_OF_BYTES_TO_TEST} bytes are examined; an empty stream
+     * or one that ends before a control byte is seen is treated as text.
+     * <p>
+     * If the stream supports the mark feature, it is marked before any
+     * bytes are read and reset before this method returns, so that the
+     * caller (or the next detector) sees the stream at its original
+     * position. A stream without mark support is simply read from, as
+     * there is no way to restore its position. The stream is never closed.
+     *
+     * @param input document input stream, or <code>null</code> if the
+     *              document content is not available
+     * @param metadata ignored
+     * @return "text/plain" if the input stream suggests a text document,
+     *         "application/octet-stream" otherwise (including when
+     *         <code>input</code> is <code>null</code>)
+     * @throws IOException if the stream can not be read or reset
+     */
+    public MediaType detect(InputStream input, Metadata metadata)
+            throws IOException {
+        // Without content there is nothing to suggest text; fall back to
+        // the generic type instead of failing with a NullPointerException.
+        if (input == null) {
+            return MediaType.OCTET_STREAM;
+        }
+
+        // Only mark/reset when supported: InputStream.reset() throws on
+        // streams without mark support.
+        boolean resettable = input.markSupported();
+        if (resettable) {
+            input.mark(NUMBER_OF_BYTES_TO_TEST);
+        }
+        try {
+            for (int i = 0; i < NUMBER_OF_BYTES_TO_TEST; i++) {
+                int ch = input.read();
+                if (ch == -1) {
+                    // End of stream reached without seeing a control byte
+                    return MediaType.TEXT_PLAIN;
+                } else if (ch < IS_CONTROL_BYTE.length && IS_CONTROL_BYTE[ch]) {
+                    return MediaType.OCTET_STREAM;
+                }
+            }
+            return MediaType.TEXT_PLAIN;
+        } finally {
+            // Restore the stream position on every exit path
+            if (resettable) {
+                input.reset();
+            }
+        }
+    }
+
+}

Added: lucene/tika/trunk/src/test/java/org/apache/tika/detect/TextDetectorTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/detect/TextDetectorTest.java?rev=737942&view=auto
==============================================================================
--- lucene/tika/trunk/src/test/java/org/apache/tika/detect/TextDetectorTest.java (added)
+++ lucene/tika/trunk/src/test/java/org/apache/tika/detect/TextDetectorTest.java Mon Jan 26 23:57:17 2009
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.util.Arrays;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Test cases for the {@link TextDetector} class.
+ */
+public class TextDetectorTest extends TestCase {
+
+    private final Detector detector = new TextDetector();
+
+    public void testDetectText() throws Exception {
+        assertText(new byte[0]);
+
+        assertText("Hello, World!".getBytes("UTF-8"));
+        assertText(" \t\r\n".getBytes("UTF-8"));
+        assertText(new byte[] { -1, -2, -3, 0x09, 0x0A, 0x0C, 0x0D, 0x1B });
+        assertNotText(new byte[] { 0 });
+        assertNotText(new byte[] { 'H', 'e', 'l', 'l', 'o', 0 });
+
+        byte[] data = new byte[512];
+        Arrays.fill(data, (byte) '.');
+        assertText(data);
+        Arrays.fill(data, (byte) 0x1f);
+        assertNotText(data);
+
+        data = new byte[513];
+        Arrays.fill(data, (byte) '.');
+        assertText(data);
+        Arrays.fill(data, (byte) 0x1f);
+        assertNotText(data);
+    }
+
+    private void assertText(byte[] data) {
+        try {
+            assertEquals(
+                    MediaType.TEXT_PLAIN,
+                    detector.detect(
+                            new ByteArrayInputStream(data), new Metadata()));
+        } catch (IOException e) {
+            fail("Unexpected exception from TextDetector");
+        }
+    }
+
+    private void assertNotText(byte[] data) {
+        try {
+            assertEquals(
+                    MediaType.OCTET_STREAM,
+                    detector.detect(
+                            new ByteArrayInputStream(data), new Metadata()));
+        } catch (IOException e) {
+            fail("Unexpected exception from TextDetector");
+        }
+    }
+
+}