You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/01/27 00:57:17 UTC
svn commit: r737942 - in /lucene/tika/trunk/src:
main/java/org/apache/tika/detect/TextDetector.java
test/java/org/apache/tika/detect/TextDetectorTest.java
Author: jukka
Date: Mon Jan 26 23:57:17 2009
New Revision: 737942
URL: http://svn.apache.org/viewvc?rev=737942&view=rev
Log:
TIKA-95: Pluggable magic header detectors
Added a TextDetector class for detecting plain text documents.
Added:
lucene/tika/trunk/src/main/java/org/apache/tika/detect/TextDetector.java
lucene/tika/trunk/src/test/java/org/apache/tika/detect/TextDetectorTest.java
Added: lucene/tika/trunk/src/main/java/org/apache/tika/detect/TextDetector.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/detect/TextDetector.java?rev=737942&view=auto
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/detect/TextDetector.java (added)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/detect/TextDetector.java Mon Jan 26 23:57:17 2009
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Content type detection of plain text documents. This detector looks at the
+ * beginning of the document input stream and considers the document to be
+ * a text document if no ASCII (ISO-Latin-1, UTF-8, etc.) control bytes are
+ * found.
+ * <p>
+ * Note that text documents with a character encoding like UTF-16 are better
+ * detected with {@link MagicDetector} and an appropriate magic byte pattern.
+ *
+ * @since Apache Tika 0.3
+ */
+public class TextDetector implements Detector {
+
+    /**
+     * The number of bytes from the beginning of the document stream
+     * to test for control bytes. Also used as the mark read limit, so
+     * the stream can always be rewound after the scan.
+     */
+    private static final int NUMBER_OF_BYTES_TO_TEST = 512;
+
+    /**
+     * Lookup table for all the ASCII/ISO-Latin/UTF-8/etc. control bytes
+     * in the range below 0x20 (the space character). If an entry in this
+     * table is <code>true</code> then that byte is very unlikely to occur
+     * in a plain text document.
+     * <p>
+     * The contents of this lookup table are based on the following definition
+     * from section 4 of the "Content-Type Processing Model" Internet-draft
+     * (<a href="http://webblaze.cs.berkeley.edu/2009/mime-sniff/mime-sniff.txt"
+     * >draft-abarth-mime-sniff-01</a>).
+     * <pre>
+     * +-------------------------+
+     * | Binary data byte ranges |
+     * +-------------------------+
+     * | 0x00 -- 0x08            |
+     * | 0x0B                    |
+     * | 0x0E -- 0x1A            |
+     * | 0x1C -- 0x1F            |
+     * +-------------------------+
+     * </pre>
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-154">TIKA-154</a>
+     */
+    private static final boolean[] IS_CONTROL_BYTE = new boolean[0x20];
+
+    static {
+        // Start by treating every byte below 0x20 as binary, then clear the
+        // entries that commonly appear in plain text documents.
+        Arrays.fill(IS_CONTROL_BYTE, true);
+        IS_CONTROL_BYTE[0x09] = false; // tabulator
+        IS_CONTROL_BYTE[0x0A] = false; // new line
+        IS_CONTROL_BYTE[0x0C] = false; // new page
+        IS_CONTROL_BYTE[0x0D] = false; // carriage return
+        IS_CONTROL_BYTE[0x1B] = false; // escape
+    }
+
+    /**
+     * Looks at the beginning of the document input stream to determine
+     * whether the document is text or not.
+     * <p>
+     * At most {@link #NUMBER_OF_BYTES_TO_TEST} bytes are examined. If the
+     * stream supports marks, it is marked before reading and reset before
+     * returning, so the caller sees the stream at its original position
+     * (any mark previously set by the caller is replaced). Streams without
+     * mark support are read as-is and are left positioned after the bytes
+     * that were examined. The stream is never closed.
+     *
+     * @param input document input stream, or <code>null</code> if the
+     *              document content is not available
+     * @param metadata ignored
+     * @return "text/plain" if the input stream suggests a text document
+     *         (including an empty stream), "application/octet-stream"
+     *         otherwise or if <code>input</code> is <code>null</code>
+     * @throws IOException if the stream can not be read or reset
+     */
+    public MediaType detect(InputStream input, Metadata metadata)
+            throws IOException {
+        // Without any content there is nothing that suggests text.
+        if (input == null) {
+            return MediaType.OCTET_STREAM;
+        }
+
+        // Remember the starting position so that the bytes consumed here
+        // remain available to whoever processes the stream next.
+        boolean resettable = input.markSupported();
+        if (resettable) {
+            input.mark(NUMBER_OF_BYTES_TO_TEST);
+        }
+        try {
+            for (int i = 0; i < NUMBER_OF_BYTES_TO_TEST; i++) {
+                int ch = input.read();
+                if (ch == -1) {
+                    // End of stream reached without seeing a control byte.
+                    return MediaType.TEXT_PLAIN;
+                } else if (ch < IS_CONTROL_BYTE.length && IS_CONTROL_BYTE[ch]) {
+                    return MediaType.OCTET_STREAM;
+                }
+            }
+            return MediaType.TEXT_PLAIN;
+        } finally {
+            if (resettable) {
+                input.reset();
+            }
+        }
+    }
+
+}
Added: lucene/tika/trunk/src/test/java/org/apache/tika/detect/TextDetectorTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/detect/TextDetectorTest.java?rev=737942&view=auto
==============================================================================
--- lucene/tika/trunk/src/test/java/org/apache/tika/detect/TextDetectorTest.java (added)
+++ lucene/tika/trunk/src/test/java/org/apache/tika/detect/TextDetectorTest.java Mon Jan 26 23:57:17 2009
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.util.Arrays;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Test cases for the {@link TextDetector} class.
+ */
+public class TextDetectorTest extends TestCase {
+
+ private final Detector detector = new TextDetector();
+
+ public void testDetectText() throws Exception {
+ assertText(new byte[0]);
+
+ assertText("Hello, World!".getBytes("UTF-8"));
+ assertText(" \t\r\n".getBytes("UTF-8"));
+ assertText(new byte[] { -1, -2, -3, 0x09, 0x0A, 0x0C, 0x0D, 0x1B });
+ assertNotText(new byte[] { 0 });
+ assertNotText(new byte[] { 'H', 'e', 'l', 'l', 'o', 0 });
+
+ byte[] data = new byte[512];
+ Arrays.fill(data, (byte) '.');
+ assertText(data);
+ Arrays.fill(data, (byte) 0x1f);
+ assertNotText(data);
+
+ data = new byte[513];
+ Arrays.fill(data, (byte) '.');
+ assertText(data);
+ Arrays.fill(data, (byte) 0x1f);
+ assertNotText(data);
+ }
+
+ private void assertText(byte[] data) {
+ try {
+ assertEquals(
+ MediaType.TEXT_PLAIN,
+ detector.detect(
+ new ByteArrayInputStream(data), new Metadata()));
+ } catch (IOException e) {
+ fail("Unexpected exception from TextDetector");
+ }
+ }
+
+ private void assertNotText(byte[] data) {
+ try {
+ assertEquals(
+ MediaType.OCTET_STREAM,
+ detector.detect(
+ new ByteArrayInputStream(data), new Metadata()));
+ } catch (IOException e) {
+ fail("Unexpected exception from TextDetector");
+ }
+ }
+
+}