You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2007/09/21 17:07:59 UTC
svn commit: r578161 [1/2] - in /incubator/tika/trunk: ./
src/main/java/org/apache/tika/metadata/ src/main/java/org/apache/tika/mime/
src/main/java/org/apache/tika/utils/ src/main/resources/mime/
src/test/java/org/apache/tika/mime/
Author: mattmann
Date: Fri Sep 21 08:07:58 2007
New Revision: 578161
URL: http://svn.apache.org/viewvc?rev=578161&view=rev
Log:
- patch for TIKA-6
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/mime/
incubator/tika/trunk/src/main/java/org/apache/tika/mime/Clause.java
incubator/tika/trunk/src/main/java/org/apache/tika/mime/HexCoDec.java
incubator/tika/trunk/src/main/java/org/apache/tika/mime/Magic.java
incubator/tika/trunk/src/main/java/org/apache/tika/mime/MagicClause.java
incubator/tika/trunk/src/main/java/org/apache/tika/mime/MagicMatch.java
incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java
incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypeException.java
incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java
incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java
incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeUtils.java
incubator/tika/trunk/src/main/java/org/apache/tika/mime/Operator.java
incubator/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java
incubator/tika/trunk/src/main/java/org/apache/tika/utils/Configurable.java
incubator/tika/trunk/src/main/java/org/apache/tika/utils/Configuration.java
incubator/tika/trunk/src/main/java/org/apache/tika/utils/StringUtil.java
incubator/tika/trunk/src/main/resources/mime/
incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
incubator/tika/trunk/src/test/java/org/apache/tika/mime/
incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeUtils.java
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/pom.xml
incubator/tika/trunk/src/main/java/org/apache/tika/metadata/Metadata.java
Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=578161&r1=578160&r2=578161&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Fri Sep 21 08:07:58 2007
@@ -28,3 +28,6 @@
12. TIKA-18 - "Office" interface should be renamed "MSOffice" (mattmann)
13. TIKA-23 - Decouple Parser from ParserConfig (jukka)
+
+14. TIKA-6 - Port Nutch (or better) MimeType detection system into Tika (J. Charron & mattmann)
+
Modified: incubator/tika/trunk/pom.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/pom.xml?rev=578161&r1=578160&r2=578161&view=diff
==============================================================================
--- incubator/tika/trunk/pom.xml (original)
+++ incubator/tika/trunk/pom.xml Fri Sep 21 08:07:58 2007
@@ -149,6 +149,16 @@
<optional/>
</dependency>
<dependency>
+ <groupId>commons-logging</groupId>
+ <artifactId>commons-logging</artifactId>
+ <version>1.0.4</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ <version>1.3</version>
+ </dependency>
+ <dependency>
<groupId>pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>0.7.3</version>
@@ -217,6 +227,11 @@
<include name="README.txt"/>
<include name="NOTICE.txt"/>
<include name="LICENSE.txt"/>
+ </fileset>
+ </copy>
+ <copy todir="${project.build.outputDirectory}/org/apache/tika/mime">
+ <fileset dir="${basedir}/src/main/resources/mime">
+ <include name="*"/>
</fileset>
</copy>
</tasks>
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/metadata/Metadata.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/metadata/Metadata.java?rev=578161&r1=578160&r2=578161&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/metadata/Metadata.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/metadata/Metadata.java Fri Sep 21 08:07:58 2007
@@ -30,7 +30,7 @@
*
*/
public class Metadata implements CreativeCommons, DublinCore, HttpHeaders,
- MSOffice {
+ MSOffice, TikaMimeKeys {
/**
* A map of all metadata attributes.
Added: incubator/tika/trunk/src/main/java/org/apache/tika/mime/Clause.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/Clause.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/Clause.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/Clause.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,70 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+/**
+ * Defines a clause to be evaluated.
+ *
+ * @author Jerome Charron
+ */
+interface Clause {
+
+ /** A clause that is always true. */
+ final static Clause TRUE = new True();
+
+ /** A clause that is always false. */
+ final static Clause FALSE = new False();
+
+ /**
+ * Evaluates this clause with the specified chunk of data.
+ */
+ public boolean eval(byte[] data);
+
+ /**
+ * Returns the size of this clause. The size of a clause is the number of
+ * chars it is composed of.
+ */
+ public int size();
+
+ final static class False implements Clause {
+ public boolean eval(byte[] data) {
+ return false;
+ }
+
+ public int size() {
+ return 0;
+ }
+
+ public String toString() {
+ return "FALSE";
+ }
+ }
+
+ final static class True implements Clause {
+ public boolean eval(byte[] data) {
+ return true;
+ }
+
+ public int size() {
+ return 0;
+ }
+
+ public String toString() {
+ return "TRUE";
+ }
+ }
+
+}
Added: incubator/tika/trunk/src/main/java/org/apache/tika/mime/HexCoDec.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/HexCoDec.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/HexCoDec.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/HexCoDec.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,117 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+/**
+ *
+ * @author Jérôme Charron
+ */
+public class HexCoDec {
+
+ private final static char[] HEX_CHARS = { '0', '1', '2', '3', '4', '5',
+ '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
+
+ /**
+ * Decode a hex string
+ *
+ * @param hexValue
+ * the string of hex characters
+ * @return the decode hex string as bytes.
+ */
+ public static byte[] decode(String hexValue) {
+ return decode(hexValue.toCharArray());
+ }
+
+ /**
+ * Decode an array of hex chars
+ *
+ * @param hexChars
+ * an array of hex characters.
+ * @return the decode hex chars as bytes.
+ */
+ public static byte[] decode(char[] hexChars) {
+ return decode(hexChars, 0, hexChars.length);
+ }
+
+ /**
+ * Decode an array of hex chars.
+ *
+ * @param hexChars
+ * an array of hex characters.
+ * @param startIndex
+ * the index of the first character to decode
+ * @param length
+ * the number of characters to decode.
+ * @return the decode hex chars as bytes.
+ */
+ public static byte[] decode(char[] hexChars, int startIndex, int length) {
+ if ((length & 1) != 0)
+ throw new IllegalArgumentException("Length must be even");
+
+ byte[] result = new byte[length / 2];
+ for (int j = 0; j < result.length; j++) {
+ result[j] = (byte) (hexCharToNibble(hexChars[startIndex++]) * 16 + hexCharToNibble(hexChars[startIndex++]));
+ }
+ return result;
+ }
+
+ /**
+ * Hex encode an array of bytes
+ *
+ * @param bites
+ * the array of bytes to encode.
+ * @return the array of hex characters.
+ */
+ public static char[] encode(byte[] bites) {
+ return encode(bites, 0, bites.length);
+ }
+
+ /**
+ * Hex encode an array of bytes
+ *
+ * @param bites
+ * the array of bytes to encode.
+ * @param startIndex
+ * the index of the first character to encode.
+ * @param length
+ * the number of characters to encode.
+ * @return the array of hex characters.
+ */
+ public static char[] encode(byte[] bites, int startIndex, int length) {
+ char[] result = new char[length * 2];
+ for (int i = 0, j = 0; i < length; i++) {
+ int bite = bites[startIndex++] & 0xff;
+ result[j++] = HEX_CHARS[bite >> 4];
+ result[j++] = HEX_CHARS[bite & 0xf];
+ }
+ return result;
+ }
+
+ /**
+ * Internal method to turn a hex char into a nibble.
+ */
+ private static int hexCharToNibble(char ch) {
+ if ((ch >= '0') && (ch <= '9'))
+ return ch - '0';
+ else if ((ch >= 'a') && (ch <= 'f'))
+ return ch - 'a' + 10;
+ else if ((ch >= 'A') && (ch <= 'F'))
+ return ch - 'A' + 10;
+ else
+ throw new IllegalArgumentException("Not a hex char - '" + ch + "'");
+ }
+
+}
Added: incubator/tika/trunk/src/main/java/org/apache/tika/mime/Magic.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/Magic.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/Magic.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/Magic.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,69 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+/**
+ * Defines a magic for a MimeType. A magic is made of one or several
+ * MagicClause.
+ *
+ * @author Jérôme Charron
+ */
+class Magic implements Clause {
+
+ private MimeType type = null;
+
+ private int priority = 50;
+
+ private Clause clause = null;
+
+ Magic() {
+ this(50);
+ }
+
+ Magic(int priority) {
+ this.priority = priority;
+ }
+
+ void setType(MimeType type) {
+ this.type = type;
+ }
+
+ MimeType getType() {
+ return type;
+ }
+
+ int getPriority() {
+ return priority;
+ }
+
+ void setClause(Clause clause) {
+ this.clause = clause;
+ }
+
+ public boolean eval(byte[] data) {
+ return clause.eval(data);
+ }
+
+ public int size() {
+ return clause.size();
+ }
+
+ public String toString() {
+ StringBuffer buf = new StringBuffer();
+ buf.append("[").append(priority).append("/").append(clause).append("]");
+ return buf.toString();
+ }
+}
Added: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MagicClause.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MagicClause.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MagicClause.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MagicClause.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,52 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+/**
+ * Defines a MagicClause.
+ *
+ * @author Jérôme Charron
+ */
+class MagicClause implements Clause {
+
+ private Operator op = null;
+
+ private Clause c1 = null;
+
+ private Clause c2 = null;
+
+ private int size = 0;
+
+ MagicClause(Operator op, Clause c1, Clause c2) {
+ this.op = op;
+ this.c1 = c1;
+ this.c2 = c2;
+ this.size = c1.size() + c2.size();
+ }
+
+ public boolean eval(byte[] data) {
+ return op.eval(c1.eval(data), c2.eval(data));
+ }
+
+ public int size() {
+ return size;
+ }
+
+ public String toString() {
+ return new StringBuffer().append("(").append(c1).append(" ").append(op)
+ .append(" ").append(c2).append(")").toString();
+ }
+}
Added: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MagicMatch.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MagicMatch.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MagicMatch.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MagicMatch.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,216 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+// JDK imports
+import java.io.ByteArrayOutputStream;
+import java.math.BigInteger;
+
+// Jakarta Commons Codec imports
+import org.apache.commons.codec.DecoderException;
+import org.apache.commons.codec.binary.Hex;
+
+/**
+ * Defines a magic match.
+ *
+ * @author Jérôme Charron
+ */
+class MagicMatch implements Clause {
+
+ private final static Hex HEX_CODEC = new Hex();
+
+ private int offsetStart;
+
+ private int offsetEnd;
+
+ private String type;
+
+ private BigInteger mask;
+
+ private BigInteger value;
+
+ private int length;
+
+ /** Creates a match; any failure to decode value or mask is rethrown as MimeTypeException. */
+ MagicMatch(int offsetStart, int offsetEnd, String type, String mask,
+         String value) throws MimeTypeException {
+     this.offsetStart = offsetStart;
+     this.offsetEnd = offsetEnd;
+     this.type = type;
+     try {
+         byte[] decoded = decodeValue(type, value);
+         this.length = decoded.length;
+         this.value = new BigInteger(decoded);
+         if (mask != null) {
+             this.mask = new BigInteger(decodeValue(type, mask));
+             this.value = this.value.and(this.mask);
+         }
+     } catch (Exception e) {
+         // Propagate with the cause attached instead of also dumping to stderr.
+         throw new MimeTypeException(e);
+     }
+ }
+
+ /**
+  * Decodes a magic value (or mask) into the bytes to compare with the data.
+  * For the numeric types a "0x" prefix selects hexadecimal and anything else
+  * is parsed as octal; the bytes follow the order named by the type.
+  *
+  * @return the decoded bytes, or null if type or value is null or the
+  *         type is not one of the names handled below
+  */
+ private byte[] decodeValue(String type, String value) throws DecoderException {
+     // Nothing to decode.
+     if ((value == null) || (type == null)) {
+         return null;
+     }
+
+     byte[] decoded = null;
+     boolean hex = value.startsWith("0x");
+     String tmpVal = hex ? value.substring(2) : value;
+     int radix = hex ? 16 : 8;
+
+     if (type.equals("string")) {
+         decoded = decodeString(value);
+
+     } else if (type.equals("byte")) {
+         decoded = tmpVal.getBytes();
+
+     } else if (type.equals("host16") || type.equals("little16")) {
+         // Least significant byte first, consistent with the
+         // host32/little32 branch below.
+         int i = Integer.parseInt(tmpVal, radix);
+         decoded = new byte[] { (byte) (i & 0x00FF), (byte) (i >> 8) };
+
+     } else if (type.equals("big16")) {
+         int i = Integer.parseInt(tmpVal, radix);
+         decoded = new byte[] { (byte) (i >> 8), (byte) (i & 0x00FF) };
+
+     } else if (type.equals("host32") || type.equals("little32")) {
+         long i = Long.parseLong(tmpVal, radix);
+         decoded = new byte[] { (byte) ((i & 0x000000FF)),
+                 (byte) ((i & 0x0000FF00) >> 8),
+                 (byte) ((i & 0x00FF0000) >> 16),
+                 (byte) ((i & 0xFF000000) >> 24) };
+
+     } else if (type.equals("big32")) {
+         long i = Long.parseLong(tmpVal, radix);
+         decoded = new byte[] { (byte) ((i & 0xFF000000) >> 24),
+                 (byte) ((i & 0x00FF0000) >> 16),
+                 (byte) ((i & 0x0000FF00) >> 8), (byte) ((i & 0x000000FF)) };
+     }
+     return decoded;
+ }
+
+ private byte[] decodeString(String value) throws DecoderException {
+
+ if (value.startsWith("0x")) {
+ return HEX_CODEC.decode(value.substring(2).getBytes());
+ }
+
+ try {
+ ByteArrayOutputStream decoded = new ByteArrayOutputStream();
+
+ for (int i = 0; i < value.length(); i++) {
+ if (value.charAt(i) == '\\') {
+ if (value.charAt(i + 1) == '\\') {
+ decoded.write('\\');
+ i++;
+ } else if (value.charAt(i + 1) == 'x') {
+ decoded.write(HEX_CODEC.decode(value.substring(i + 2,
+ i + 4).getBytes()));
+ i += 3;
+ } else {
+ int j = i + 1;
+ while ((j < i + 4) && (j < value.length())
+ && (Character.isDigit(value.charAt(j)))) {
+ j++;
+ }
+ decoded.write(Short.decode(
+ "0" + value.substring(i + 1, j)).byteValue());
+ i = j - 1;
+ }
+ } else {
+ decoded.write(value.charAt(i));
+ }
+ }
+ return decoded.toByteArray();
+ } catch (Exception e) {
+ throw new DecoderException(e.toString() + " for " + value);
+ }
+ }
+
+ public boolean eval(byte[] data) {
+
+ boolean ok = false;
+ for (int i = offsetStart; i <= offsetEnd; i++) {
+ if (data.length < (this.length + i)) {
+ // Not enough data...
+ return false;
+ }
+ byte[] array = new byte[this.length];
+ System.arraycopy(data, i, array, 0, this.length);
+ BigInteger content = new BigInteger(array);
+ // System.out.println("Evaluating " + content);
+ if (mask != null) {
+ content = content.and(mask);
+ }
+ if (value.equals(content)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ public int size() {
+ return length;
+ }
+
+ public String toString() {
+ return new StringBuffer().append("[").append(offsetStart).append(":")
+ .append(offsetEnd).append("(").append(type).append(")").append(
+ "-").append(mask).append("#").append(value).append("]")
+ .toString();
+ }
+
+ private final static boolean equals(byte[] b1, byte[] b2) {
+ if ((b1 != null) && (b2 != null)) {
+ if (b1.length != b2.length) {
+ return false;
+ }
+ for (int i = 0; i < b1.length; i++) {
+ if (b1[i] != b2[i]) {
+ return false;
+ }
+ }
+ return true;
+ }
+ if ((b1 == null) && (b2 == null)) {
+ return true;
+ }
+ return false;
+ }
+
+ private final static String toHexString(byte[] bytes) {
+ StringBuffer buf = new StringBuffer();
+ for (int i = 0; i < bytes.length; i++) {
+ String str = Integer.toHexString(bytes[i]);
+ buf.append((str.length() > 2) ? str.substring(str.length() - 2)
+ : str);
+ }
+ return buf.toString();
+ }
+}
Added: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,470 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+// JDK imports
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.regex.Pattern;
+import org.apache.tika.utils.StringUtil;
+
+/**
+ * Defines a Mime Content Type.
+ *
+ * @author Jérôme Charron
+ * @author Hari Kodungallur
+ */
+public final class MimeType {
+
+ /** The primary and sub types separator */
+ private final static String SEPARATOR = "/";
+
+ /** The parameters separator */
+ private final static String PARAMS_SEP = ";";
+
+ /** Special characters not allowed in content types. */
+ private final static String SPECIALS = "()<>@,;:\\\"/[]?=";
+
+ /** The Mime-Type full name */
+ private String name = null;
+
+ /** The Mime-Type primary type */
+ private String primary = null;
+
+ /** The Mime-Type sub type */
+ private String sub = null;
+
+ /** The Mime-Type description */
+ private String description = null;
+
+ /** The Mime-Type associated recognition patterns */
+ private Patterns patterns = null;
+
+ /** The magics associated to this Mime-Type */
+ private ArrayList magics = null;
+
+ /** The aliases Mime-Types for this one */
+ private ArrayList aliases = null;
+
+ /** The root-XML associated to this Mime-Type */
+ private ArrayList rootXML = null;
+
+ /** The sub-class-of associated to this Mime-Type */
+ private ArrayList superTypes = null;
+
+ /** The mime-type level (regarding its subTypes) */
+ private int level = 0;
+
+ /** The minimum length of data to provide for magic analysis */
+ private int minLength = 0;
+
+ /**
+ * Creates a MimeType from a String.
+ *
+ * @param name
+ * the MIME content type String.
+ */
+ public MimeType(String name) throws MimeTypeException {
+
+ if (name == null || name.length() <= 0) {
+ throw new MimeTypeException("The type can not be null or empty");
+ }
+
+ // Split the two parts of the Mime Content Type
+ String[] parts = name.split(SEPARATOR, 2);
+
+ // Checks validity of the parts
+ if (parts.length != 2) {
+ throw new MimeTypeException("Invalid Content Type " + name);
+ }
+ init(parts[0], parts[1]);
+ }
+
+ /**
+ * Creates a MimeType with the given primary type and sub type.
+ *
+ * @param primary
+ * the content type primary type.
+ * @param sub
+ * the content type sub type.
+ */
+ public MimeType(String primary, String sub) throws MimeTypeException {
+ init(primary, sub);
+ }
+
+ /** Init method used by constructors. */
+ private void init(String primary, String sub) throws MimeTypeException {
+
+ // Preliminary checks...
+ if ((primary == null) || (primary.length() <= 0) || (!isValid(primary))) {
+ throw new MimeTypeException("Invalid Primary Type " + primary);
+ }
+ // Remove optional parameters from the sub type
+ String clearedSub = null;
+ if (sub != null) {
+ clearedSub = sub.split(PARAMS_SEP)[0];
+ }
+ if ((clearedSub == null) || (clearedSub.length() <= 0)
+ || (!isValid(clearedSub))) {
+ throw new MimeTypeException("Invalid Sub Type " + clearedSub);
+ }
+
+ // All is ok, assign values
+ this.primary = primary.toLowerCase().trim();
+ this.sub = clearedSub.toLowerCase().trim();
+ this.name = this.primary + SEPARATOR + this.sub;
+ this.patterns = new Patterns();
+ this.magics = new ArrayList();
+ this.aliases = new ArrayList();
+ this.rootXML = new ArrayList();
+ this.superTypes = new ArrayList();
+ }
+
+ /**
+ * Cleans a content-type. This method cleans a content-type by removing its
+ * optional parameters and returning only its
+ * <code>primary-type/sub-type</code>.
+ *
+ * @param type
+ * is the content-type to clean.
+ * @return the cleaned version of the specified content-type.
+ * @throws MimeTypeException
+ * if something wrong occurs during the parsing/cleaning of the
+ * specified type.
+ */
+ public final static String clean(String type) throws MimeTypeException {
+ return (new MimeType(type)).getName();
+ }
+
+ /**
+ * Return the name of this mime-type.
+ *
+ * @return the name of this mime-type.
+ */
+ public String getName() {
+ return name;
+ }
+
+ /**
+ * Return the primary type of this mime-type.
+ *
+ * @return the primary type of this mime-type.
+ */
+ public String getPrimaryType() {
+ return primary;
+ }
+
+ /**
+ * Return the sub type of this mime-type.
+ *
+ * @return the sub type of this mime-type.
+ */
+ public String getSubType() {
+ return sub;
+ }
+
+ // Inherited Javadoc
+ public String toString() {
+ StringBuffer buf = new StringBuffer();
+ buf.append(name).append(" -- ").append(getDescription()).append("\n")
+ .append("Aliases: ");
+ if (aliases.size() < 1) {
+ buf.append(" NONE");
+ }
+ buf.append("\n");
+ for (int i = 0; i < aliases.size(); i++) {
+ buf.append("\t").append((String) aliases.get(i)).append("\n");
+ }
+ buf.append("Patterns:");
+ String[] patterns = this.patterns.getPatterns();
+ if (patterns.length < 1) {
+ buf.append(" NONE");
+ }
+ buf.append("\n");
+ for (int i = 0; i < patterns.length; i++) {
+ buf.append("\t").append(patterns[i]).append("\n");
+ }
+ buf.append("Magics: ");
+ if (magics.size() < 1) {
+ buf.append(" NONE");
+ }
+ buf.append("\n");
+ for (int i = 0; i < magics.size(); i++) {
+ buf.append("\t").append((Magic) magics.get(i)).append("\n");
+ }
+
+ return buf.toString();
+ }
+
+ /**
+ * Indicates if an object is equal to this mime-type. The specified object
+ * is equal to this mime-type if it is not null, and it is an instance of
+ * MimeType and its name is equals to this mime-type.
+ *
+ * @param object
+ * the reference object with which to compare.
+ * @return <code>true</code> if this mime-type is equal to the object
+ * argument; <code>false</code> otherwise.
+ */
+ public boolean equals(Object object) {
+     if (!(object instanceof MimeType)) {
+         // Covers null as well as objects of any other class.
+         return false;
+     }
+     return ((MimeType) object).getName().equals(this.name);
+ }
+
+ // Inherited Javadoc
+ public int hashCode() {
+ return name.hashCode();
+ }
+
+ /**
+ * Return the description of this mime-type.
+ *
+ * @return the description of this mime-type.
+ */
+ public String getDescription() {
+ return description;
+ }
+
+ /**
+ * Set the description of this mime-type.
+ *
+ * @param description
+ * the description of this mime-type.
+ */
+ void setDescription(String description) {
+ this.description = description;
+ }
+
+ /**
+ * Add a supported file-naming pattern.
+ *
+ * @param pattern
+ * to add to the list of recognition pattern for this mime-type.
+ */
+ void addPattern(String pattern) {
+ patterns.add(pattern, this);
+ }
+
+ /**
+ * Return the recognition patterns for this mime-type
+ *
+ * @return the recognition patterns associated to this mime-type.
+ */
+ String[] getPatterns() {
+ return patterns.getPatterns();
+ }
+
+ /**
+ * Add an alias to this mime-type
+ *
+ * @param alias
+ * to add to this mime-type.
+ */
+ void addAlias(String alias) {
+ aliases.add(alias);
+ }
+
+ /**
+ * Add some rootXML info to this mime-type
+ *
+ * @param namespaceURI
+ * @param localName
+ */
+ void addRootXML(String namespaceURI, String localName) {
+ rootXML.add(new RootXML(this, namespaceURI, localName));
+ }
+
+ boolean matchesXML(byte[] data) {
+ RootXML xml = null;
+ String content = new String(data);
+ for (int i = 0; i < rootXML.size(); i++) {
+ xml = (RootXML) rootXML.get(i);
+ if (xml.matches(content)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ boolean hasRootXML() {
+ return (rootXML.size() > 0);
+ }
+
+ RootXML[] getRootXMLs() {
+ return (RootXML[]) rootXML.toArray(new RootXML[rootXML.size()]);
+ }
+
+ void addSuperType(String type) {
+ superTypes.add(type);
+ }
+
+ boolean hasSuperType() {
+ return (superTypes.size() > 0);
+ }
+
+ /**
+ * Returns the super types of this mime-type. A type is a super type of
+ * another type if any instance of the second type is also an instance of
+ * the first.
+ */
+ public String[] getSuperTypes() {
+ return (String[]) superTypes.toArray(new String[superTypes.size()]);
+ }
+
+ int getLevel() {
+ return level;
+ }
+
+ void incLevel() {
+ this.level++;
+ }
+
+ /**
+ * Return the aliases of this mime-type
+ *
+ * @return the aliases associated to this mime-type.
+ */
+ public String[] getAliases() {
+ return (String[]) aliases.toArray(new String[aliases.size()]);
+ }
+
+ Magic[] getMagics() {
+ return (Magic[]) magics.toArray(new Magic[magics.size()]);
+ }
+
+ void addMagic(Magic magic) {
+ if (magic == null) {
+ return;
+ }
+ magics.add(magic);
+ Collections.sort(magics, MimeTypes.MAGICS_COMPARATOR);
+ }
+
+ int getMinLength() {
+ return minLength;
+ }
+
+ public boolean hasMagic() {
+ return (magics.size() > 0);
+ }
+
+ public boolean matches(String url) {
+ return (patterns.matches(url) == this);
+ }
+
+ public boolean matchesMagic(byte[] data) {
+ for (int i = 0; i < magics.size(); i++) {
+ Magic magic = (Magic) magics.get(i);
+ if (magic.eval(data)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ public boolean matches(byte[] data) {
+ return matchesXML(data) || matchesMagic(data);
+ }
+
+ /** Checks if the specified primary or sub type is valid. */
+ private boolean isValid(String type) {
+ return (type != null) && (type.trim().length() > 0)
+ && !hasCtrlOrSpecials(type);
+ }
+
+ /** Checks if the string contains a low control character or one of SPECIALS. */
+ private boolean hasCtrlOrSpecials(String type) {
+     for (int i = 0; i < type.length(); i++) {
+         char c = type.charAt(i);
+         // indexOf returns 0 for the first special, '(' -- so the test must
+         // be ">= 0"; "> 0" would accept '(' as a valid type character.
+         // NOTE(review): '\032' is octal (decimal 26); confirm the intended bound.
+         if (c <= '\032' || SPECIALS.indexOf(c) >= 0) {
+             return true;
+         }
+     }
+     return false;
+ }
+
+ /**
+ * Defines a RootXML description. RootXML is made of a localName and/or a
+ * namespaceURI.
+ */
+ class RootXML {
+
+ private final static int PATTERN_FLAGS = Pattern.CASE_INSENSITIVE
+ | Pattern.DOTALL | Pattern.MULTILINE;
+
+ private MimeType type = null;
+
+ private String namespaceURI = null;
+
+ private String localName = null;
+
+ private Pattern pattern = null;
+
+ RootXML(MimeType type, String namespaceURI, String localName) {
+ this.type = type;
+ this.namespaceURI = namespaceURI;
+ this.localName = localName;
+ if ((StringUtil.isEmpty(namespaceURI))
+ && (StringUtil.isEmpty(localName))) {
+ throw new IllegalArgumentException(
+ "Both namespaceURI and localName cannot be null");
+ }
+ String regex = null;
+ if (StringUtil.isEmpty(namespaceURI)) {
+ regex = ".*<" + localName + "[^<>]*>.*";
+ } else if (StringUtil.isEmpty(localName)) {
+ regex = ".*<[^<>]*\\p{Space}xmlns=[\"\']?" + namespaceURI
+ + "[\"\']?[^<>]*>.*";
+ } else {
+ regex = ".*<" + localName + "[^<>]*\\p{Space}xmlns=[\"\']?"
+ + namespaceURI + "[\"\']?[^<>]*>.*";
+ }
+ this.pattern = Pattern.compile(regex, PATTERN_FLAGS);
+ }
+
+ boolean matches(byte[] data) {
+ return matches(new String(data));
+ }
+
+ boolean matches(String data) {
+ return pattern.matcher(data).matches();
+ }
+
+ MimeType getType() {
+ return type;
+ }
+
+ String getNameSpaceURI() {
+ return namespaceURI;
+ }
+
+ String getLocalName() {
+ return localName;
+ }
+
+ public String toString() {
+ return new StringBuffer().append(type.getName()).append(", ")
+ .append(namespaceURI).append(", ").append(localName)
+ .toString();
+ }
+ }
+}
Added: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypeException.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypeException.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypeException.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypeException.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,52 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+/**
+ * A class to encapsulate MimeType related exceptions.
+ *
+ * @author Hari Kodungallur
+ * @author Jérôme Charron
+ */
+public class MimeTypeException extends Exception {
+
+    /**
+     * Constructs a MimeTypeException with no specified detail message.
+     */
+    public MimeTypeException() {
+        super();
+    }
+
+    /**
+     * Constructs a MimeTypeException with the specified detail message.
+     *
+     * @param msg
+     *            the detail message.
+     */
+    public MimeTypeException(String msg) {
+        super(msg);
+    }
+
+    /**
+     * Constructs a MimeTypeException with the specified cause.
+     *
+     * @param t
+     *            the cause.
+     */
+    public MimeTypeException(Throwable t) {
+        super(t);
+    }
+
+    /**
+     * Constructs a MimeTypeException with the specified detail message and
+     * cause, so that context can be added without losing the original
+     * failure.
+     *
+     * @param msg
+     *            the detail message.
+     * @param t
+     *            the cause.
+     */
+    public MimeTypeException(String msg, Throwable t) {
+        super(msg, t);
+    }
+}
Added: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,413 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+// JDK imports
+import java.io.File;
+import java.net.URL;
+import java.util.Map;
+import java.util.HashMap;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+import org.apache.commons.logging.LogFactory;
+import org.w3c.dom.Document;
+
+// Commons Logging imports
+import org.apache.commons.logging.Log;
+
+/**
+ * This class is a MimeType repository. It gathers a set of MimeTypes and
+ * makes it possible to retrieve a content type from its name, from a file
+ * name, or from a magic character sequence.
+ *
+ * @author Jérôme Charron
+ */
+public final class MimeTypes {
+
+ /** The default <code>application/octet-stream</code> MimeType */
+ public final static String DEFAULT = "application/octet-stream";
+
+ /** My logger */
+ private Log logger = null;
+
+ /** All the registered MimeTypes indexed on their name */
+ private Map types = new HashMap();
+
+ /** The patterns matcher */
+ private Patterns patterns = new Patterns();
+
+ /** List of all registered magics */
+ private ArrayList magics = new ArrayList();
+
+ /** List of all registered rootXML */
+ private ArrayList xmls = new ArrayList();
+
+ /**
+ * Types whose declared super-type was not registered yet when they were
+ * added: maps the missing super-type name to the List of MimeInfo entries
+ * waiting for it.
+ */
+ private Map unsolvedDeps = new HashMap();
+
+ /**
+  * A comparator used to sort the magics: higher priority first, and for
+  * equal priorities the larger magic first.
+  */
+ final static Comparator MAGICS_COMPARATOR = new Comparator() {
+     public int compare(Object o1, Object o2) {
+         Magic m1 = (Magic) o1;
+         Magic m2 = (Magic) o2;
+         int p1 = m1.getPriority();
+         int p2 = m2.getPriority();
+         if (p1 != p2) {
+             // Compare explicitly: priorities come from the definition
+             // file, and "p2 - p1" overflows for operands far apart,
+             // which would flip the sign of the result.
+             return (p2 > p1) ? 1 : -1;
+         }
+         int s1 = m1.size();
+         int s2 = m2.size();
+         if (s2 == s1) {
+             return 0;
+         }
+         return (s2 > s1) ? 1 : -1;
+     }
+ };
+
+ /**
+ * A comparator used to sort the mime types based on their level (the level
+ * is the number of super-types for a type)
+ */
+ private final static Comparator LEVELS_COMPARATOR = new Comparator() {
+ public int compare(Object o1, Object o2) {
+ return ((MimeInfo) o2).getLevel() - ((MimeInfo) o1).getLevel();
+ }
+ };
+
+ /** The minimum length of data to provide to check all MimeTypes */
+ private int minLength = 0;
+
+ /**
+  * Creates a new MimeTypes instance from a mime-types definition file.
+  *
+  * @param filepath
+  *            the mime-types definitions xml file, as understood by
+  *            MimeTypesReader.read(String).
+  * @param logger
+  *            the logger to use for output messages; when
+  *            <code>null</code>, a logger for this class is obtained from
+  *            LogFactory.
+  */
+ public MimeTypes(String filepath, Log logger) {
+     this.logger = (logger != null) ? logger : LogFactory.getLog(this
+             .getClass());
+     // The reader is handed the caller's logger as given (possibly null).
+     MimeType[] loaded = new MimeTypesReader(logger).read(filepath);
+     add(loaded);
+ }
+
+ /**
+ * Creates a new MimeTypes instance from a mime-types definition file,
+ * without supplying a logger (delegates with a <code>null</code> logger).
+ *
+ * @param filepath
+ * is the mime-types definitions xml file.
+ */
+ public MimeTypes(String filepath) {
+ this(filepath, (Log) null);
+ }
+
+ /**
+  * Creates a new MimeTypes instance from an already parsed definition
+  * document.
+  *
+  * @param doc
+  *            the document of the mime types definition file.
+  * @param logger
+  *            the logger to use for output messages; when
+  *            <code>null</code>, a logger for this class is obtained from
+  *            LogFactory.
+  */
+ public MimeTypes(Document doc, Log logger) {
+     this.logger = (logger != null) ? logger : LogFactory.getLog(this
+             .getClass());
+     // The reader is handed the caller's logger as given (possibly null).
+     MimeType[] loaded = new MimeTypesReader(logger).read(doc);
+     add(loaded);
+ }
+
+ /**
+ * Creates a new MimeTypes instance from an already parsed definition
+ * document, without supplying a logger (delegates with a
+ * <code>null</code> logger).
+ *
+ * @param doc
+ * the document of the mime types definition file.
+ */
+ public MimeTypes(Document doc) {
+ this(doc, (Log) null);
+ }
+
+ /**
+ * Find the Mime Content Type of a file. Only the file name is used; the
+ * file content is not read.
+ *
+ * @param file
+ * to analyze.
+ * @return the Mime Content Type matching the file name, or the type
+ * registered under {@link #DEFAULT} when no pattern matches
+ * (<code>null</code> only if that default type is not registered).
+ */
+ public MimeType getMimeType(File file) {
+ return getMimeType(file.getName());
+ }
+
+ /**
+ * Find the Mime Content Type of a document from its URL. Only the path
+ * part of the URL is matched against the registered patterns.
+ *
+ * @param url
+ * of the document to analyze.
+ * @return the Mime Content Type matching the URL path, or the type
+ * registered under {@link #DEFAULT} when no pattern matches
+ * (<code>null</code> only if that default type is not registered).
+ */
+ public MimeType getMimeType(URL url) {
+ return getMimeType(url.getPath());
+ }
+
+ /**
+  * Find the Mime Content Type of a document from its name.
+  *
+  * @param name
+  *            of the document to analyze.
+  * @return the Mime Content Type whose pattern matches the name, or the
+  *         type registered under {@link #DEFAULT} when no pattern matches
+  *         (<code>null</code> only if that default type is not
+  *         registered).
+  */
+ public MimeType getMimeType(String name) {
+     MimeType matched = patterns.matches(name);
+     // Fall back to the default type when no pattern applies
+     return (matched != null) ? matched : forName(DEFAULT);
+ }
+
+ /**
+  * Find the Mime Content Type of a stream from its content.
+  *
+  * @param data
+  *            are the first bytes of data of the content to analyze. If
+  *            the length of provided data is greater than or equal to the
+  *            value returned by {@link #getMinLength()}, then all known
+  *            MimeTypes are checked, otherwise only the MimeTypes that
+  *            could be analyzed with the length of provided data are
+  *            analyzed.
+  *
+  * @return The Mime Content Type found for the specified data, or
+  *         <code>null</code> if the data is null or empty, or if none is
+  *         found.
+  * @see #getMinLength()
+  */
+ public MimeType getMimeType(byte[] data) {
+     // Nothing to analyze
+     if (data == null || data.length == 0) {
+         return null;
+     }
+
+     // XML descriptions take precedence, in the order of the xmls index
+     for (Iterator it = xmls.iterator(); it.hasNext();) {
+         MimeType candidate = ((MimeInfo) it.next()).getType();
+         if (candidate.matchesXML(data)) {
+             return candidate;
+         }
+     }
+
+     // Then the magic bytes, in the order of the magics index
+     for (Iterator it = magics.iterator(); it.hasNext();) {
+         Magic candidate = (Magic) it.next();
+         if (candidate.eval(data)) {
+             return candidate.getType();
+         }
+     }
+     return null;
+ }
+
+ /**
+  * Find the Mime Content Type of a document from its name and its content.
+  * The content is tried first; the name is only used when the content
+  * yields no type.
+  *
+  * @param name
+  *            of the document to analyze.
+  * @param data
+  *            are the first bytes of the document's content.
+  * @return the type found from the content if any, otherwise the result
+  *         of the name-based lookup.
+  * @see #getMinLength()
+  */
+ public MimeType getMimeType(String name, byte[] data) {
+     MimeType fromContent = getMimeType(data);
+     if (fromContent != null) {
+         return fromContent;
+     }
+     // Content was inconclusive: rely on the document name
+     return getMimeType(name);
+ }
+
+ /**
+  * Find a Mime Content Type from its name.
+  *
+  * @param name
+  *            is the content type name
+  * @return the MimeType registered under that name, or <code>null</code>
+  *         if there is none.
+  */
+ public MimeType forName(String name) {
+     MimeInfo registered = (MimeInfo) types.get(name);
+     if (registered == null) {
+         return null;
+     }
+     return registered.getType();
+ }
+
+ /**
+ * Return the minimum length of data to provide to analyzing methods based
+ * on the document's content in order to check all the known MimeTypes.
+ * <p>
+ * NOTE: this currently returns the constant 1024; the minLength field
+ * maintained while adding types is not returned.
+ *
+ * @return the minimum length of data to provide.
+ * @see #getMimeType(byte[])
+ * @see #getMimeType(String, byte[])
+ */
+ public int getMinLength() {
+ return 1024;
+ // return minLength;
+ }
+
+ /**
+  * Add the specified mime-types in the repository, one by one and in
+  * array order. A <code>null</code> array is ignored.
+  *
+  * @param types
+  *            are the mime-types to add.
+  */
+ void add(MimeType[] types) {
+     if (types != null) {
+         for (int idx = 0; idx < types.length; idx++) {
+             add(types[idx]);
+         }
+     }
+ }
+
+ /**
+ * Add the specified mime-type in the repository: registers it by name,
+ * updates the level bookkeeping for types that declared it as a
+ * super-type, and adds its patterns, magics and root-XML rule to the
+ * lookup indexes. A <code>null</code> type is ignored.
+ *
+ * @param type
+ * is the mime-type to add.
+ */
+ void add(MimeType type) {
+
+ if (type == null) {
+ return;
+ }
+
+ // Add the new type in the repository
+ // (an entry already stored under the same name is replaced in the map)
+ MimeInfo info = new MimeInfo(type);
+ types.put(info.getName(), info);
+
+ // Checks for some unsolved dependencies on this new type:
+ // types registered earlier that named this one as a super-type are
+ // waiting here, and this type's level is raised above each of them.
+ List deps = (List) unsolvedDeps.get(info.getName());
+ if (deps != null) {
+ int level = info.getLevel();
+ for (int i = 0; i < deps.size(); i++) {
+ level = Math
+ .max(level, ((MimeInfo) deps.get(i)).getLevel() + 1);
+ }
+ info.setLevel(level);
+ unsolvedDeps.remove(info.getName());
+ }
+
+ // Checks if some of my super-types are not already solved:
+ // remember this type under each super-type that is not registered yet.
+ // NOTE(review): when the super-type is already registered, nothing is
+ // recorded and its level is not raised here - confirm this is intended.
+ String[] superTypes = type.getSuperTypes();
+ for (int i = 0; i < superTypes.length; i++) {
+ MimeInfo superType = (MimeInfo) types.get(superTypes[i]);
+ if (superType == null) {
+ deps = (List) unsolvedDeps.get(superTypes[i]);
+ if (deps == null) {
+ deps = new ArrayList();
+ unsolvedDeps.put(superTypes[i], deps);
+ }
+ deps.add(info);
+ }
+ }
+
+ // Update minLength
+ minLength = Math.max(minLength, type.getMinLength());
+ // Update the extensions index...
+ patterns.add(type.getPatterns(), type);
+ // Update the magics index...
+ if (type.hasMagic()) {
+ // the local array shadows the "magics" field inside this block only
+ Magic[] magics = type.getMagics();
+ for (int i = 0; i < magics.length; i++) {
+ this.magics.add(magics[i]);
+ }
+ }
+ // here "magics" is the field again: keep the index sorted
+ Collections.sort(magics, MAGICS_COMPARATOR);
+
+ // Update the xml (xmlRoot) index...
+ if (type.hasRootXML()) {
+ this.xmls.add(info);
+ }
+ // keep the index sorted by descending level
+ Collections.sort(xmls, LEVELS_COMPARATOR);
+ }
+
+ /**
+  * Returns one line per registered type, each being the string form of
+  * the MimeType followed by a newline, in the iteration order of the
+  * name index.
+  */
+ public String toString() {
+     StringBuffer out = new StringBuffer();
+     for (Iterator it = types.values().iterator(); it.hasNext();) {
+         MimeType registered = ((MimeInfo) it.next()).getType();
+         out.append(registered).append("\n");
+     }
+     return out.toString();
+ }
+
+ /**
+  * Repository entry: a MimeType together with the level used to order the
+  * root-XML index.
+  */
+ private final class MimeInfo {
+
+     /** The wrapped mime-type */
+     private MimeType type = null;
+
+     /** The current level of this entry; starts at 0 and only grows */
+     private int level = 0;
+
+     MimeInfo(MimeType type) {
+         this.type = type;
+         this.level = 0;
+     }
+
+     MimeType getType() {
+         return type;
+     }
+
+     int getLevel() {
+         return level;
+     }
+
+     /**
+      * Raises the level of this entry. A value that is not greater than
+      * the current level is ignored; otherwise the new level is stored
+      * and every registered super-type is raised to at least one more.
+      */
+     void setLevel(int level) {
+         if (level > this.level) {
+             this.level = level;
+             // Propagate to the super-types that are already registered
+             String[] parentNames = type.getSuperTypes();
+             for (int idx = 0; idx < parentNames.length; idx++) {
+                 MimeInfo parent = (MimeInfo) types.get(parentNames[idx]);
+                 if (parent != null) {
+                     parent.setLevel(level + 1);
+                 }
+             }
+         }
+     }
+
+     String getName() {
+         return type.getName();
+     }
+ }
+}
Added: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,383 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+// Commons Logging imports
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+// DOM imports
+import org.w3c.dom.Attr;
+import org.w3c.dom.Node;
+import org.w3c.dom.Element;
+import org.w3c.dom.Document;
+import org.w3c.dom.NodeList;
+import org.w3c.dom.NamedNodeMap;
+import org.xml.sax.InputSource;
+
+// JDK imports
+import java.io.InputStream;
+import java.util.ArrayList;
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+
+/**
+ * A reader for XML files compliant with the freedesktop MIME-info DTD.
+ *
+ * <pre>
+ * <!DOCTYPE mime-info [
+ * <!ELEMENT mime-info (mime-type)+>
+ * <!ATTLIST mime-info xmlns CDATA #FIXED "http://www.freedesktop.org/standards/shared-mime-info">
+ *
+ * <!ELEMENT mime-type (comment|acronym|expanded-acronym|glob|magic|root-XML|alias|sub-class-of)*>
+ * <!ATTLIST mime-type type CDATA #REQUIRED>
+ *
+ * <!-- a comment describing a document with the respective MIME type. Example: "WMV video" -->
+ * <!ELEMENT comment (#PCDATA)>
+ * <!ATTLIST comment xml:lang CDATA #IMPLIED>
+ *
+ * <!-- a comment describing the respective unexpanded MIME type acronym. Example: "WMV" -->
+ * <!ELEMENT acronym (#PCDATA)>
+ * <!ATTLIST acronym xml:lang CDATA #IMPLIED>
+ *
+ * <!-- a comment describing the respective expanded MIME type acronym. Example: "Windows Media Video" -->
+ * <!ELEMENT expanded-acronym (#PCDATA)>
+ * <!ATTLIST expanded-acronym xml:lang CDATA #IMPLIED>
+ *
+ * <!ELEMENT glob EMPTY>
+ * <!ATTLIST glob pattern CDATA #REQUIRED>
+ *
+ * <!ELEMENT magic (match)+>
+ * <!ATTLIST magic priority CDATA #IMPLIED>
+ *
+ * <!ELEMENT match (match)*>
+ * <!ATTLIST match offset CDATA #REQUIRED>
+ * <!ATTLIST match type (string|big16|big32|little16|little32|host16|host32|byte) #REQUIRED>
+ * <!ATTLIST match value CDATA #REQUIRED>
+ * <!ATTLIST match mask CDATA #IMPLIED>
+ *
+ * <!ELEMENT root-XML EMPTY>
+ * <!ATTLIST root-XML
+ * namespaceURI CDATA #REQUIRED
+ * localName CDATA #REQUIRED>
+ *
+ * <!ELEMENT alias EMPTY>
+ * <!ATTLIST alias
+ * type CDATA #REQUIRED>
+ *
+ * <!ELEMENT sub-class-of EMPTY>
+ * <!ATTLIST sub-class-of
+ * type CDATA #REQUIRED>
+ * ]>
+ * </pre>
+ *
+ *
+ * @see http://freedesktop.org/wiki/Standards_2fshared_2dmime_2dinfo_2dspec
+ * @author Jérôme Charron
+ */
+final class MimeTypesReader {
+
+ /** The logger to use */
+ private Log logger = null;
+
+ /** Creates a reader without supplying a logger (delegates with null). */
+ MimeTypesReader() {
+ this(null);
+ }
+
+ /**
+  * Creates a reader that reports problems to the given logger.
+  *
+  * @param logger
+  *            the logger to use; when <code>null</code>, a logger for
+  *            this class is obtained from LogFactory.
+  */
+ MimeTypesReader(Log logger) {
+     this.logger = (logger != null) ? logger : LogFactory.getLog(this
+             .getClass());
+ }
+
+ /**
+  * Reads the mime-types defined in a classpath resource.
+  *
+  * @param filepath
+  *            the resource name, resolved through this class' class
+  *            loader.
+  * @return the mime-types read, or an empty array if the resource cannot
+  *         be found or parsed.
+  */
+ MimeType[] read(String filepath) {
+     InputStream stream = MimeTypesReader.class.getClassLoader()
+             .getResourceAsStream(filepath);
+     if (stream == null) {
+         // Report the missing resource directly instead of relying on
+         // the parser failing on an empty input source.
+         if (logger.isWarnEnabled()) {
+             logger.warn("Resource [" + filepath
+                     + "] not found while loading mime-types");
+         }
+         return new MimeType[0];
+     }
+     try {
+         return read(stream);
+     } finally {
+         // The stream was opened here, so it is released here
+         try {
+             stream.close();
+         } catch (java.io.IOException ignored) {
+             // nothing useful can be done if closing fails
+         }
+     }
+ }
+
+ /**
+  * Parses the given stream as an XML document and reads the mime-types it
+  * defines.
+  *
+  * @param stream
+  *            the XML content; it is not closed by this method.
+  * @return the mime-types read, or an empty array if anything goes wrong
+  *         (the failure is logged as a warning).
+  */
+ MimeType[] read(InputStream stream) {
+     try {
+         DocumentBuilder builder = DocumentBuilderFactory.newInstance()
+                 .newDocumentBuilder();
+         Document document = builder.parse(new InputSource(stream));
+         return read(document);
+     } catch (Exception e) {
+         if (logger.isWarnEnabled()) {
+             logger.warn(e.toString() + " while loading mime-types");
+         }
+         return new MimeType[0];
+     }
+ }
+
+ /**
+  * Reads the mime-types defined in a parsed document.
+  *
+  * @param document
+  *            the definition document; its root element is expected to be
+  *            named <code>mime-info</code>.
+  * @return the mime-types read; an empty array if the root element is
+  *         missing or has another name.
+  */
+ MimeType[] read(Document document) {
+     Element root = document.getDocumentElement();
+     if ((root == null) || !root.getTagName().equals("mime-info")) {
+         return new MimeType[0];
+     }
+     MimeType[] found = readMimeInfo(root);
+     return (found != null) ? found : new MimeType[0];
+ }
+
+ /**
+  * Read Element named mime-info: collects every direct child element
+  * named mime-type that can be read, in document order.
+  */
+ private MimeType[] readMimeInfo(Element element) {
+     ArrayList collected = new ArrayList();
+     NodeList children = element.getChildNodes();
+     for (int idx = 0; idx < children.getLength(); idx++) {
+         Node child = children.item(idx);
+         if (child.getNodeType() != Node.ELEMENT_NODE) {
+             continue;
+         }
+         Element childElement = (Element) child;
+         if (!childElement.getTagName().equals("mime-type")) {
+             continue;
+         }
+         MimeType read = readMimeType(childElement);
+         if (read != null) {
+             collected.add(read);
+         }
+     }
+     return (MimeType[]) collected.toArray(new MimeType[collected.size()]);
+ }
+
+ /**
+  * Read Element named mime-type.
+  *
+  * @param element
+  *            the mime-type element; its <code>type</code> attribute is
+  *            the type name.
+  * @return the MimeType built from the element and its children, or
+  *         <code>null</code> if the type name is rejected by the MimeType
+  *         constructor.
+  */
+ private MimeType readMimeType(Element element) {
+
+     MimeType type = null;
+
+     try {
+         type = new MimeType(element.getAttribute("type"));
+     } catch (MimeTypeException mte) {
+         // Mime Type not valid... just ignore it
+         if (logger.isInfoEnabled()) {
+             logger.info(mte.toString() + " ... Ignoring!");
+         }
+         return null;
+     }
+
+     NodeList nodes = element.getChildNodes();
+     for (int i = 0; i < nodes.getLength(); i++) {
+         Node node = nodes.item(i);
+         if (node.getNodeType() != Node.ELEMENT_NODE) {
+             continue;
+         }
+         Element nodeElement = (Element) node;
+         String tag = nodeElement.getTagName();
+         if (tag.equals("_comment")) {
+             // An empty comment element has no child node: skip it
+             // instead of failing with a NullPointerException.
+             Node text = nodeElement.getFirstChild();
+             if (text != null) {
+                 type.setDescription(text.getNodeValue());
+             }
+         } else if (tag.equals("glob")) {
+             readGlob(nodeElement, type);
+         } else if (tag.equals("magic")) {
+             readMagic(nodeElement, type);
+         } else if (tag.equals("alias")) {
+             readAlias(nodeElement, type);
+         } else if (tag.equals("root-XML")) {
+             readRootXML(nodeElement, type);
+         } else if (tag.equals("sub-class-of")) {
+             readSubClassOf(nodeElement, type);
+         }
+     }
+     return type;
+ }
+
+ /**
+ * Read Element named glob: registers the value of its
+ * <code>pattern</code> attribute as a pattern of the given type.
+ */
+ private void readGlob(Element element, MimeType type) {
+ type.addPattern(element.getAttribute("pattern"));
+ }
+
+ /**
+ * Read Element named alias: registers the value of its <code>type</code>
+ * attribute as an alias of the given type.
+ */
+ private void readAlias(Element element, MimeType type) {
+ type.addAlias(element.getAttribute("type"));
+ }
+
+ /**
+  * Read Element named magic: builds a Magic from the element's
+  * <code>priority</code> attribute and nested match elements, and adds it
+  * to the given type. When the Magic cannot be created from the priority
+  * attribute (e.g. it is absent or not a number), a Magic created with
+  * the no-argument constructor is used instead.
+  */
+ private void readMagic(Element element, MimeType mimeType) {
+
+     Magic created;
+     try {
+         int priority = Integer.parseInt(element.getAttribute("priority"));
+         created = new Magic(priority);
+     } catch (Exception e) {
+         created = new Magic();
+     }
+     created.setType(mimeType);
+     created.setClause(readMatches(element));
+     mimeType.addMagic(created);
+ }
+
+ /**
+ * Builds the clause for the match elements directly below the given
+ * element. Sibling matches are combined with OR; a match that has nested
+ * matches is combined with them through AND (by recursion).
+ *
+ * @param element
+ * a magic or match element.
+ * @return the combined clause, or <code>null</code> if the element has no
+ * match child that could be read.
+ */
+ private Clause readMatches(Element element) {
+ Clause sub = null;
+ // accumulated OR chain of the siblings read so far
+ Clause prev = Clause.FALSE;
+ Clause clause = null;
+ NodeList nodes = element.getChildNodes();
+ for (int i = 0; i < nodes.getLength(); i++) {
+ Node node = nodes.item(i);
+ if (node.getNodeType() == Node.ELEMENT_NODE) {
+ Element nodeElement = (Element) node;
+ if (nodeElement.getTagName().equals("match")) {
+ // nested matches of this match (null when there are none)
+ sub = readMatches(nodeElement);
+ try {
+ if (sub != null) {
+ clause = new MagicClause(Operator.AND,
+ readMatch(nodeElement), sub);
+ } else {
+ clause = readMatch(nodeElement);
+ }
+ clause = new MagicClause(Operator.OR, prev, clause);
+ prev = clause;
+ } catch (MimeTypeException mte) {
+ // an unreadable match is logged and skipped
+ logger.warn(mte + " while reading magic-match ["
+ + nodeElement + "], Ignoring!");
+ }
+ }
+ }
+ }
+ return clause;
+ }
+
+ /**
+  * Read Element named match.
+  *
+  * @param element
+  *            the match element; its <code>offset</code> attribute is
+  *            either a single number or <code>start:end</code>.
+  * @return the MagicMatch built from the offset, type, mask and value
+  *         attributes (attributes that are absent are passed as
+  *         <code>null</code>).
+  * @throws MimeTypeException
+  *             if the offset attribute is missing, or if the MagicMatch
+  *             cannot be created.
+  */
+ private MagicMatch readMatch(Element element) throws MimeTypeException {
+
+     String offset = null;
+     String value = null;
+     String mask = null;
+     String type = null;
+
+     NamedNodeMap attrs = element.getAttributes();
+     for (int i = 0; i < attrs.getLength(); i++) {
+         Attr attr = (Attr) attrs.item(i);
+         if (attr.getName().equals("offset")) {
+             offset = attr.getValue();
+         } else if (attr.getName().equals("type")) {
+             type = attr.getValue();
+         } else if (attr.getName().equals("value")) {
+             value = attr.getValue();
+         } else if (attr.getName().equals("mask")) {
+             mask = attr.getValue();
+         }
+     }
+
+     // A match without offset used to fail with a NullPointerException;
+     // report it as an invalid definition so the caller can skip it.
+     if (offset == null) {
+         throw new MimeTypeException(
+                 "Missing offset attribute in magic match");
+     }
+
+     // Parse the offset: "start" or "start:end". Unparsable parts keep
+     // the default of 0, as before, but are now reported.
+     String[] offsets = offset.split(":");
+     int offStart = 0;
+     int offEnd = 0;
+     if (offsets.length > 0) {
+         try {
+             offStart = Integer.parseInt(offsets[0]);
+         } catch (NumberFormatException e) {
+             logger.warn("Invalid start offset [" + offsets[0]
+                     + "] in magic match, using 0");
+         }
+     }
+     if (offsets.length > 1) {
+         try {
+             offEnd = Integer.parseInt(offsets[1]);
+         } catch (NumberFormatException e) {
+             logger.warn("Invalid end offset [" + offsets[1]
+                     + "] in magic match, using start offset");
+         }
+     }
+     // The end offset is never before the start offset
+     offEnd = Math.max(offStart, offEnd);
+
+     return new MagicMatch(offStart, offEnd, type, mask, value);
+ }
+
+ /**
+ * Read Element named root-XML: registers a root-XML rule on the given
+ * type from the element's <code>namespaceURI</code> and
+ * <code>localName</code> attributes.
+ */
+ private void readRootXML(Element element, MimeType mimeType) {
+
+ mimeType.addRootXML(element.getAttribute("namespaceURI"), element
+ .getAttribute("localName"));
+ }
+
+ /**
+ * Read Element named sub-class-of: registers the value of its
+ * <code>type</code> attribute as a super-type of the given type.
+ */
+ private void readSubClassOf(Element element, MimeType mimeType) {
+
+ mimeType.addSuperType(element.getAttribute("type"));
+ }
+
+ /**
+  * Prints the specified node to standard output, then prints all of its
+  * children. Documents, elements (with attributes and an end tag), entity
+  * references, CDATA sections, text and processing instructions are
+  * printed; other node types produce no output.
+  */
+ public static void printDOM(Node node) {
+     final int kind = node.getNodeType();
+
+     if (kind == Node.DOCUMENT_NODE) {
+         // XML declaration, then the document element
+         System.out.println("<?xml version=\"1.0\" ?>");
+         printDOM(((Document) node).getDocumentElement());
+
+     } else if (kind == Node.ELEMENT_NODE) {
+         // start tag with its attributes
+         System.out.print("<");
+         System.out.print(node.getNodeName());
+         NamedNodeMap attributes = node.getAttributes();
+         for (int a = 0; a < attributes.getLength(); a++) {
+             Node attribute = attributes.item(a);
+             System.out.print(" " + attribute.getNodeName().trim() + "=\""
+                     + attribute.getNodeValue().trim() + "\"");
+         }
+         System.out.println(">");
+
+         // children, in document order
+         NodeList children = node.getChildNodes();
+         if (children != null) {
+             int count = children.getLength();
+             for (int c = 0; c < count; c++) {
+                 printDOM(children.item(c));
+             }
+         }
+
+         // end tag on a new line
+         System.out.println();
+         System.out.print("</");
+         System.out.print(node.getNodeName().trim());
+         System.out.print('>');
+
+     } else if (kind == Node.ENTITY_REFERENCE_NODE) {
+         System.out.print("&");
+         System.out.print(node.getNodeName().trim());
+         System.out.print(";");
+
+     } else if (kind == Node.CDATA_SECTION_NODE) {
+         System.out.print("<![CDATA[");
+         System.out.print(node.getNodeValue().trim());
+         System.out.print("]]>");
+
+     } else if (kind == Node.TEXT_NODE) {
+         System.out.print(node.getNodeValue().trim());
+
+     } else if (kind == Node.PROCESSING_INSTRUCTION_NODE) {
+         System.out.print("<?");
+         System.out.print(node.getNodeName().trim());
+         String instruction = node.getNodeValue().trim();
+         System.out.print(" ");
+         System.out.print(instruction);
+         System.out.print("?>");
+     }
+ }
+
+}
Added: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeUtils.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeUtils.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeUtils.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeUtils.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,145 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+// JDK imports
+import java.io.InputStream;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import org.w3c.dom.Document;
+import org.xml.sax.InputSource;
+
+// Tika imports
+import org.apache.tika.utils.Configurable;
+import org.apache.tika.utils.Configuration;
+import org.apache.tika.metadata.TikaMimeKeys;
+
+/**
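+ * Resolves MIME type names for documents, using a MimeTypes repository that
+ * is loaded from, and cached in, a Configuration.
+ *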
+ * @author Jérôme Charron
+ */
+public class MimeUtils implements Configurable, TikaMimeKeys {
+
+ /** My logger */
+ private final static Logger LOG = Logger.getLogger(MimeUtils.class
+ .getName());
+
+ /** The key used to cache the mime repository in conf */
+ private final static String KEY = MimeUtils.class.getName();
+
+ /** My current configuration */
+ private Configuration conf = null;
+
+ /** A flag that tells if magic resolution must be performed */
+ private boolean magic = true;
+
+ /** The MimeTypes repository instance */
+ private MimeTypes repository = null;
+
+ /**
+ * Creates a new instance of MimeUtils and configures it by calling
+ * {@link #setConf(Configuration)}.
+ *
+ * @param conf
+ * the configuration to read the settings from.
+ */
+ public MimeUtils(Configuration conf) {
+ setConf(conf);
+ }
+
+ /***************************************************************************
+ * ----------------------------- <implementation:Configurable> *
+ * -----------------------------
+ */
+
+ /**
+  * Applies the given configuration: reads the magic flag (default
+  * <code>true</code>) and obtains the MimeTypes repository. The
+  * repository cached in the configuration is reused when present;
+  * otherwise it is loaded from the configured mime file and then cached
+  * in the configuration.
+  */
+ public void setConf(Configuration conf) {
+     this.conf = conf;
+     this.magic = conf.getBoolean(MIME_TYPE_MAGIC, true);
+     this.repository = (MimeTypes) conf.getObject(KEY);
+     if (this.repository != null) {
+         // Already loaded for this configuration
+         return;
+     }
+     this.repository = load(conf.get(TIKA_MIME_FILE));
+     conf.setObject(KEY, this.repository);
+ }
+
+ /** Returns the configuration last passed to setConf. */
+ public Configuration getConf() {
+ return this.conf;
+ }
+
+ /***************************************************************************
+ * ----------------------------- </implementation:Configurable> *
+ * -----------------------------
+ */
+
+ /** Returns the MimeTypes repository obtained by the last setConf call. */
+ public final MimeTypes getRepository() {
+ return repository;
+ }
+
+ /**
+  * Resolves the mime-type name of a document.
+  * <ol>
+  * <li>The declared type name is cleaned and looked up in the
+  * repository.</li>
+  * <li>If there is no name, no registered type, or the registered type
+  * does not match the url, the type is guessed from the url.</li>
+  * <li>If magic resolution is enabled in the configuration, the type is
+  * guessed from the content; a type found this way replaces the previous
+  * result.</li>
+  * </ol>
+  *
+  * @param typeName
+  *            the declared content type name, may be <code>null</code>.
+  * @param url
+  *            the url (or name) of the document.
+  * @param data
+  *            the first bytes of the document's content.
+  * @return the resolved type name; the (cleaned) declared name when no
+  *         better type is found.
+  */
+ public String getType(String typeName, String url, byte[] data) {
+     MimeType type = null;
+     try {
+         typeName = MimeType.clean(typeName);
+         type = typeName == null ? null : repository.forName(typeName);
+     } catch (MimeTypeException mte) {
+         // Deliberate best-effort: a malformed declared name is not an
+         // error here, the url and content lookups below take over.
+     }
+
+     if (typeName == null || type == null || !type.matches(url)) {
+         // If no mime-type header, or cannot find a corresponding
+         // registered mime-type, or the one found doesn't match the url
+         // pattern it should be, then guess a mime-type from the url
+         type = repository.getMimeType(url);
+         typeName = type == null ? typeName : type.getName();
+     }
+
+     // Honor the configured flag: it was read in setConf but never used,
+     // so content detection could not be switched off.
+     if (this.magic) {
+         type = repository.getMimeType(data);
+         typeName = type == null ? typeName : type.getName();
+     }
+     return typeName;
+ }
+
+ /**
+  * Loads the MimeTypes repository from a classpath resource.
+  *
+  * @param tikaMimeFile
+  *            the resource name of the mime-types definition file.
+  * @return the repository built from the parsed document.
+  * @throws IllegalArgumentException
+  *             if the resource cannot be found or parsed (this used to
+  *             surface as an unexplained NullPointerException).
+  */
+ private final MimeTypes load(String tikaMimeFile) {
+     LOG.info("Loading [" + tikaMimeFile + "]");
+     InputStream stream = MimeUtils.class.getClassLoader()
+             .getResourceAsStream(tikaMimeFile);
+     if (stream == null) {
+         throw new IllegalArgumentException("Mime types resource ["
+                 + tikaMimeFile + "] not found on the classpath");
+     }
+
+     Document document = null;
+     try {
+         document = getDocumentRoot(stream);
+     } finally {
+         // The stream was opened here, so it is released here
+         try {
+             stream.close();
+         } catch (java.io.IOException ignored) {
+             // nothing useful can be done if closing fails
+         }
+     }
+     if (document == null) {
+         throw new IllegalArgumentException(
+                 "Unable to parse mime types resource [" + tikaMimeFile
+                         + "]");
+     }
+     return new MimeTypes(document);
+ }
+
+ /**
+  * Parses the given stream as an XML document.
+  *
+  * @param is
+  *            the XML content; it is not closed by this method.
+  * @return the parsed document, or <code>null</code> if parsing fails
+  *         (the failure is logged as a warning).
+  */
+ private final Document getDocumentRoot(InputStream is) {
+     InputSource source = new InputSource(is);
+     try {
+         DocumentBuilder builder = DocumentBuilderFactory.newInstance()
+                 .newDocumentBuilder();
+         return builder.parse(source);
+     } catch (Exception e) {
+         LOG.log(Level.WARNING, "Unable to parse xml stream"
+                 + ": Reason is [" + e + "]");
+         return null;
+     }
+ }
+
+}
Added: incubator/tika/trunk/src/main/java/org/apache/tika/mime/Operator.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/Operator.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/Operator.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/Operator.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,68 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+/**
+ * A binary operator over two boolean operands, with the two standard
+ * instances {@link #OR} and {@link #AND}.
+ *
+ * @author Jerome Charron
+ */
+interface Operator {
+
+    /** The OR Boolean operator */
+    final static Operator OR = new Or();
+
+    /** The AND Boolean operator */
+    final static Operator AND = new And();
+
+    /**
+     * Applies this operator to two boolean operands.
+     *
+     * @param o1
+     *            is the first boolean operand.
+     * @param o2
+     *            is the second boolean operand.
+     * @return the result of this operator for the two operands.
+     */
+    boolean eval(boolean o1, boolean o2);
+
+    /**
+     * The conjunction: true only when both operands are true.
+     */
+    final static class And implements Operator {
+        public boolean eval(boolean o1, boolean o2) {
+            if (!o1) {
+                return false;
+            }
+            return o2;
+        }
+
+        public String toString() {
+            return "AND";
+        }
+    }
+
+    /**
+     * The disjunction: true when at least one operand is true.
+     */
+    final static class Or implements Operator {
+        public boolean eval(boolean o1, boolean o2) {
+            if (o1) {
+                return true;
+            }
+            return o2;
+        }
+
+        public String toString() {
+            return "OR";
+        }
+    }
+}
Added: incubator/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,193 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+// JDK imports
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
+/**
+ * Defines a MimeType pattern.
+ *
+ * Holds the filename patterns registered for MIME types, split into three
+ * indexes (literal names, "*.ext" extensions, and every other wildcard
+ * pattern) so that {@link #matches(String)} can try the cheapest lookup
+ * first.
+ *
+ * @author Jérôme Charron
+ */
+class Patterns {
+
+    /**
+     * Maps each special character of a pattern to its regular expression
+     * replacement: "*" becomes ".*", every other listed character is
+     * backslash-escaped so that it is matched literally.
+     */
+    private static final Map escapeMap = new HashMap();
+    static {
+        escapeMap.put("\\", "\\\\");
+        escapeMap.put("?", "\\?");
+        escapeMap.put("[", "\\[");
+        escapeMap.put("]", "\\]");
+        escapeMap.put("^", "\\^");
+        escapeMap.put(".", "\\.");
+        escapeMap.put("-", "\\-");
+        escapeMap.put("$", "\\$");
+        escapeMap.put("+", "\\+");
+        escapeMap.put("(", "\\(");
+        escapeMap.put(")", "\\)");
+        escapeMap.put("{", "\\{");
+        escapeMap.put("}", "\\}");
+        escapeMap.put("|", "\\|");
+        escapeMap.put("*", ".*");
+    }
+
+    /** Gathers all the patterns, in registration order and unescaped */
+    private ArrayList patterns = new ArrayList();
+
+    /** An index of exact matching patterns (pattern -> MimeType) */
+    private Map exactIdx = new HashMap();
+
+    /** An index of the patterns of the form "*.ext" (ext -> MimeType) */
+    private Map extIdx = new HashMap();
+
+    /** The other patterns, keyed by their escaped (regexp) form */
+    private Map others = new HashMap();
+
+    /** Creates a new instance of Patterns */
+    Patterns() {
+    }
+
+    /**
+     * Registers several patterns for the same type.
+     *
+     * @param patterns
+     *            the patterns to register; nothing happens if
+     *            <code>null</code>.
+     * @param type
+     *            the type the patterns map to; nothing happens if
+     *            <code>null</code>.
+     */
+    void add(String[] patterns, MimeType type) {
+        // Some preliminary checks
+        if ((patterns == null) || (type == null)) {
+            return;
+        }
+        // All is ok, so add the patterns
+        for (int i = 0; i < patterns.length; i++) {
+            add(patterns[i], type);
+        }
+    }
+
+    /**
+     * Registers one pattern for a type, storing it in the index that fits
+     * its form. A pattern registered twice in the same index replaces the
+     * earlier type.
+     *
+     * @param pattern
+     *            the pattern to register; nothing happens if
+     *            <code>null</code>.
+     * @param type
+     *            the type the pattern maps to; nothing happens if
+     *            <code>null</code>.
+     */
+    void add(String pattern, MimeType type) {
+        // Some preliminary checks
+        if ((pattern == null) || (type == null)) {
+            return;
+        }
+
+        // Add the pattern in the good index
+        if ((pattern.indexOf('*') == -1) && (pattern.indexOf('?') == -1)
+                && (pattern.indexOf('[') == -1)) {
+            exactIdx.put(pattern, type);
+
+        } else if (pattern.startsWith("*.")) {
+            extIdx.put(pattern.substring(2), type);
+
+        } else {
+            // NOTE(review): escape() turns '?' and '[' into literals, so
+            // only '*' acts as a wildcard here - confirm this is intended.
+            others.put(escape(pattern), type);
+        }
+        // Add the pattern in the list of patterns
+        patterns.add(pattern);
+    }
+
+    /**
+     * @return every registered pattern, unescaped, in registration order.
+     */
+    String[] getPatterns() {
+        return (String[]) patterns.toArray(new String[patterns.size()]);
+    }
+
+    /**
+     * Find the MimeType corresponding to a filename.
+     *
+     * It follows the recommendations detailed in the FreeDesktop Shared
+     * MIME-info Database for guessing a MimeType from a filename: literal
+     * patterns (eg, 'Makefile') are matched before all others. Patterns
+     * beginning with `*.' are matched next, and files with multiple
+     * extensions (such as Data.tar.gz) match the longest sequence of
+     * extensions (eg '*.tar.gz' in preference to '*.gz'). The remaining
+     * wildcard patterns are tried last, and if several of them match then
+     * the longest pattern is used.
+     *
+     * Matching is case-sensitive only: the lower-case retry suggested by the
+     * specification is not performed by this method.
+     *
+     * @param filename
+     *            the file name, optionally with a '/' or '\' separated path.
+     * @return the matching type, or <code>null</code> if
+     *         <code>filename</code> is <code>null</code> or nothing matches.
+     */
+    MimeType matches(String filename) {
+
+        // Preliminary check...
+        if (filename == null) {
+            return null;
+        }
+
+        // First, try exact match of the provided filename
+        MimeType type = (MimeType) exactIdx.get(filename);
+        if (type != null) {
+            return type;
+        }
+
+        // Then try exact match with only the filename
+        String str = last(filename, '/');
+        if (str != null) {
+            type = (MimeType) exactIdx.get(str);
+            if (type != null) {
+                return type;
+            }
+        }
+        str = last(filename, '\\');
+        if (str != null) {
+            type = (MimeType) exactIdx.get(str);
+            if (type != null) {
+                return type;
+            }
+        }
+
+        // Then try "extension" (*.xxx) matching. Scanning the dots from left
+        // to right tries the longest candidate extension first.
+        int idx = filename.indexOf('.', 0);
+        while (idx != -1) {
+            type = (MimeType) extIdx.get(filename.substring(idx + 1));
+            if (type != null) {
+                return type;
+            }
+            idx = filename.indexOf('.', idx + 1);
+        }
+
+        // And finally, try complex regexp matching
+        String longest = null;
+        Iterator iter = others.keySet().iterator();
+        while (iter.hasNext()) {
+            String pattern = (String) iter.next();
+            // "longest" stays null until the first hit, so it must be
+            // checked before its length is read. The cheap length test runs
+            // first so the regexp is only evaluated for real candidates.
+            if (((longest == null) || (pattern.length() > longest.length()))
+                    && filename.matches(pattern)) {
+                longest = pattern;
+            }
+        }
+        return (longest != null) ? (MimeType) others.get(longest) : null;
+    }
+
+    /**
+     * Returns the part of <code>str</code> that follows the last occurrence
+     * of <code>c</code>, or <code>null</code> if <code>str</code> is
+     * <code>null</code>, does not contain <code>c</code>, or ends with it.
+     */
+    private final static String last(String str, char c) {
+        if (str == null) {
+            return null;
+        }
+        int idx = str.lastIndexOf(c);
+        if ((idx < 0) || (idx >= (str.length() - 1))) {
+            return null;
+        }
+        return str.substring(idx + 1);
+    }
+
+    /**
+     * Converts a pattern into a regular expression by replacing every
+     * character listed in the escape map and copying all others unchanged.
+     */
+    private final static String escape(String str) {
+        StringBuffer result = new StringBuffer(str.length());
+        for (int i = 0; i < str.length(); i++) {
+            String charAt = String.valueOf(str.charAt(i));
+            String replace = (String) escapeMap.get(charAt);
+            result.append((replace != null) ? replace : charAt);
+        }
+        return result.toString();
+    }
+
+}
Added: incubator/tika/trunk/src/main/java/org/apache/tika/utils/Configurable.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/Configurable.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/Configurable.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/Configurable.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,46 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.utils;
+
+/**
+ * <p>
+ * An interface allowing a Tika object to be <code>Configured</code> by a
+ * {@link Configuration} object. Based on Apache Hadoop's configuration
+ * interface.
+ * </p>
+ *
+ * @author mattmann
+ * @version $Revision$
+ */
+public interface Configurable {
+
+ /**
+ * Configures the Tika object with the provided {@link Configuration} named
+ * <code>conf</code>.
+ *
+ * @param conf
+ * The object's new {@link Configuration}.
+ */
+ public void setConf(Configuration conf);
+
+ /**
+ * Returns the {@link Configuration} currently held by the Tika object.
+ *
+ * @return The Tika object's existing {@link Configuration}.
+ */
+ public Configuration getConf();
+
+}
Added: incubator/tika/trunk/src/main/java/org/apache/tika/utils/Configuration.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/Configuration.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/Configuration.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/Configuration.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,176 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.utils;
+
+// JDK imports
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Properties;
+import java.util.StringTokenizer;
+
+/**
+ * Provides access to configuration parameters.
+ *
+ * Parameters are kept in a {@link java.util.Properties} instance. The typed
+ * getters read the String value of a property and fall back to a
+ * caller-supplied default when the value is missing or cannot be parsed.
+ */
+public class Configuration {
+
+ private Properties properties;
+
+ /** A new, empty configuration. */
+ public Configuration() {
+ this.properties = new Properties();
+ }
+
+ /**
+ * A new configuration with the same settings cloned from another. A
+ * <code>null</code> argument yields an empty configuration.
+ */
+ public Configuration(Properties properties) {
+ if (properties != null) {
+ this.properties = (Properties) properties.clone();
+ } else {
+ this.properties = new Properties();
+ }
+ }
+
+ /**
+ * Returns the value of the <code>name</code> property, or null if no such
+ * property exists. The value is returned as stored, without any
+ * conversion.
+ */
+ public Object getObject(String name) {
+ return properties.get(name);
+ }
+
+ /**
+ * Sets the value of the <code>name</code> property. The value is stored
+ * as-is; it is not converted to a String.
+ */
+ public void setObject(String name, Object value) {
+ properties.put(name, value);
+ }
+
+ /**
+ * Returns the value of the <code>name</code> property. If no such
+ * property exists, then <code>defaultValue</code> is returned. The lookup
+ * is the one performed by {@link #getObject(String)}.
+ */
+ public Object get(String name, Object defaultValue) {
+ Object res = getObject(name);
+ return (res != null) ? res : defaultValue;
+ }
+
+ /**
+ * Returns the value of the <code>name</code> property, or null if no such
+ * property exists.
+ *
+ * NOTE(review): this goes through <code>Properties.getProperty</code>,
+ * which presumably reports a non-String value stored with
+ * {@link #setObject(String, Object)} as absent - confirm against the JDK
+ * documentation.
+ */
+ public String get(String name) {
+ return properties.getProperty(name);
+ }
+
+ /**
+ * Sets the value of the <code>name</code> property to the String form of
+ * <code>value</code>. <code>value</code> must not be <code>null</code>,
+ * since <code>toString()</code> is called on it.
+ */
+ public void set(String name, Object value) {
+ properties.setProperty(name, value.toString());
+ }
+
+ /**
+ * Returns the value of the <code>name</code> property. If no such
+ * property exists, then <code>defaultValue</code> is returned.
+ */
+ public String get(String name, String defaultValue) {
+ return properties.getProperty(name, defaultValue);
+ }
+
+ /**
+ * Returns the value of the <code>name</code> property as an integer. If
+ * no such property is specified, or if the specified value is not a valid
+ * integer, then <code>defaultValue</code> is returned.
+ */
+ public int getInt(String name, int defaultValue) {
+ try {
+ return Integer.parseInt(get(name));
+ } catch (Exception e) {
+ return defaultValue;
+ }
+ }
+
+ /** Sets the value of the <code>name</code> property to an integer. */
+ public void setInt(String name, int value) {
+ set(name, Integer.toString(value));
+ }
+
+ /**
+ * Returns the value of the <code>name</code> property as a long. If no
+ * such property is specified, or if the specified value is not a valid
+ * long, then <code>defaultValue</code> is returned.
+ */
+ public long getLong(String name, long defaultValue) {
+ try {
+ return Long.parseLong(get(name));
+ } catch (Exception e) {
+ return defaultValue;
+ }
+ }
+
+ /** Sets the value of the <code>name</code> property to a long. */
+ public void setLong(String name, long value) {
+ set(name, Long.toString(value));
+ }
+
+ /**
+ * Returns the value of the <code>name</code> property as a float. If no
+ * such property is specified, or if the specified value is not a valid
+ * float, then <code>defaultValue</code> is returned. This class declares
+ * no matching float setter; {@link #set(String, Object)} can be used
+ * instead.
+ */
+ public float getFloat(String name, float defaultValue) {
+ try {
+ return Float.parseFloat(get(name));
+ } catch (Exception e) {
+ return defaultValue;
+ }
+ }
+
+ /**
+ * Returns the value of the <code>name</code> property as a boolean. If
+ * no such property is specified, or if the specified value is not a valid
+ * boolean, then <code>defaultValue</code> is returned. Valid boolean
+ * values are "true" and "false"; the comparison is case-sensitive.
+ */
+ public boolean getBoolean(String name, boolean defaultValue) {
+ String valueString = get(name);
+ if ("true".equals(valueString)) {
+ return true;
+ } else if ("false".equals(valueString)) {
+ return false;
+ } else {
+ return defaultValue;
+ }
+ }
+
+ /** Sets the value of the <code>name</code> property to a boolean. */
+ public void setBoolean(String name, boolean value) {
+ set(name, Boolean.toString(value));
+ }
+
+ /**
+ * Returns the value of the <code>name</code> property as an array of
+ * strings. If no such property is specified, then <code>null</code> is
+ * returned. Values are comma delimited. The tokens are returned as found:
+ * surrounding whitespace is not trimmed.
+ */
+ public String[] getStrings(String name) {
+ String valueString = get(name);
+ if (valueString == null)
+ return null;
+ StringTokenizer tokenizer = new StringTokenizer(valueString, ",");
+ List values = new ArrayList();
+ while (tokenizer.hasMoreTokens()) {
+ values.add(tokenizer.nextToken());
+ }
+ return (String[]) values.toArray(new String[values.size()]);
+ }
+
+}