You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2007/09/21 17:07:59 UTC

svn commit: r578161 [1/2] - in /incubator/tika/trunk: ./ src/main/java/org/apache/tika/metadata/ src/main/java/org/apache/tika/mime/ src/main/java/org/apache/tika/utils/ src/main/resources/mime/ src/test/java/org/apache/tika/mime/

Author: mattmann
Date: Fri Sep 21 08:07:58 2007
New Revision: 578161

URL: http://svn.apache.org/viewvc?rev=578161&view=rev
Log:
- patch for TIKA-6

Added:
    incubator/tika/trunk/src/main/java/org/apache/tika/mime/
    incubator/tika/trunk/src/main/java/org/apache/tika/mime/Clause.java
    incubator/tika/trunk/src/main/java/org/apache/tika/mime/HexCoDec.java
    incubator/tika/trunk/src/main/java/org/apache/tika/mime/Magic.java
    incubator/tika/trunk/src/main/java/org/apache/tika/mime/MagicClause.java
    incubator/tika/trunk/src/main/java/org/apache/tika/mime/MagicMatch.java
    incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java
    incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypeException.java
    incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java
    incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java
    incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeUtils.java
    incubator/tika/trunk/src/main/java/org/apache/tika/mime/Operator.java
    incubator/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java
    incubator/tika/trunk/src/main/java/org/apache/tika/utils/Configurable.java
    incubator/tika/trunk/src/main/java/org/apache/tika/utils/Configuration.java
    incubator/tika/trunk/src/main/java/org/apache/tika/utils/StringUtil.java
    incubator/tika/trunk/src/main/resources/mime/
    incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
    incubator/tika/trunk/src/test/java/org/apache/tika/mime/
    incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeUtils.java
Modified:
    incubator/tika/trunk/CHANGES.txt
    incubator/tika/trunk/pom.xml
    incubator/tika/trunk/src/main/java/org/apache/tika/metadata/Metadata.java

Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=578161&r1=578160&r2=578161&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Fri Sep 21 08:07:58 2007
@@ -28,3 +28,6 @@
 12. TIKA-18 - "Office" interface should be renamed "MSOffice" (mattmann)
 
 13. TIKA-23 - Decouple Parser from ParserConfig (jukka)
+
+14. TIKA-6 - Port Nutch (or better) MimeType detection system into Tika (J. Charron & mattmann)
+

Modified: incubator/tika/trunk/pom.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/pom.xml?rev=578161&r1=578160&r2=578161&view=diff
==============================================================================
--- incubator/tika/trunk/pom.xml (original)
+++ incubator/tika/trunk/pom.xml Fri Sep 21 08:07:58 2007
@@ -149,6 +149,16 @@
       <optional/>
     </dependency>
     <dependency>
+      <groupId>commons-logging</groupId>
+      <artifactId>commons-logging</artifactId>
+      <version>1.0.4</version>
+    </dependency>
+    <dependency>
+      <groupId>commons-codec</groupId>
+      <artifactId>commons-codec</artifactId>
+      <version>1.3</version>
+    </dependency>
+    <dependency>
       <groupId>pdfbox</groupId>
       <artifactId>pdfbox</artifactId>
       <version>0.7.3</version>
@@ -217,6 +227,11 @@
                     <include name="README.txt"/>
                     <include name="NOTICE.txt"/>
                     <include name="LICENSE.txt"/>
+                  </fileset>
+                </copy>
+                <copy todir="${project.build.outputDirectory}/org/apache/tika/mime">
+                  <fileset dir="${basedir}/src/main/resources/mime">
+                    <include name="*"/>
                   </fileset>
                 </copy>
               </tasks>

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/metadata/Metadata.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/metadata/Metadata.java?rev=578161&r1=578160&r2=578161&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/metadata/Metadata.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/metadata/Metadata.java Fri Sep 21 08:07:58 2007
@@ -30,7 +30,7 @@
  * 
  */
 public class Metadata implements CreativeCommons, DublinCore, HttpHeaders,
-    MSOffice {
+    MSOffice, TikaMimeKeys {
 
   /**
    * A map of all metadata attributes.

Added: incubator/tika/trunk/src/main/java/org/apache/tika/mime/Clause.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/Clause.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/Clause.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/Clause.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,70 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+/**
+ * Defines a clause to be evaluated.
+ * 
+ * @author Jerome Charron
+ */
+interface Clause {
+
+    /** Constant clause whose evaluation always succeeds. */
+    Clause TRUE = new True();
+
+    /** Constant clause whose evaluation always fails. */
+    Clause FALSE = new False();
+
+    /**
+     * Tests the given chunk of data against this clause.
+     */
+    boolean eval(byte[] data);
+
+    /**
+     * Reports the size of this clause, i.e. the number of chars it is made of.
+     */
+    int size();
+
+    /** Implementation behind {@link Clause#TRUE}: accepts any data, size 0. */
+    static final class True implements Clause {
+        public boolean eval(byte[] data) {
+            return true;
+        }
+
+        public int size() {
+            return 0;
+        }
+
+        public String toString() {
+            return "TRUE";
+        }
+    }
+
+    /** Implementation behind {@link Clause#FALSE}: rejects any data, size 0. */
+    static final class False implements Clause {
+        public boolean eval(byte[] data) {
+            return false;
+        }
+
+        public int size() {
+            return 0;
+        }
+
+        public String toString() {
+            return "FALSE";
+        }
+    }
+}

Added: incubator/tika/trunk/src/main/java/org/apache/tika/mime/HexCoDec.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/HexCoDec.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/HexCoDec.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/HexCoDec.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,117 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+/**
+ * Converts between raw bytes and their hexadecimal character representation.
+ * 
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class HexCoDec {
+
+    /** Lower-case hex digits, indexed by nibble value. */
+    private static final char[] HEX_CHARS = "0123456789abcdef".toCharArray();
+
+    /**
+     * Decodes a string of hex digits into bytes.
+     * 
+     * @param hexValue the hex digits to decode; upper and lower case accepted
+     * @return the decoded bytes, one per pair of digits
+     * @throws IllegalArgumentException on an odd digit count or a non-hex char
+     */
+    public static byte[] decode(String hexValue) {
+        return decode(hexValue.toCharArray(), 0, hexValue.length());
+    }
+
+    /**
+     * Decodes a whole array of hex digits into bytes.
+     * 
+     * @param hexChars the hex digits to decode
+     * @return the decoded bytes, one per pair of digits
+     * @throws IllegalArgumentException on an odd array length or a non-hex char
+     */
+    public static byte[] decode(char[] hexChars) {
+        return decode(hexChars, 0, hexChars.length);
+    }
+
+    /**
+     * Decodes a range of an array of hex digits into bytes.
+     * 
+     * @param hexChars the array holding the hex digits
+     * @param startIndex the index of the first character to decode
+     * @param length the number of characters to decode; must be even
+     * @return the decoded bytes, one per pair of digits
+     * @throws IllegalArgumentException if <code>length</code> is odd or a
+     *             character in the range is not a hex digit
+     */
+    public static byte[] decode(char[] hexChars, int startIndex, int length) {
+        if ((length % 2) != 0) {
+            throw new IllegalArgumentException("Length must be even");
+        }
+        byte[] bytes = new byte[length / 2];
+        for (int out = 0, in = startIndex; out < bytes.length; out++, in += 2) {
+            int high = hexCharToNibble(hexChars[in]);
+            int low = hexCharToNibble(hexChars[in + 1]);
+            bytes[out] = (byte) ((high << 4) | low);
+        }
+        return bytes;
+    }
+
+    /**
+     * Hex-encodes a whole array of bytes using lower-case digits.
+     * 
+     * @param bites the bytes to encode
+     * @return the hex characters, two per input byte
+     */
+    public static char[] encode(byte[] bites) {
+        return encode(bites, 0, bites.length);
+    }
+
+    /**
+     * Hex-encodes a range of an array of bytes using lower-case digits.
+     * 
+     * @param bites the array holding the bytes to encode
+     * @param startIndex the index of the first byte to encode
+     * @param length the number of bytes to encode
+     * @return the hex characters, two per encoded byte
+     */
+    public static char[] encode(byte[] bites, int startIndex, int length) {
+        char[] chars = new char[length * 2];
+        for (int i = 0, out = 0; i < length; i++) {
+            int unsigned = bites[startIndex + i] & 0xff;
+            chars[out++] = HEX_CHARS[unsigned >>> 4];
+            chars[out++] = HEX_CHARS[unsigned & 0x0f];
+        }
+        return chars;
+    }
+
+    /**
+     * Maps a single hex digit (either case) to its value in the range 0-15.
+     */
+    private static int hexCharToNibble(char ch) {
+        if (ch >= '0' && ch <= '9') {
+            return ch - '0';
+        }
+        if (ch >= 'a' && ch <= 'f') {
+            return 10 + (ch - 'a');
+        }
+        if (ch >= 'A' && ch <= 'F') {
+            return 10 + (ch - 'A');
+        }
+        throw new IllegalArgumentException("Not a hex char - '" + ch + "'");
+    }
+
+}

Added: incubator/tika/trunk/src/main/java/org/apache/tika/mime/Magic.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/Magic.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/Magic.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/Magic.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,69 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+/**
+ * Defines a magic for a MimeType. A magic is made of one or several
+ * MagicClause.
+ * 
+ * @author J&eacute;r&ocirc;me Charron
+ */
+class Magic implements Clause {
+
+    /** Mime type this magic is attached to; null until setType is called. */
+    private MimeType type;
+    /** Priority given at construction time. */
+    private int priority;
+    /** Clause that eval and size delegate to; null until setClause is called. */
+    private Clause clause;
+
+    /** Creates a magic with a priority of 50. */
+    Magic() {
+        this(50);
+    }
+
+    Magic(int priority) {
+        this.priority = priority;
+    }
+
+    void setType(MimeType type) {
+        this.type = type;
+    }
+
+    MimeType getType() {
+        return this.type;
+    }
+
+    int getPriority() {
+        return this.priority;
+    }
+
+    void setClause(Clause clause) {
+        this.clause = clause;
+    }
+
+    public boolean eval(byte[] data) {
+        return this.clause.eval(data);
+    }
+
+    public int size() {
+        return this.clause.size();
+    }
+
+    public String toString() {
+        return "[" + priority + "/" + clause + "]";
+    }
+}

Added: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MagicClause.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MagicClause.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MagicClause.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MagicClause.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,52 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+/**
+ * Defines a MagicClause.
+ * 
+ * @author J&eacute;r&ocirc;me Charron
+ */
+class MagicClause implements Clause {
+
+    /** Operator applied to the results of the two operand clauses. */
+    private final Operator op;
+    /** First and second operand, evaluated in that order. */
+    private final Clause c1;
+    private final Clause c2;
+    /** Sum of the operand sizes, computed once at construction time. */
+    private final int size;
+
+    MagicClause(Operator op, Clause c1, Clause c2) {
+        this.op = op;
+        this.c1 = c1;
+        this.c2 = c2;
+        this.size = c1.size() + c2.size();
+    }
+
+    public boolean eval(byte[] data) {
+        boolean first = c1.eval(data);
+        return op.eval(first, c2.eval(data));
+    }
+
+    public int size() {
+        return size;
+    }
+
+    public String toString() {
+        return "(" + c1 + " " + op + " " + c2 + ")";
+    }
+}

Added: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MagicMatch.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MagicMatch.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MagicMatch.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MagicMatch.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,216 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+// JDK imports
+import java.io.ByteArrayOutputStream;
+import java.math.BigInteger;
+
+// Jakarta Commons Codec imports
+import org.apache.commons.codec.DecoderException;
+import org.apache.commons.codec.binary.Hex;
+
+/**
+ * Defines a magic match.
+ * 
+ * @author J&eacute;r&ocirc;me Charron
+ */
+class MagicMatch implements Clause {
+
+    /** Codec used to decode hexadecimal text found in string values. */
+    private final static Hex HEX_CODEC = new Hex();
+
+    /** First offset (inclusive) at which the value is looked for. */
+    private int offsetStart;
+    /** Last offset (inclusive) at which the value is looked for. */
+    private int offsetEnd;
+    /** Declared type of the value, kept for toString. */
+    private String type;
+    /** Decoded mask, or null when no mask was given. */
+    private BigInteger mask;
+    /** Expected value, already ANDed with the mask when there is one. */
+    private BigInteger value;
+    /** Number of bytes in the decoded value. */
+    private int length;
+
+    /**
+     * Creates a match of <code>value</code>, optionally masked, against the
+     * bytes found at any offset from offsetStart to offsetEnd (inclusive).
+     *
+     * @param type one of string, byte, host16, little16, big16, host32,
+     *            little32 or big32
+     * @throws MimeTypeException if the value or mask cannot be decoded for
+     *             the given type
+     */
+    MagicMatch(int offsetStart, int offsetEnd, String type, String mask,
+            String value) throws MimeTypeException {
+        this.offsetStart = offsetStart;
+        this.offsetEnd = offsetEnd;
+        this.type = type;
+        try {
+            byte[] decoded = decodeValue(type, value);
+            this.length = decoded.length;
+            this.value = new BigInteger(decoded);
+            if (mask != null) {
+                this.mask = new BigInteger(decodeValue(type, mask));
+                this.value = this.value.and(this.mask);
+            }
+        } catch (Exception e) {
+            // Also reached when decodeValue returned null (null or unknown type).
+            throw new MimeTypeException(e);
+        }
+    }
+
+    /**
+     * Converts the textual form of a value or mask into the byte sequence
+     * expected in the data. Numeric text starting with "0x" is read as
+     * hexadecimal, any other numeric text as octal.
+     *
+     * @return the decoded bytes, or null if either argument is null or the
+     *         type is not recognized
+     */
+    private byte[] decodeValue(String type, String value)
+            throws DecoderException {
+
+        // Preliminary check
+        if ((value == null) || (type == null)) {
+            return null;
+        }
+
+        byte[] decoded = null;
+        String tmpVal = value;
+        int radix = 8;
+
+        // hex
+        if (value.startsWith("0x")) {
+            tmpVal = value.substring(2);
+            radix = 16;
+        }
+
+        if (type.equals("string")) {
+            decoded = decodeString(value);
+
+        } else if (type.equals("byte")) {
+            // NOTE(review): yields the bytes of the digit text itself rather
+            // than one numeric byte - confirm this is intended.
+            decoded = tmpVal.getBytes();
+
+        } else if (type.equals("host16") || type.equals("little16")) {
+            // Least significant byte first, like host32/little32 below.
+            int i = Integer.parseInt(tmpVal, radix);
+            decoded = new byte[] { (byte) (i & 0x00FF), (byte) (i >> 8) };
+
+        } else if (type.equals("big16")) {
+            int i = Integer.parseInt(tmpVal, radix);
+            decoded = new byte[] { (byte) (i >> 8), (byte) (i & 0x00FF) };
+
+        } else if (type.equals("host32") || type.equals("little32")) {
+            long i = Long.parseLong(tmpVal, radix);
+            decoded = new byte[] { (byte) ((i & 0x000000FF)),
+                    (byte) ((i & 0x0000FF00) >> 8),
+                    (byte) ((i & 0x00FF0000) >> 16),
+                    (byte) ((i & 0xFF000000) >> 24) };
+
+        } else if (type.equals("big32")) {
+            long i = Long.parseLong(tmpVal, radix);
+            decoded = new byte[] { (byte) ((i & 0xFF000000) >> 24),
+                    (byte) ((i & 0x00FF0000) >> 16),
+                    (byte) ((i & 0x0000FF00) >> 8), (byte) ((i & 0x000000FF)) };
+        }
+        return decoded;
+    }
+
+    /**
+     * Decodes a "string" value. Text starting with "0x" is a run of hex
+     * digits. Otherwise each char becomes one byte, except these escapes:
+     * two backslashes (one backslash), backslash-x plus two hex digits, and
+     * a backslash plus up to three digits read as an octal byte value.
+     *
+     * @throws DecoderException if the value cannot be decoded
+     */
+    private byte[] decodeString(String value) throws DecoderException {
+
+        if (value.startsWith("0x")) {
+            return HEX_CODEC.decode(value.substring(2).getBytes());
+        }
+
+        try {
+            ByteArrayOutputStream decoded = new ByteArrayOutputStream();
+
+            for (int i = 0; i < value.length(); i++) {
+                if (value.charAt(i) == '\\') {
+                    if (value.charAt(i + 1) == '\\') {
+                        decoded.write('\\');
+                        i++;
+                    } else if (value.charAt(i + 1) == 'x') {
+                        decoded.write(HEX_CODEC.decode(value.substring(i + 2,
+                                i + 4).getBytes()));
+                        i += 3;
+                    } else {
+                        int j = i + 1;
+                        while ((j < i + 4) && (j < value.length())
+                                && (Character.isDigit(value.charAt(j)))) {
+                            j++;
+                        }
+                        decoded.write(Short.decode(
+                                "0" + value.substring(i + 1, j)).byteValue());
+                        i = j - 1;
+                    }
+                } else {
+                    decoded.write(value.charAt(i));
+                }
+            }
+            return decoded.toByteArray();
+        } catch (Exception e) {
+            throw new DecoderException(e.toString() + " for " + value);
+        }
+    }
+
+    /**
+     * Returns true as soon as the expected value is found at one of the
+     * offsets from offsetStart to offsetEnd; false if none matches or the
+     * data is too short for the offset being tested.
+     */
+    public boolean eval(byte[] data) {
+        for (int i = offsetStart; i <= offsetEnd; i++) {
+            if (data.length < (this.length + i)) {
+                // Not enough data...
+                return false;
+            }
+            byte[] array = new byte[this.length];
+            System.arraycopy(data, i, array, 0, this.length);
+            BigInteger content = new BigInteger(array);
+            if (mask != null) {
+                content = content.and(mask);
+            }
+            if (value.equals(content)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    public int size() {
+        return length;
+    }
+
+    public String toString() {
+        return new StringBuffer().append("[").append(offsetStart).append(":")
+                .append(offsetEnd).append("(").append(type).append(")").append(
+                        "-").append(mask).append("#").append(value).append("]")
+                .toString();
+    }
+}

Added: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,470 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+// JDK imports
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.regex.Pattern;
+import org.apache.tika.utils.StringUtil;
+
+/**
+ * Defines a Mime Content Type.
+ * 
+ * @author J&eacute;r&ocirc;me Charron
+ * @author Hari Kodungallur
+ */
+public final class MimeType {
+
+    /** The primary and sub types separator */
+    private final static String SEPARATOR = "/";
+
+    /** The parameters separator */
+    private final static String PARAMS_SEP = ";";
+
+    /** Special characters not allowed in content types. */
+    private final static String SPECIALS = "()<>@,;:\\\"/[]?=";
+
+    /** The Mime-Type full name */
+    private String name = null;
+
+    /** The Mime-Type primary type */
+    private String primary = null;
+
+    /** The Mime-Type sub type */
+    private String sub = null;
+
+    /** The Mime-Type description */
+    private String description = null;
+
+    /** The Mime-Type associated recognition patterns */
+    private Patterns patterns = null;
+
+    /** The magics associated to this Mime-Type */
+    private ArrayList magics = null;
+
+    /** The aliases Mime-Types for this one */
+    private ArrayList aliases = null;
+
+    /** The root-XML associated to this Mime-Type */
+    private ArrayList rootXML = null;
+
+    /** The sub-class-of associated to this Mime-Type */
+    private ArrayList superTypes = null;
+
+    /** The mime-type level (regarding its subTypes) */
+    private int level = 0;
+
+    /** The minimum length of data to provide for magic analysis */
+    private int minLength = 0;
+
+    /**
+     * Creates a MimeType from a String.
+     * 
+     * @param name
+     *            the MIME content type String.
+     */
+    public MimeType(String name) throws MimeTypeException {
+
+        if (name == null || name.length() <= 0) {
+            throw new MimeTypeException("The type can not be null or empty");
+        }
+
+        // Split the two parts of the Mime Content Type
+        String[] parts = name.split(SEPARATOR, 2);
+
+        // Checks validity of the parts
+        if (parts.length != 2) {
+            throw new MimeTypeException("Invalid Content Type " + name);
+        }
+        init(parts[0], parts[1]);
+    }
+
+    /**
+     * Creates a MimeType with the given primary type and sub type.
+     * 
+     * @param primary
+     *            the content type primary type.
+     * @param sub
+     *            the content type sub type.
+     */
+    public MimeType(String primary, String sub) throws MimeTypeException {
+        init(primary, sub);
+    }
+
+    /** Validates both parts and initializes every field; used by constructors. */
+    private void init(String primary, String sub) throws MimeTypeException {
+        // Preliminary checks...
+        if ((primary == null) || (primary.length() <= 0) || (!isValid(primary))) {
+            throw new MimeTypeException("Invalid Primary Type " + primary);
+        }
+        // Remove optional parameters from the sub type; unlike split()[0] this
+        // does not fail when the sub type consists only of separators.
+        String clearedSub = null;
+        if (sub != null) {
+            int paramsAt = sub.indexOf(PARAMS_SEP);
+            clearedSub = (paramsAt < 0) ? sub : sub.substring(0, paramsAt);
+        }
+        if ((clearedSub == null) || (clearedSub.length() <= 0)
+                || (!isValid(clearedSub))) {
+            throw new MimeTypeException("Invalid Sub Type " + clearedSub);
+        }
+        // All is ok, assign values
+        this.primary = primary.toLowerCase().trim();
+        this.sub = clearedSub.toLowerCase().trim();
+        this.name = this.primary + SEPARATOR + this.sub;
+        this.patterns = new Patterns();
+        this.magics = new ArrayList();
+        this.aliases = new ArrayList();
+        this.rootXML = new ArrayList();
+        this.superTypes = new ArrayList();
+    }
+
+    /**
+     * Cleans a content-type. This method cleans a content-type by removing its
+     * optional parameters and returning only its
+     * <code>primary-type/sub-type</code>.
+     * 
+     * @param type
+     *            is the content-type to clean.
+     * @return the cleaned version of the specified content-type.
+     * @throws MimeTypeException
+     *             if something wrong occurs during the parsing/cleaning of the
+     *             specified type.
+     */
+    public final static String clean(String type) throws MimeTypeException {
+        return (new MimeType(type)).getName();
+    }
+
+    /**
+     * Return the name of this mime-type.
+     * 
+     * @return the name of this mime-type.
+     */
+    public String getName() {
+        return name;
+    }
+
+    /**
+     * Return the primary type of this mime-type.
+     * 
+     * @return the primary type of this mime-type.
+     */
+    public String getPrimaryType() {
+        return primary;
+    }
+
+    /**
+     * Return the sub type of this mime-type.
+     * 
+     * @return the sub type of this mime-type.
+     */
+    public String getSubType() {
+        return sub;
+    }
+
+    // Inherited Javadoc
+    public String toString() {
+        StringBuffer buf = new StringBuffer();
+        buf.append(name).append(" -- ").append(getDescription()).append("\n")
+                .append("Aliases: ");
+        if (aliases.size() < 1) {
+            buf.append(" NONE");
+        }
+        buf.append("\n");
+        for (int i = 0; i < aliases.size(); i++) {
+            buf.append("\t").append((String) aliases.get(i)).append("\n");
+        }
+        buf.append("Patterns:");
+        String[] patterns = this.patterns.getPatterns();
+        if (patterns.length < 1) {
+            buf.append(" NONE");
+        }
+        buf.append("\n");
+        for (int i = 0; i < patterns.length; i++) {
+            buf.append("\t").append(patterns[i]).append("\n");
+        }
+        buf.append("Magics:  ");
+        if (magics.size() < 1) {
+            buf.append(" NONE");
+        }
+        buf.append("\n");
+        for (int i = 0; i < magics.size(); i++) {
+            buf.append("\t").append((Magic) magics.get(i)).append("\n");
+        }
+
+        return buf.toString();
+    }
+
+    /**
+     * Indicates if an object is equal to this mime-type. The specified object
+     * is equal to this mime-type if it is not null, and it is an instance of
+     * MimeType and its name is equals to this mime-type.
+     * 
+     * @param object
+     *            the reference object with which to compare.
+     * @return <code>true</code> if this mime-type is equal to the object
+     *         argument; <code>false</code> otherwise.
+     */
+    public boolean equals(Object object) {
+        // instanceof is false for null, so no exception handling is needed.
+        if (!(object instanceof MimeType)) {
+            return false;
+        }
+        return ((MimeType) object).getName().equals(this.name);
+    }
+
+    // Inherited Javadoc
+    public int hashCode() {
+        return name.hashCode();
+    }
+
+    /**
+     * Return the description of this mime-type.
+     * 
+     * @return the description of this mime-type.
+     */
+    public String getDescription() {
+        return description;
+    }
+
+    /**
+     * Set the description of this mime-type.
+     * 
+     * @param description
+     *            the description of this mime-type.
+     */
+    void setDescription(String description) {
+        this.description = description;
+    }
+
+    /**
+     * Add a supported file-naming pattern.
+     * 
+     * @param pattern
+     *            to add to the list of recognition pattern for this mime-type.
+     */
+    void addPattern(String pattern) {
+        patterns.add(pattern, this);
+    }
+
+    /**
+     * Return the recognition patterns for this mime-type
+     * 
+     * @return the recognition patterns associated to this mime-type.
+     */
+    String[] getPatterns() {
+        return patterns.getPatterns();
+    }
+
+    /**
+     * Add an alias to this mime-type. The alias is appended to the collection
+     * returned (as a copy) by getAliases().
+     * 
+     * @param alias
+     *            the alias to add to this mime-type.
+     */
+    void addAlias(String alias) {
+        aliases.add(alias);
+    }
+
+    /**
+     * Add some rootXML info to this mime-type. A new RootXML description bound
+     * to this mime-type is created from the two arguments; the RootXML
+     * constructor throws IllegalArgumentException when both are empty.
+     * 
+     * @param namespaceURI
+     *            the namespace URI expected on the root element; may be empty
+     *            if localName is not.
+     * @param localName
+     *            the local name expected for the root element; may be empty
+     *            if namespaceURI is not.
+     */
+    void addRootXML(String namespaceURI, String localName) {
+        rootXML.add(new RootXML(this, namespaceURI, localName));
+    }
+
+    /**
+     * Tests the given data against every RootXML description registered on
+     * this mime-type.
+     * 
+     * @param data
+     *            the bytes to test; they are decoded to a String with the
+     *            platform default charset.
+     * @return <code>true</code> as soon as one RootXML description matches,
+     *         <code>false</code> if none does.
+     */
+    boolean matchesXML(byte[] data) {
+        String text = new String(data);
+        int index = 0;
+        while (index < rootXML.size()) {
+            RootXML candidate = (RootXML) rootXML.get(index);
+            if (candidate.matches(text)) {
+                return true;
+            }
+            index++;
+        }
+        return false;
+    }
+
+    /** Tells whether at least one RootXML description has been added. */
+    boolean hasRootXML() {
+        return !rootXML.isEmpty();
+    }
+
+    /** Returns a newly allocated array holding the RootXML descriptions. */
+    RootXML[] getRootXMLs() {
+        RootXML[] snapshot = new RootXML[rootXML.size()];
+        rootXML.toArray(snapshot);
+        return snapshot;
+    }
+
+    /**
+     * Records the given type name as a super type of this mime-type.
+     * 
+     * @param type
+     *            the name of the super type.
+     */
+    void addSuperType(String type) {
+        superTypes.add(type);
+    }
+
+    /** Tells whether at least one super type has been recorded. */
+    boolean hasSuperType() {
+        return !superTypes.isEmpty();
+    }
+
+    /**
+     * Returns the super types of this mime-type. A type is a super type of
+     * another type if any instance of the second type is also an instance of
+     * the first.
+     */
+    public String[] getSuperTypes() {
+        return (String[]) superTypes.toArray(new String[superTypes.size()]);
+    }
+
+    /** Returns the current value of this type's level counter. */
+    int getLevel() {
+        return level;
+    }
+
+    /** Increments this type's level counter by one. */
+    void incLevel() {
+        this.level++;
+    }
+
+    /**
+     * Return the aliases of this mime-type.
+     * 
+     * @return a newly allocated array holding the aliases added to this
+     *         mime-type; empty if there is none.
+     */
+    public String[] getAliases() {
+        return (String[]) aliases.toArray(new String[aliases.size()]);
+    }
+
+    /** Returns a newly allocated array holding the magics of this type. */
+    Magic[] getMagics() {
+        Magic[] snapshot = new Magic[magics.size()];
+        magics.toArray(snapshot);
+        return snapshot;
+    }
+
+    /**
+     * Adds a magic to this mime-type and re-sorts the magics with
+     * MimeTypes.MAGICS_COMPARATOR. A <code>null</code> argument is ignored.
+     * 
+     * @param magic
+     *            the magic to add; may be <code>null</code>.
+     */
+    void addMagic(Magic magic) {
+        if (magic != null) {
+            magics.add(magic);
+            Collections.sort(magics, MimeTypes.MAGICS_COMPARATOR);
+        }
+    }
+
+    /** Returns the current value of this type's minLength field. */
+    int getMinLength() {
+        return minLength;
+    }
+
+    /** Tells whether at least one magic has been added to this mime-type. */
+    public boolean hasMagic() {
+        return !magics.isEmpty();
+    }
+
+    /**
+     * Tells whether the given string is recognized by the patterns of this
+     * mime-type.
+     * 
+     * @param url
+     *            the string handed to this type's pattern matcher.
+     * @return <code>true</code> if the pattern matcher resolves the string to
+     *         this very instance (identity comparison).
+     */
+    public boolean matches(String url) {
+        return (patterns.matches(url) == this);
+    }
+
+    public boolean matchesMagic(byte[] data) {
+        for (int i = 0; i < magics.size(); i++) {
+            Magic magic = (Magic) magics.get(i);
+            if (magic.eval(data)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Tells whether the given data matches this mime-type, either through one
+     * of its RootXML descriptions or through one of its magics.
+     * 
+     * @param data
+     *            the bytes to test.
+     * @return <code>true</code> if matchesXML or matchesMagic accepts the
+     *         data; the magics are only evaluated when the XML check fails.
+     */
+    public boolean matches(byte[] data) {
+        return matchesXML(data) || matchesMagic(data);
+    }
+
+    /**
+     * Checks if the specified primary or sub type is valid: it must be
+     * non-null, non-blank, and free of the characters rejected by
+     * hasCtrlOrSpecials.
+     */
+    private boolean isValid(String type) {
+        if (type == null) {
+            return false;
+        }
+        if (type.trim().length() == 0) {
+            return false;
+        }
+        return !hasCtrlOrSpecials(type);
+    }
+
+    /**
+     * Checks if the specified string contains a control character, a space,
+     * or one of the characters listed in <code>SPECIALS</code>.
+     * 
+     * @param type
+     *            the primary or sub type to inspect; must not be null.
+     * @return <code>true</code> if at least one such character is found.
+     */
+    private boolean hasCtrlOrSpecials(String type) {
+        for (int i = 0; i < type.length(); i++) {
+            char c = type.charAt(i);
+            // The former bound '\032' is an octal escape (decimal 26), which
+            // let ESC..US and the space character (27..32) slip through.
+            // indexOf returns 0 for the first character of SPECIALS, so the
+            // test must be ">= 0" or that character would never be rejected.
+            if (c <= ' ' || SPECIALS.indexOf(c) >= 0) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Defines a RootXML description. RootXML is made of a localName and/or a
+     * namespaceURI.
+     */
+    class RootXML {
+
+        private final static int PATTERN_FLAGS = Pattern.CASE_INSENSITIVE
+                | Pattern.DOTALL | Pattern.MULTILINE;
+
+        private MimeType type = null;
+
+        private String namespaceURI = null;
+
+        private String localName = null;
+
+        private Pattern pattern = null;
+
+        RootXML(MimeType type, String namespaceURI, String localName) {
+            this.type = type;
+            this.namespaceURI = namespaceURI;
+            this.localName = localName;
+            if ((StringUtil.isEmpty(namespaceURI))
+                    && (StringUtil.isEmpty(localName))) {
+                throw new IllegalArgumentException(
+                        "Both namespaceURI and localName cannot be null");
+            }
+            String regex = null;
+            if (StringUtil.isEmpty(namespaceURI)) {
+                regex = ".*<" + localName + "[^<>]*>.*";
+            } else if (StringUtil.isEmpty(localName)) {
+                regex = ".*<[^<>]*\\p{Space}xmlns=[\"\']?" + namespaceURI
+                        + "[\"\']?[^<>]*>.*";
+            } else {
+                regex = ".*<" + localName + "[^<>]*\\p{Space}xmlns=[\"\']?"
+                        + namespaceURI + "[\"\']?[^<>]*>.*";
+            }
+            this.pattern = Pattern.compile(regex, PATTERN_FLAGS);
+        }
+
+        boolean matches(byte[] data) {
+            return matches(new String(data));
+        }
+
+        boolean matches(String data) {
+            return pattern.matcher(data).matches();
+        }
+
+        MimeType getType() {
+            return type;
+        }
+
+        String getNameSpaceURI() {
+            return namespaceURI;
+        }
+
+        String getLocalName() {
+            return localName;
+        }
+
+        public String toString() {
+            return new StringBuffer().append(type.getName()).append(", ")
+                    .append(namespaceURI).append(", ").append(localName)
+                    .toString();
+        }
+    }
+}

Added: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypeException.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypeException.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypeException.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypeException.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,52 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+/**
+ * A class to encapsulate MimeType related exceptions.
+ * 
+ * @author Hari Kodungallur
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class MimeTypeException extends Exception {
+
+    /** Explicit serialization identifier (Exception is Serializable). */
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * Constructs a MimeTypeException with no specified detail message.
+     */
+    public MimeTypeException() {
+        super();
+    }
+
+    /**
+     * Constructs a MimeTypeException with the specified detail message.
+     * 
+     * @param msg
+     *            the detail message.
+     */
+    public MimeTypeException(String msg) {
+        super(msg);
+    }
+
+    /**
+     * Constructs a MimeTypeException with the specified cause.
+     * 
+     * @param t
+     *            the cause.
+     */
+    public MimeTypeException(Throwable t) {
+        super(t);
+    }
+
+    /**
+     * Constructs a MimeTypeException with the specified detail message and
+     * cause, so that callers can add context without losing the original
+     * exception.
+     * 
+     * @param msg
+     *            the detail message.
+     * @param t
+     *            the cause.
+     */
+    public MimeTypeException(String msg, Throwable t) {
+        super(msg, t);
+    }
+}

Added: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,413 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+// JDK imports
+import java.io.File;
+import java.net.URL;
+import java.util.Map;
+import java.util.HashMap;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+import org.apache.commons.logging.LogFactory;
+import org.w3c.dom.Document;
+
+// Commons Logging imports
+import org.apache.commons.logging.Log;
+
+/**
+ * This class is a MimeType repository. It gathers a set of MimeTypes and
+ * enables to retrieves a content-type from its name, from a file name, or from
+ * a magic character sequence.
+ * 
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public final class MimeTypes {
+
+    /** The default <code>application/octet-stream</code> MimeType */
+    public final static String DEFAULT = "application/octet-stream";
+
+    /** My logger */
+    private Log logger = null;
+
+    /** All the registered MimeTypes indexed on their name */
+    private Map types = new HashMap();
+
+    /** The patterns matcher */
+    private Patterns patterns = new Patterns();
+
+    /** List of all registered magics */
+    private ArrayList magics = new ArrayList();
+
+    /** List of all registered rootXML */
+    private ArrayList xmls = new ArrayList();
+
+    private Map unsolvedDeps = new HashMap();
+
+    /**
+     * A comparator used to sort magics: first on descending priority, then on
+     * descending size. Values are compared explicitly rather than subtracted,
+     * because priorities come from the definitions file and a subtraction of
+     * two arbitrary ints can overflow and yield the wrong sign.
+     */
+    final static Comparator MAGICS_COMPARATOR = new Comparator() {
+        public int compare(Object o1, Object o2) {
+            Magic m1 = (Magic) o1;
+            Magic m2 = (Magic) o2;
+            int p1 = m1.getPriority();
+            int p2 = m2.getPriority();
+            if (p1 != p2) {
+                // Higher priority sorts first
+                return (p2 < p1) ? -1 : 1;
+            }
+            int s1 = m1.size();
+            int s2 = m2.size();
+            if (s1 == s2) {
+                return 0;
+            }
+            // Larger magic sorts first
+            return (s2 < s1) ? -1 : 1;
+        }
+    };
+
+    /**
+     * A comparator used to sort the mime types based on their level (the level
+     * is the number of super-types for a type)
+     */
+    private final static Comparator LEVELS_COMPARATOR = new Comparator() {
+        public int compare(Object o1, Object o2) {
+            return ((MimeInfo) o2).getLevel() - ((MimeInfo) o1).getLevel();
+        }
+    };
+
+    /** The minimum length of data to provide to check all MimeTypes */
+    private int minLength = 0;
+
+    /**
+     * Creates a new MimeTypes instance and fills it with the mime-types read
+     * from the given definitions file.
+     * 
+     * @param filepath
+     *            the mime-types definitions xml file, handed to
+     *            MimeTypesReader.read(String).
+     * @param logger
+     *            the logger to use for output messages; if <code>null</code>,
+     *            a logger named after this class is obtained from LogFactory.
+     */
+    public MimeTypes(String filepath, Log logger) {
+        if (logger == null) {
+            this.logger = LogFactory.getLog(this.getClass());
+        } else {
+            this.logger = logger;
+        }
+        // The reader receives the caller's logger argument (possibly null),
+        // not this.logger.
+        MimeTypesReader reader = new MimeTypesReader(logger);
+        add(reader.read(filepath));
+    }
+
+    /**
+     * Creates a new MimeTypes instance for the specified definitions file,
+     * using the default logger.
+     * 
+     * @param filepath
+     *            the mime-types definitions xml file.
+     */
+    public MimeTypes(String filepath) {
+        this(filepath, (Log) null);
+    }
+
+    /**
+     * Creates a new MimeTypes instance and fills it with the mime-types read
+     * from the given document.
+     * 
+     * @param doc
+     *            the document of the mime types definition file.
+     * @param logger
+     *            the logger to use for output messages; if <code>null</code>,
+     *            a logger named after this class is obtained from LogFactory.
+     */
+    public MimeTypes(Document doc, Log logger) {
+        if (logger == null) {
+            this.logger = LogFactory.getLog(this.getClass());
+        } else {
+            this.logger = logger;
+        }
+        // The reader receives the caller's logger argument (possibly null),
+        // not this.logger.
+        MimeTypesReader reader = new MimeTypesReader(logger);
+        add(reader.read(doc));
+    }
+
+    /**
+     * Creates a new MimeTypes instance from the given document, using the
+     * default logger.
+     * 
+     * @param doc
+     *            the document of the mime types definition file.
+     */
+    public MimeTypes(Document doc) {
+        this(doc, (Log) null);
+    }
+
+    /**
+     * Find the Mime Content Type of a file, based on its name only (the
+     * content of the file is not read).
+     * 
+     * @param file
+     *            to analyze.
+     * @return the result of {@link #getMimeType(String)} applied to the name
+     *         of the file.
+     */
+    public MimeType getMimeType(File file) {
+        return getMimeType(file.getName());
+    }
+
+    /**
+     * Find the Mime Content Type of a document from the path part of its URL.
+     * 
+     * @param url
+     *            of the document to analyze.
+     * @return the result of {@link #getMimeType(String)} applied to the path
+     *         of the URL.
+     */
+    public MimeType getMimeType(URL url) {
+        return getMimeType(url.getPath());
+    }
+
+    /**
+     * Find the Mime Content Type of a document from its name.
+     * 
+     * @param name
+     *            of the document to analyze.
+     * @return the type resolved by the registered patterns, or, when no
+     *         pattern matches, the result of looking up {@link #DEFAULT} with
+     *         {@link #forName(String)} (<code>null</code> if that type is not
+     *         registered).
+     */
+    public MimeType getMimeType(String name) {
+        MimeType matched = patterns.matches(name);
+        // Fall back to the default type when no pattern recognizes the name
+        return (matched != null) ? matched : forName(DEFAULT);
+    }
+
+    /**
+     * Find the Mime Content Type of a stream from its content.
+     * 
+     * @param data
+     *            are the first bytes of data of the content to analyze. The
+     *            types having a RootXML description are tried first, in the
+     *            order of the xml index, then every registered magic is
+     *            evaluated in the order of the magics index.
+     * 
+     * @return The Mime Content Type found for the specified data, or
+     *         <code>null</code> if the data is null or empty, or if none is
+     *         found.
+     * @see #getMinLength()
+     */
+    public MimeType getMimeType(byte[] data) {
+        // Nothing to analyze
+        if (data == null || data.length == 0) {
+            return null;
+        }
+
+        // XML descriptions take precedence over magic bytes
+        for (Iterator it = xmls.iterator(); it.hasNext();) {
+            MimeType candidate = ((MimeInfo) it.next()).getType();
+            if (candidate.matchesXML(data)) {
+                return candidate;
+            }
+        }
+
+        // Magic bytes
+        for (Iterator it = magics.iterator(); it.hasNext();) {
+            Magic candidate = (Magic) it.next();
+            if (candidate.eval(data)) {
+                return candidate.getType();
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Find the Mime Content Type of a document from its name and its content.
+     * The policy used to guess the Mime Content Type is:
+     * <ol>
+     * <li>Try to find the type based on the provided data.</li>
+     * <li>If a type is found, then return it, otherwise return the type
+     * found from the document name.</li>
+     * </ol>
+     * 
+     * @param name
+     *            of the document to analyze.
+     * @param data
+     *            are the first bytes of the document's content.
+     * @return the type found from the content, or else the result of
+     *         {@link #getMimeType(String)} for the name.
+     * @see #getMinLength()
+     */
+    public MimeType getMimeType(String name, byte[] data) {
+        MimeType fromContent = getMimeType(data);
+        if (fromContent != null) {
+            return fromContent;
+        }
+        // The content was not conclusive: rely on the document name
+        return getMimeType(name);
+    }
+
+    /**
+     * Find a Mime Content Type from its name.
+     * 
+     * @param name
+     *            is the content type name, used as the lookup key in the
+     *            repository.
+     * @return the MimeType for the specified name, or <code>null</code> if no
+     *         MimeType is registered for this name.
+     */
+    public MimeType forName(String name) {
+        MimeInfo entry = (MimeInfo) types.get(name);
+        if (entry == null) {
+            return null;
+        }
+        return entry.getType();
+    }
+
+    /**
+     * Return the minimum length of data to provide to analyzing methods based
+     * on the document's content in order to check all the known MimeTypes.
+     * 
+     * @return the minimum length of data to provide; currently the fixed
+     *         value 1024.
+     * @see #getMimeType(byte[])
+     * @see #getMimeType(String, byte[])
+     */
+    public int getMinLength() {
+        // NOTE(review): a constant is returned; the minLength field kept up
+        // to date by add(MimeType) is not used here (see the disabled line
+        // below) -- confirm whether this is intentional.
+        return 1024;
+        // return minLength;
+    }
+
+    /**
+     * Add the specified mime-types in the repository, one by one and in array
+     * order. A <code>null</code> array is ignored.
+     * 
+     * @param types
+     *            are the mime-types to add.
+     */
+    void add(MimeType[] types) {
+        if (types != null) {
+            int index = 0;
+            while (index < types.length) {
+                add(types[index]);
+                index++;
+            }
+        }
+    }
+
+    /**
+     * Add the specified mime-type in the repository: the type is indexed on
+     * its name, its level is adjusted from the types that were waiting for
+     * it as a super-type, and the patterns, magics and rootXML indexes are
+     * updated. A <code>null</code> type is ignored.
+     * 
+     * @param type
+     *            is the mime-type to add.
+     */
+    void add(MimeType type) {
+
+        if (type == null) {
+            return;
+        }
+
+        // Add the new type in the repository (replaces any entry registered
+        // under the same name)
+        MimeInfo info = new MimeInfo(type);
+        types.put(info.getName(), info);
+
+        // Checks for some unsolved dependencies on this new type: types added
+        // earlier that named this one as a super-type. The new type gets a
+        // level one above the highest of them.
+        List deps = (List) unsolvedDeps.get(info.getName());
+        if (deps != null) {
+            int level = info.getLevel();
+            for (int i = 0; i < deps.size(); i++) {
+                level = Math
+                        .max(level, ((MimeInfo) deps.get(i)).getLevel() + 1);
+            }
+            info.setLevel(level);
+            unsolvedDeps.remove(info.getName());
+        }
+
+        // Checks if some of my super-types are not already solved: each
+        // super-type that is not registered yet is remembered so that its
+        // level can be computed when it is added.
+        // NOTE(review): nothing is done here for a super-type that is already
+        // registered -- confirm its level does not need to be raised.
+        String[] superTypes = type.getSuperTypes();
+        for (int i = 0; i < superTypes.length; i++) {
+            MimeInfo superType = (MimeInfo) types.get(superTypes[i]);
+            if (superType == null) {
+                deps = (List) unsolvedDeps.get(superTypes[i]);
+                if (deps == null) {
+                    deps = new ArrayList();
+                    unsolvedDeps.put(superTypes[i], deps);
+                }
+                deps.add(info);
+            }
+        }
+
+        // Update minLength
+        minLength = Math.max(minLength, type.getMinLength());
+        // Update the extensions index...
+        patterns.add(type.getPatterns(), type);
+        // Update the magics index...
+        if (type.hasMagic()) {
+            // This local array shadows the magics field inside this block
+            Magic[] magics = type.getMagics();
+            for (int i = 0; i < magics.length; i++) {
+                this.magics.add(magics[i]);
+            }
+        }
+        // Here "magics" is the field again
+        Collections.sort(magics, MAGICS_COMPARATOR);
+
+        // Update the xml (xmlRoot) index...
+        if (type.hasRootXML()) {
+            this.xmls.add(info);
+        }
+        Collections.sort(xmls, LEVELS_COMPARATOR);
+    }
+
+    /**
+     * Returns the textual form of every registered mime-type, one per line,
+     * in the iteration order of the repository map.
+     */
+    public String toString() {
+        StringBuffer text = new StringBuffer();
+        for (Iterator it = types.values().iterator(); it.hasNext();) {
+            MimeInfo entry = (MimeInfo) it.next();
+            text.append(entry.getType()).append("\n");
+        }
+        return text.toString();
+    }
+
+    /**
+     * Repository entry pairing a MimeType with the level used to order the
+     * rootXML index. As an inner class it reads the enclosing repository's
+     * types map.
+     */
+    private final class MimeInfo {
+
+        /** The wrapped mime-type */
+        private MimeType type = null;
+
+        /** The level of the wrapped type; starts at 0 and only grows */
+        private int level = 0;
+
+        MimeInfo(MimeType type) {
+            this.type = type;
+            this.level = 0;
+        }
+
+        /** Returns the wrapped mime-type. */
+        MimeType getType() {
+            return type;
+        }
+
+        /** Returns the current level. */
+        int getLevel() {
+            return level;
+        }
+
+        /**
+         * Raises the level of this entry and propagates level + 1 to each of
+         * its super-types currently registered in the repository. A value
+         * that is not greater than the current level is ignored, which also
+         * stops the propagation.
+         */
+        void setLevel(int level) {
+            if (level <= this.level) {
+                return;
+            }
+
+            this.level = level;
+            // Update all my super-types
+            String[] supers = type.getSuperTypes();
+            for (int i = 0; i < supers.length; i++) {
+                MimeInfo sup = (MimeInfo) types.get(supers[i]);
+                if (sup != null) {
+                    sup.setLevel(level + 1);
+                }
+            }
+        }
+
+        /** Returns the name of the wrapped mime-type. */
+        String getName() {
+            return type.getName();
+        }
+    }
+}

Added: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,383 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+// Commons Logging imports
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+// DOM imports
+import org.w3c.dom.Attr;
+import org.w3c.dom.Node;
+import org.w3c.dom.Element;
+import org.w3c.dom.Document;
+import org.w3c.dom.NodeList;
+import org.w3c.dom.NamedNodeMap;
+import org.xml.sax.InputSource;
+
+// JDK imports
+import java.io.InputStream;
+import java.util.ArrayList;
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+
+/**
+ * A reader for XML files compliant with the freedesktop MIME-info DTD.
+ * 
+ * <pre>
+ *  &lt;!DOCTYPE mime-info [
+ *    &lt;!ELEMENT mime-info (mime-type)+&gt;
+ *    &lt;!ATTLIST mime-info xmlns CDATA #FIXED &quot;http://www.freedesktop.org/standards/shared-mime-info&quot;&gt;
+ * 
+ *    &lt;!ELEMENT mime-type (comment|acronym|expanded-acronym|glob|magic|root-XML|alias|sub-class-of)*&gt;
+ *    &lt;!ATTLIST mime-type type CDATA #REQUIRED&gt;
+ * 
+ *    &lt;!-- a comment describing a document with the respective MIME type. Example: &quot;WMV video&quot; --&gt;
+ *    &lt;!ELEMENT comment (#PCDATA)&gt;
+ *    &lt;!ATTLIST comment xml:lang CDATA #IMPLIED&gt;
+ * 
+ *    &lt;!-- a comment describing a the respective unexpanded MIME type acronym. Example: &quot;WMV&quot; --&gt;
+ *    &lt;!ELEMENT acronym (#PCDATA)&gt;
+ *    &lt;!ATTLIST acronym xml:lang CDATA #IMPLIED&gt;
+ * 
+ *    &lt;!-- a comment describing a the respective unexpanded MIME type acronym. Example: &quot;Windows Media Video&quot; --&gt;
+ *    &lt;!ELEMENT expanded-acronym (#PCDATA)&gt;
+ *    &lt;!ATTLIST expanded-acronym xml:lang CDATA #IMPLIED&gt;
+ * 
+ *    &lt;!ELEMENT glob EMPTY&gt;
+ *    &lt;!ATTLIST glob pattern CDATA #REQUIRED&gt;
+ * 
+ *    &lt;!ELEMENT magic (match)+&gt;
+ *    &lt;!ATTLIST magic priority CDATA #IMPLIED&gt;
+ * 
+ *    &lt;!ELEMENT match (match)*&gt;
+ *    &lt;!ATTLIST match offset CDATA #REQUIRED&gt;
+ *    &lt;!ATTLIST match type (string|big16|big32|little16|little32|host16|host32|byte) #REQUIRED&gt;
+ *    &lt;!ATTLIST match value CDATA #REQUIRED&gt;
+ *    &lt;!ATTLIST match mask CDATA #IMPLIED&gt;
+ * 
+ *    &lt;!ELEMENT root-XML EMPTY&gt;
+ *    &lt;!ATTLIST root-XML
+ *          namespaceURI CDATA #REQUIRED
+ *          localName CDATA #REQUIRED&gt;
+ * 
+ *    &lt;!ELEMENT alias EMPTY&gt;
+ *    &lt;!ATTLIST alias
+ *          type CDATA #REQUIRED&gt;
+ * 
+ *   &lt;!ELEMENT sub-class-of EMPTY&gt;
+ *   &lt;!ATTLIST sub-class-of
+ *         type CDATA #REQUIRED&gt;
+ *  ]&gt;
+ * </pre>
+ * 
+ * 
+ * @see http://freedesktop.org/wiki/Standards_2fshared_2dmime_2dinfo_2dspec
+ * @author J&eacute;r&ocirc;me Charron
+ */
+final class MimeTypesReader {
+
+    /** The logger to use */
+    private Log logger = null;
+
+    /** Creates a reader that logs to the default logger of this class. */
+    MimeTypesReader() {
+        this(null);
+    }
+
+    /**
+     * Creates a reader.
+     * 
+     * @param logger
+     *            the logger to use; if <code>null</code>, a logger named
+     *            after this class is obtained from LogFactory.
+     */
+    MimeTypesReader(Log logger) {
+        this.logger = (logger != null) ? logger : LogFactory.getLog(this
+                .getClass());
+    }
+
+    /**
+     * Reads the mime-types defined in a classpath resource.
+     * 
+     * @param filepath
+     *            the name of the resource, resolved with the class loader of
+     *            this class.
+     * @return the mime-types read; an empty array if the resource does not
+     *         exist or cannot be parsed.
+     */
+    MimeType[] read(String filepath) {
+        InputStream stream = MimeTypesReader.class.getClassLoader()
+                .getResourceAsStream(filepath);
+        if (stream == null) {
+            if (logger.isWarnEnabled()) {
+                logger.warn("Resource [" + filepath
+                        + "] not found while loading mime-types");
+            }
+            return new MimeType[0];
+        }
+        try {
+            return read(stream);
+        } finally {
+            // The stream was opened here, so it is released here; the XML
+            // parser is not guaranteed to close it.
+            try {
+                stream.close();
+            } catch (java.io.IOException ignored) {
+                // Best effort: the parsing outcome is already determined.
+            }
+        }
+    }
+
+    /**
+     * Parses the given stream as an XML document and reads the mime-types it
+     * defines.
+     * 
+     * @param stream
+     *            the XML content to parse.
+     * @return the mime-types read; an empty array (after a warning is
+     *         logged) if anything goes wrong.
+     */
+    MimeType[] read(InputStream stream) {
+        try {
+            DocumentBuilder parser = DocumentBuilderFactory.newInstance()
+                    .newDocumentBuilder();
+            Document parsed = parser.parse(new InputSource(stream));
+            return read(parsed);
+        } catch (Exception e) {
+            if (logger.isWarnEnabled()) {
+                logger.warn(e.toString() + " while loading mime-types");
+            }
+            return new MimeType[0];
+        }
+    }
+
+    /**
+     * Reads the mime-types defined in the given document.
+     * 
+     * @param document
+     *            the parsed definitions.
+     * @return the mime-types read; an empty array if the document has no
+     *         root element or if its root is not named "mime-info".
+     */
+    MimeType[] read(Document document) {
+        Element root = document.getDocumentElement();
+        if ((root == null) || !root.getTagName().equals("mime-info")) {
+            return new MimeType[0];
+        }
+        MimeType[] parsed = readMimeInfo(root);
+        return (parsed != null) ? parsed : new MimeType[0];
+    }
+
+    /**
+     * Read Element named mime-info: every child element named "mime-type" is
+     * converted, and the ones that could be read are returned in document
+     * order.
+     */
+    private MimeType[] readMimeInfo(Element element) {
+        ArrayList collected = new ArrayList();
+        NodeList children = element.getChildNodes();
+        for (int i = 0; i < children.getLength(); i++) {
+            Node child = children.item(i);
+            if (child.getNodeType() != Node.ELEMENT_NODE) {
+                continue;
+            }
+            Element childElement = (Element) child;
+            if (!childElement.getTagName().equals("mime-type")) {
+                continue;
+            }
+            MimeType parsed = readMimeType(childElement);
+            if (parsed != null) {
+                collected.add(parsed);
+            }
+        }
+        return (MimeType[]) collected.toArray(new MimeType[collected.size()]);
+    }
+
+    /**
+     * Read Element named mime-type.
+     * 
+     * @param element
+     *            the mime-type element.
+     * @return the mime-type built from the "type" attribute and the child
+     *         elements, or <code>null</code> if the type name is rejected by
+     *         the MimeType constructor.
+     */
+    private MimeType readMimeType(Element element) {
+
+        MimeType type = null;
+
+        try {
+            type = new MimeType(element.getAttribute("type"));
+        } catch (MimeTypeException mte) {
+            // Mime Type not valid... just ignore it
+            if (logger.isInfoEnabled()) {
+                logger.info(mte.toString() + " ... Ignoring!");
+            }
+            return null;
+        }
+
+        NodeList nodes = element.getChildNodes();
+        for (int i = 0; i < nodes.getLength(); i++) {
+            Node node = nodes.item(i);
+            if (node.getNodeType() != Node.ELEMENT_NODE) {
+                continue;
+            }
+            Element nodeElement = (Element) node;
+            String tag = nodeElement.getTagName();
+            if (tag.equals("_comment")) {
+                // An empty comment element has no child node; skipping it
+                // avoids a NullPointerException that would abort the loading
+                // of the whole definitions file.
+                Node text = nodeElement.getFirstChild();
+                if (text != null) {
+                    type.setDescription(text.getNodeValue());
+                }
+            } else if (tag.equals("glob")) {
+                readGlob(nodeElement, type);
+            } else if (tag.equals("magic")) {
+                readMagic(nodeElement, type);
+            } else if (tag.equals("alias")) {
+                readAlias(nodeElement, type);
+            } else if (tag.equals("root-XML")) {
+                readRootXML(nodeElement, type);
+            } else if (tag.equals("sub-class-of")) {
+                readSubClassOf(nodeElement, type);
+            }
+        }
+        return type;
+    }
+
+    /**
+     * Read Element named glob: its "pattern" attribute is added to the
+     * recognition patterns of the given type.
+     */
+    private void readGlob(Element element, MimeType type) {
+        type.addPattern(element.getAttribute("pattern"));
+    }
+
+    /**
+     * Read Element named alias: its "type" attribute is added to the aliases
+     * of the given type.
+     */
+    private void readAlias(Element element, MimeType type) {
+        type.addAlias(element.getAttribute("type"));
+    }
+
+    /**
+     * Read Element named magic: builds a Magic bound to the given type, with
+     * the clause made of the nested match elements, and adds it to the type.
+     */
+    private void readMagic(Element element, MimeType mimeType) {
+
+        Magic magic = null;
+        try {
+            magic = new Magic(Integer
+                    .parseInt(element.getAttribute("priority")));
+        } catch (Exception e) {
+            // The priority attribute is optional: when it is missing or not
+            // a number, fall back to the no-argument Magic constructor.
+            magic = new Magic();
+        }
+        magic.setType(mimeType);
+        magic.setClause(readMatches(element));
+        mimeType.addMagic(magic);
+    }
+
+    /**
+     * Builds the clause for the "match" children of the given element.
+     * Sibling matches are combined with OR; a match that has nested matches
+     * is combined with them using AND. A match that cannot be read is logged
+     * and left out.
+     * 
+     * @return the combined clause, or <code>null</code> if no match child
+     *         could be read.
+     */
+    private Clause readMatches(Element element) {
+        Clause sub = null;
+        // Seed of the OR chain: the first match is combined with FALSE
+        Clause prev = Clause.FALSE;
+        Clause clause = null;
+        NodeList nodes = element.getChildNodes();
+        for (int i = 0; i < nodes.getLength(); i++) {
+            Node node = nodes.item(i);
+            if (node.getNodeType() == Node.ELEMENT_NODE) {
+                Element nodeElement = (Element) node;
+                if (nodeElement.getTagName().equals("match")) {
+                    // Recurse first: null means this match has no nested
+                    // (readable) match
+                    sub = readMatches(nodeElement);
+                    try {
+                        if (sub != null) {
+                            clause = new MagicClause(Operator.AND,
+                                    readMatch(nodeElement), sub);
+                        } else {
+                            clause = readMatch(nodeElement);
+                        }
+                        clause = new MagicClause(Operator.OR, prev, clause);
+                        prev = clause;
+                    } catch (MimeTypeException mte) {
+                        // clause keeps the value built from the previous
+                        // siblings, if any
+                        logger.warn(mte + " while reading magic-match ["
+                                + nodeElement + "], Ignoring!");
+                    }
+                }
+            }
+        }
+        return clause;
+    }
+
+    /**
+     * Read Element named match.
+     * 
+     * @param element
+     *            the match element; its "offset" attribute is either a single
+     *            position ("4") or a range ("0:64").
+     * @return the MagicMatch built from the offset, type, mask and value
+     *         attributes.
+     * @throws MimeTypeException
+     *             if the offset attribute is missing, or if the MagicMatch
+     *             cannot be built.
+     */
+    private MagicMatch readMatch(Element element) throws MimeTypeException {
+
+        String offset = null;
+        String value = null;
+        String mask = null;
+        String type = null;
+
+        NamedNodeMap attrs = element.getAttributes();
+        for (int i = 0; i < attrs.getLength(); i++) {
+            Attr attr = (Attr) attrs.item(i);
+            if (attr.getName().equals("offset")) {
+                offset = attr.getValue();
+            } else if (attr.getName().equals("type")) {
+                type = attr.getValue();
+            } else if (attr.getName().equals("value")) {
+                value = attr.getValue();
+            } else if (attr.getName().equals("mask")) {
+                mask = attr.getValue();
+            }
+        }
+
+        // Without this check a missing attribute raised a
+        // NullPointerException, which the caller does not handle and which
+        // aborted the loading of every mime-type.
+        if (offset == null) {
+            throw new MimeTypeException("Missing offset attribute");
+        }
+
+        // Parse OffSet. Unparseable parts keep the default 0, as before, but
+        // are now reported.
+        String[] offsets = offset.split(":");
+        int offStart = 0;
+        int offEnd = 0;
+        if (offsets.length > 0) {
+            try {
+                offStart = Integer.parseInt(offsets[0]);
+            } catch (NumberFormatException e) {
+                logger.warn("Invalid start offset [" + offset
+                        + "] in magic-match, using 0");
+            }
+        }
+        // The end of the range is optional
+        if (offsets.length > 1) {
+            try {
+                offEnd = Integer.parseInt(offsets[1]);
+            } catch (NumberFormatException e) {
+                logger.warn("Invalid end offset [" + offset
+                        + "] in magic-match, using the start offset");
+            }
+        }
+        // A single position is a range reduced to its start
+        offEnd = Math.max(offStart, offEnd);
+
+        return new MagicMatch(offStart, offEnd, type, mask, value);
+    }
+
+    /**
+     * Read Element named root-XML: its "namespaceURI" and "localName"
+     * attributes are added to the given type as a rootXML description.
+     */
+    private void readRootXML(Element element, MimeType mimeType) {
+
+        mimeType.addRootXML(element.getAttribute("namespaceURI"), element
+                .getAttribute("localName"));
+    }
+
+    /**
+     * Read Element named sub-class-of: its "type" attribute is recorded as a
+     * super type of the given type.
+     */
+    private void readSubClassOf(Element element, MimeType mimeType) {
+
+        mimeType.addSuperType(element.getAttribute("type"));
+    }
+
+    /**
+     * Prints the specified node, then prints all of its children, on the
+     * standard output. Debugging aid only: text and attribute values are
+     * trimmed and not escaped.
+     * 
+     * @param node
+     *            the node to print; node types other than document, element,
+     *            entity reference, CDATA section, text and processing
+     *            instruction are ignored.
+     */
+    public static void printDOM(Node node) {
+        int type = node.getNodeType();
+        switch (type) {
+        // print the document element
+        case Node.DOCUMENT_NODE: {
+            // The declaration used to be printed with a literal "&lt;"
+            // instead of "<"
+            System.out.println("<?xml version=\"1.0\" ?>");
+            printDOM(((Document) node).getDocumentElement());
+            break;
+        }
+
+            // print element with attributes
+        case Node.ELEMENT_NODE: {
+            System.out.print("<");
+            System.out.print(node.getNodeName());
+            NamedNodeMap attrs = node.getAttributes();
+            for (int i = 0; i < attrs.getLength(); i++) {
+                Node attr = attrs.item(i);
+                System.out.print(" " + attr.getNodeName().trim() + "=\""
+                        + attr.getNodeValue().trim() + "\"");
+            }
+            System.out.println(">");
+
+            NodeList children = node.getChildNodes();
+            if (children != null) {
+                int len = children.getLength();
+                for (int i = 0; i < len; i++) {
+                    printDOM(children.item(i));
+                }
+            }
+
+            break;
+        }
+
+            // handle entity reference nodes
+        case Node.ENTITY_REFERENCE_NODE: {
+            System.out.print("&");
+            System.out.print(node.getNodeName().trim());
+            System.out.print(";");
+            break;
+        }
+
+            // print cdata sections
+        case Node.CDATA_SECTION_NODE: {
+            System.out.print("<![CDATA[");
+            System.out.print(node.getNodeValue().trim());
+            System.out.print("]]>");
+            break;
+        }
+
+            // print text
+        case Node.TEXT_NODE: {
+            System.out.print(node.getNodeValue().trim());
+            break;
+        }
+
+            // print processing instruction
+        case Node.PROCESSING_INSTRUCTION_NODE: {
+            System.out.print("<?");
+            System.out.print(node.getNodeName().trim());
+            String data = node.getNodeValue().trim();
+            System.out.print(" ");
+            System.out.print(data);
+            System.out.print("?>");
+            break;
+        }
+        }
+
+        // Closing tag, after the children have been printed
+        if (type == Node.ELEMENT_NODE) {
+            System.out.println();
+            System.out.print("</");
+            System.out.print(node.getNodeName().trim());
+            System.out.print('>');
+        }
+    }
+
+}

Added: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeUtils.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeUtils.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeUtils.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeUtils.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,145 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+// JDK imports
+import java.io.InputStream;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import org.w3c.dom.Document;
+import org.xml.sax.InputSource;
+
+// Tika imports
+import org.apache.tika.utils.Configurable;
+import org.apache.tika.utils.Configuration;
+import org.apache.tika.metadata.TikaMimeKeys;
+
+/**
+ * Resolves the name of the mime type of a document from its declared type
+ * name, its url and its content, using a {@link MimeTypes} repository that
+ * is loaded on first use and then cached in the {@link Configuration}.
+ * 
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class MimeUtils implements Configurable, TikaMimeKeys {
+
+    /** My logger */
+    private final static Logger LOG = Logger.getLogger(MimeUtils.class
+            .getName());
+
+    /** The key used to cache the mime repository in conf */
+    private final static String KEY = MimeUtils.class.getName();
+
+    /** My current configuration */
+    private Configuration conf = null;
+
+    /**
+     * A flag that tells if magic resolution must be performed. It is read
+     * from the configuration but not consulted while the magic condition in
+     * {@link #getType(String, String, byte[])} stays commented out.
+     */
+    private boolean magic = true;
+
+    /** The MimeTypes repository instance */
+    private MimeTypes repository = null;
+
+    /**
+     * Creates a new instance of MimeUtils.
+     * 
+     * @param conf
+     *            the configuration to read the settings and the cached
+     *            repository from.
+     */
+    public MimeUtils(Configuration conf) {
+        setConf(conf);
+    }
+
+    /***************************************************************************
+     * ----------------------------- <implementation:Configurable> *
+     * -----------------------------
+     */
+
+    /**
+     * Stores the configuration, reads the magic flag (default
+     * <code>true</code>) and picks up the repository cached in the
+     * configuration. When none is cached yet, the repository is loaded from
+     * the resource named by <code>TIKA_MIME_FILE</code> and cached for the
+     * next instances sharing this configuration.
+     * 
+     * @param conf
+     *            the new configuration.
+     */
+    public void setConf(Configuration conf) {
+        this.conf = conf;
+        this.magic = conf.getBoolean(MIME_TYPE_MAGIC, true);
+        this.repository = (MimeTypes) conf.getObject(KEY);
+        if (repository == null) {
+            repository = load(conf.get(TIKA_MIME_FILE));
+            conf.setObject(KEY, repository);
+        }
+    }
+
+    /**
+     * @return the configuration given to the last call to
+     *         {@link #setConf(Configuration)}.
+     */
+    public Configuration getConf() {
+        return this.conf;
+    }
+
+    /***************************************************************************
+     * ----------------------------- </implementation:Configurable> *
+     * -----------------------------
+     */
+
+    /**
+     * @return the repository of mime types used by this instance.
+     */
+    public final MimeTypes getRepository() {
+        return repository;
+    }
+
+    /**
+     * Guesses the name of the mime type of a document.
+     * 
+     * The declared type name is tried first, then the url, and finally the
+     * content. Each later step overrides the previous result only when it
+     * finds a registered type.
+     * 
+     * @param typeName
+     *            the declared type name (may be <code>null</code>).
+     * @param url
+     *            the url of the document.
+     * @param data
+     *            the content (or the first bytes) of the document.
+     * @return the name of the last type found, or the (cleaned) declared type
+     *         name if no lookup succeeded.
+     */
+    public String getType(String typeName, String url, byte[] data) {
+        MimeType type = null;
+        try {
+            typeName = MimeType.clean(typeName);
+            type = typeName == null ? null : repository.forName(typeName);
+        } catch (MimeTypeException mte) {
+            // Seems to be a malformed mime type name...
+            // Deliberately ignored: the url and content lookups below are
+            // the fallback.
+        }
+
+        if (typeName == null || type == null || !type.matches(url)) {
+            // If no mime-type header, or cannot find a corresponding registered
+            // mime-type, or the one found doesn't match the url pattern
+            // it should be, then guess a mime-type from the url pattern
+            type = repository.getMimeType(url);
+            typeName = type == null ? typeName : type.getName();
+        }
+        // NOTE(review): the condition below is commented out, so the content
+        // lookup always runs, whatever the value of the magic flag. Confirm
+        // that this is intended before re-enabling it.
+        // if (typeName == null || type == null ||
+        // (this.magic && type.hasMagic() && !type.matches(data))) {
+        // If no mime-type already found, or the one found doesn't match
+        // the magic bytes it should be, then, guess a mime-type from the
+        // document content (magic bytes)
+        type = repository.getMimeType(data);
+        typeName = type == null ? typeName : type.getName();
+        // }
+        return typeName;
+    }
+
+    /**
+     * Builds the repository from the xml resource found on the class path of
+     * this class.
+     * 
+     * @param tikaMimeFile
+     *            the name of the class path resource to load.
+     * @return a repository built from the parsed document; the document
+     *         handed to it is <code>null</code> if the resource is missing
+     *         or cannot be parsed.
+     */
+    private final MimeTypes load(String tikaMimeFile) {
+        LOG.info("Loading [" + tikaMimeFile + "]");
+        InputStream stream = MimeUtils.class.getClassLoader()
+                .getResourceAsStream(tikaMimeFile);
+        if (stream == null) {
+            // Say clearly what is wrong instead of failing later with an
+            // obscure parser error.
+            LOG.warning("Unable to find resource [" + tikaMimeFile + "]");
+        }
+        Document document = getDocumentRoot(stream);
+
+        MimeTypes types = new MimeTypes(document);
+        return types;
+    }
+
+    /**
+     * Parses the specified xml stream and closes it.
+     * 
+     * @param is
+     *            the stream to parse (may be <code>null</code>).
+     * @return the parsed document, or <code>null</code> if there is no stream
+     *         or if it cannot be parsed.
+     */
+    private final Document getDocumentRoot(InputStream is) {
+        if (is == null) {
+            return null;
+        }
+
+        try {
+            // open up the XML file
+            DocumentBuilderFactory factory = DocumentBuilderFactory
+                    .newInstance();
+            DocumentBuilder parser = factory.newDocumentBuilder();
+            return parser.parse(new InputSource(is));
+        } catch (Exception e) {
+            LOG.log(Level.WARNING, "Unable to parse xml stream"
+                    + ": Reason is [" + e + "]");
+            return null;
+        } finally {
+            // The stream is opened by this class, so it must be released
+            // here, whatever the outcome of the parsing.
+            try {
+                is.close();
+            } catch (java.io.IOException ignored) {
+                // Nothing useful can be done if closing fails.
+            }
+        }
+    }
+
+}

Added: incubator/tika/trunk/src/main/java/org/apache/tika/mime/Operator.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/Operator.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/Operator.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/Operator.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,68 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+/**
+ * A boolean operator taking two operands, with ready-made {@link #AND} and
+ * {@link #OR} instances.
+ * 
+ * @author Jerome Charron
+ */
+interface Operator {
+
+    /** The OR Boolean operator */
+    Operator OR = new Or();
+
+    /** The AND Boolean operator */
+    Operator AND = new And();
+
+    /**
+     * Applies this operator to the two specified boolean operands.
+     * 
+     * @param o1
+     *            is the left-hand boolean operand.
+     * @param o2
+     *            is the right-hand boolean operand.
+     * @return the result of combining <code>o1</code> and <code>o2</code>
+     *         with this operator.
+     */
+    boolean eval(boolean o1, boolean o2);
+
+    /**
+     * The conjunction: true only when both operands are true.
+     */
+    final class And implements Operator {
+        public boolean eval(boolean o1, boolean o2) {
+            if (!o1) {
+                return false;
+            }
+            return o2;
+        }
+
+        public String toString() {
+            return "AND";
+        }
+    }
+
+    /**
+     * The disjunction: true as soon as one operand is true.
+     */
+    final class Or implements Operator {
+        public boolean eval(boolean o1, boolean o2) {
+            if (o1) {
+                return true;
+            }
+            return o2;
+        }
+
+        public String toString() {
+            return "OR";
+        }
+    }
+}

Added: incubator/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,193 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+// JDK imports
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
+/**
+ * A registry of filename patterns (globs) and of the MimeType each of them
+ * designates, indexed so that a filename can be resolved to its MimeType.
+ * 
+ * @author J&eacute;r&ocirc;me Charron
+ */
+class Patterns {
+
+    /**
+     * Maps each glob character to its regular expression counterpart:
+     * <code>*</code> becomes <code>.*</code> and the characters that are
+     * special in a regular expression are escaped.
+     * 
+     * NOTE(review): <code>?</code> and <code>[</code> are escaped as literals
+     * here, whereas {@link #add(String, MimeType)} treats a pattern holding
+     * them as a wildcard pattern. Confirm whether glob semantics were
+     * intended for them.
+     */
+    private static Map escapeMap = new HashMap();
+    static {
+        escapeMap.put("\\", "\\\\");
+        escapeMap.put("?", "\\?");
+        escapeMap.put("[", "\\[");
+        escapeMap.put("]", "\\]");
+        escapeMap.put("^", "\\^");
+        escapeMap.put(".", "\\.");
+        escapeMap.put("-", "\\-");
+        escapeMap.put("$", "\\$");
+        escapeMap.put("+", "\\+");
+        escapeMap.put("(", "\\(");
+        escapeMap.put(")", "\\)");
+        escapeMap.put("{", "\\{");
+        escapeMap.put("}", "\\}");
+        escapeMap.put("|", "\\|");
+        escapeMap.put("*", ".*");
+    }
+
+    /** Gathers all the patterns, as they were added */
+    private ArrayList patterns = new ArrayList();
+
+    /** An index of exact matching patterns */
+    private Map exactIdx = new HashMap();
+
+    /** An index of the patterns of the form "*.ext", keyed by "ext" */
+    private Map extIdx = new HashMap();
+
+    /** The other patterns, keyed by their regular expression form */
+    private Map others = new HashMap();
+
+    /** Creates a new instance of Patterns */
+    Patterns() {
+    }
+
+    /**
+     * Registers several patterns for the same type. Nothing is done if one
+     * of the arguments is <code>null</code>.
+     * 
+     * @param patterns
+     *            the patterns to register.
+     * @param type
+     *            the type designated by these patterns.
+     */
+    void add(String[] patterns, MimeType type) {
+        // Some preliminary checks
+        if ((patterns == null) || (type == null)) {
+            return;
+        }
+        // All is ok, so add the patterns
+        for (int i = 0; i < patterns.length; i++) {
+            add(patterns[i], type);
+        }
+    }
+
+    /**
+     * Registers a pattern for a type. Nothing is done if one of the
+     * arguments is <code>null</code>.
+     * 
+     * @param pattern
+     *            the pattern to register.
+     * @param type
+     *            the type designated by this pattern.
+     */
+    void add(String pattern, MimeType type) {
+        // Some preliminary checks
+        if ((pattern == null) || (type == null)) {
+            return;
+        }
+
+        // Add the pattern in the good index
+        if ((pattern.indexOf('*') == -1) && (pattern.indexOf('?') == -1)
+                && (pattern.indexOf('[') == -1)) {
+            // No wildcard character at all: a literal name
+            exactIdx.put(pattern, type);
+
+        } else if (pattern.startsWith("*.")) {
+            // Indexed by what follows the "*."
+            extIdx.put(pattern.substring(2), type);
+
+        } else {
+            others.put(escape(pattern), type);
+        }
+        // Add the pattern in the list of patterns
+        patterns.add(pattern);
+    }
+
+    /**
+     * @return all the patterns registered so far, in the order they were
+     *         added.
+     */
+    String[] getPatterns() {
+        return (String[]) patterns.toArray(new String[patterns.size()]);
+    }
+
+    /**
+     * Find the MimeType corresponding to a filename.
+     * 
+     * The lookup follows the order recommended by the FreeDesktop Shared
+     * MIME-info Database: literal patterns (eg, 'Makefile') are tried first,
+     * against the whole name and then against what follows the last '/' or
+     * '\'. Then come the "*.ext" patterns, the longest sequence of extensions
+     * being tried first (eg '*.tar.gz' in preference to '*.gz'). The other
+     * wildcarded patterns are tried last, and if several of them match the
+     * longest one (in its regular expression form) is used.
+     * 
+     * The matching is case-sensitive: the second try with the filename
+     * converted to lower-case, which is also recommended by the
+     * specification, is not performed here.
+     * 
+     * @param filename
+     *            the name (or path, or url) to resolve.
+     * @return the matching MimeType, or <code>null</code> if the filename is
+     *         <code>null</code> or if no pattern matches.
+     */
+    MimeType matches(String filename) {
+
+        // Preliminary check...
+        if (filename == null) {
+            return null;
+        }
+
+        // First, try exact match of the provided filename
+        MimeType type = (MimeType) exactIdx.get(filename);
+        if (type != null) {
+            return type;
+        }
+
+        // Then try exact match with only the filename
+        String str = last(filename, '/');
+        if (str != null) {
+            type = (MimeType) exactIdx.get(str);
+            if (type != null) {
+                return type;
+            }
+        }
+        str = last(filename, '\\');
+        if (str != null) {
+            type = (MimeType) exactIdx.get(str);
+            if (type != null) {
+                return type;
+            }
+        }
+
+        // Then try "extension" (*.xxx) matching, starting from the first
+        // dot so that the longest sequence of extensions wins
+        int idx = filename.indexOf('.', 0);
+        while (idx != -1) {
+            type = (MimeType) extIdx.get(filename.substring(idx + 1));
+            if (type != null) {
+                return type;
+            }
+            idx = filename.indexOf('.', idx + 1);
+        }
+
+        // And finally, try complex regexp matching
+        String longest = null;
+        Iterator iter = others.keySet().iterator();
+        while (iter.hasNext()) {
+            String pattern = (String) iter.next();
+            // No candidate yet means that any matching pattern is accepted
+            // (comparing with the length of a null candidate used to throw
+            // a NullPointerException). The length is checked first, so that
+            // the regexp is evaluated only for the patterns that could win.
+            if (((longest == null) || (pattern.length() > longest.length()))
+                    && (filename.matches(pattern))) {
+                longest = pattern;
+            }
+        }
+        if (longest != null) {
+            type = (MimeType) others.get(longest);
+        }
+        return type;
+    }
+
+    /**
+     * Returns what follows the last occurrence of a character.
+     * 
+     * @return the non-empty end of <code>str</code> that follows the last
+     *         <code>c</code>, or <code>null</code> if <code>str</code> is
+     *         <code>null</code>, does not contain <code>c</code>, or ends
+     *         with it.
+     */
+    private final static String last(String str, char c) {
+        if (str == null) {
+            return null;
+        }
+        int idx = str.lastIndexOf(c);
+        if ((idx < 0) || (idx >= (str.length() - 1))) {
+            return null;
+        }
+        return str.substring(idx + 1);
+    }
+
+    /**
+     * Converts a glob pattern to a regular expression, by replacing each
+     * character that has an entry in the escape map, the other ones being
+     * copied as they are.
+     */
+    private final static String escape(String str) {
+        StringBuffer result = new StringBuffer(str.length());
+        for (int i = 0; i < str.length(); i++) {
+            String charAt = String.valueOf(str.charAt(i));
+            String replace = (String) escapeMap.get(charAt);
+            result.append((replace != null) ? replace : charAt);
+        }
+        return result.toString();
+    }
+
+}

Added: incubator/tika/trunk/src/main/java/org/apache/tika/utils/Configurable.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/Configurable.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/Configurable.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/Configurable.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,46 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.utils;
+
+/**
+ * <p>
+ * Implemented by the Tika objects that can be <code>Configured</code> with a
+ * {@link Configuration} object. Based on Apache Hadoop's configuration
+ * interface.
+ * </p>
+ * 
+ * @author mattmann
+ * @version $Revision$
+ */
+public interface Configurable {
+
+    /**
+     * Hands the {@link Configuration} named <code>conf</code> to the Tika
+     * object, which is expected to configure itself with it.
+     * 
+     * @param conf
+     *            The {@link Configuration} to apply to the object.
+     */
+    void setConf(Configuration conf);
+
+    /**
+     * Gives access to the {@link Configuration} of the Tika object.
+     * 
+     * @return The {@link Configuration} currently held by the object.
+     */
+    Configuration getConf();
+
+}

Added: incubator/tika/trunk/src/main/java/org/apache/tika/utils/Configuration.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/Configuration.java?rev=578161&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/Configuration.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/Configuration.java Fri Sep 21 08:07:58 2007
@@ -0,0 +1,176 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.utils;
+
+// JDK imports
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Properties;
+import java.util.StringTokenizer;
+
+/**
+ * Provides access to configuration parameters.
+ */
+public class Configuration {
+
+    private Properties properties;
+
+    /** A new configuration. */
+    public Configuration() {
+        this.properties = new Properties();
+    }
+
+    /** A new configuration with the same settings cloned from another. */
+    public Configuration(Properties properties) {
+        if (properties != null) {
+            this.properties = (Properties) properties.clone();
+        } else {
+            this.properties = new Properties();
+        }
+    }
+
+    /**
+     * Returns the value of the <code>name</code> property, or null if no such
+     * property exists.
+     */
+    public Object getObject(String name) {
+        return properties.get(name);
+    }
+
+    /** Sets the value of the <code>name</code> property. */
+    public void setObject(String name, Object value) {
+        properties.put(name, value);
+    }
+
+    /**
+     * Returns the value of the <code>name</code> property. If no such
+     * property exists, then <code>defaultValue</code> is returned.
+     */
+    public Object get(String name, Object defaultValue) {
+        Object res = getObject(name);
+        return (res != null) ? res : defaultValue;
+    }
+
+    /**
+     * Returns the value of the <code>name</code> property, or null if no such
+     * property exists.
+     */
+    public String get(String name) {
+        return properties.getProperty(name);
+    }
+
+    /** Sets the value of the <code>name</code> property. */
+    public void set(String name, Object value) {
+        properties.setProperty(name, value.toString());
+    }
+
+    /**
+     * Returns the value of the <code>name</code> property. If no such
+     * property exists, then <code>defaultValue</code> is returned.
+     */
+    public String get(String name, String defaultValue) {
+        return properties.getProperty(name, defaultValue);
+    }
+
+    /**
+     * Returns the value of the <code>name</code> property as an integer. If
+     * no such property is specified, or if the specified value is not a valid
+     * integer, then <code>defaultValue</code> is returned.
+     */
+    public int getInt(String name, int defaultValue) {
+        try {
+            return Integer.parseInt(get(name));
+        } catch (Exception e) {
+            return defaultValue;
+        }
+    }
+
+    /** Sets the value of the <code>name</code> property to an integer. */
+    public void setInt(String name, int value) {
+        set(name, Integer.toString(value));
+    }
+
+    /**
+     * Returns the value of the <code>name</code> property as a long. If no
+     * such property is specified, or if the specified value is not a valid
+     * long, then <code>defaultValue</code> is returned.
+     */
+    public long getLong(String name, long defaultValue) {
+        try {
+            return Long.parseLong(get(name));
+        } catch (Exception e) {
+            return defaultValue;
+        }
+    }
+
+    /** Sets the value of the <code>name</code> property to a long. */
+    public void setLong(String name, long value) {
+        set(name, Long.toString(value));
+    }
+
+    /**
+     * Returns the value of the <code>name</code> property as a float. If no
+     * such property is specified, or if the specified value is not a valid
+     * float, then <code>defaultValue</code> is returned.
+     */
+    public float getFloat(String name, float defaultValue) {
+        try {
+            return Float.parseFloat(get(name));
+        } catch (Exception e) {
+            return defaultValue;
+        }
+    }
+
+    /**
+     * Returns the value of the <code>name</code> property as an boolean. If
+     * no such property is specified, or if the specified value is not a valid
+     * boolean, then <code>defaultValue</code> is returned. Valid boolean
+     * values are "true" and "false".
+     */
+    public boolean getBoolean(String name, boolean defaultValue) {
+        String valueString = get(name);
+        if ("true".equals(valueString)) {
+            return true;
+        } else if ("false".equals(valueString)) {
+            return false;
+        } else {
+            return defaultValue;
+        }
+    }
+
+    /** Sets the value of the <code>name</code> property to an integer. */
+    public void setBoolean(String name, boolean value) {
+        set(name, Boolean.toString(value));
+    }
+
+    /**
+     * Returns the value of the <code>name</code> property as an array of
+     * strings. If no such property is specified, then <code>null</code> is
+     * returned. Values are comma delimited.
+     */
+    public String[] getStrings(String name) {
+        String valueString = get(name);
+        if (valueString == null)
+            return null;
+        StringTokenizer tokenizer = new StringTokenizer(valueString, ",");
+        List values = new ArrayList();
+        while (tokenizer.hasMoreTokens()) {
+            values.add(tokenizer.nextToken());
+        }
+        return (String[]) values.toArray(new String[values.size()]);
+    }
+
+}