You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/05/20 23:17:56 UTC

svn commit: r776859 - in /lucene/tika/trunk/tika-core/src: main/java/org/apache/tika/detect/ main/java/org/apache/tika/mime/ main/resources/mime/ test/java/org/apache/tika/mime/ test/resources/org/ test/resources/org/apache/ test/resources/org/apache/t...

Author: jukka
Date: Wed May 20 21:17:56 2009
New Revision: 776859

URL: http://svn.apache.org/viewvc?rev=776859&view=rev
Log:
TIKA-225: [PATCH] Various bugfixes for MIME detection

Applied the patch contributed by Jeremias Maerki

Added:
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java
    lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/
    lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
    lucene/tika/trunk/tika-core/src/test/resources/org/
    lucene/tika/trunk/tika-core/src/test/resources/org/apache/
    lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/
    lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/
    lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/circles-with-prefix.svg
    lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/circles.svg
    lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/datamatrix.png   (with props)
    lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/stylesheet.xsl
    lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-iso-8859-1.xml
    lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-long-comment.xml
    lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf16be.xml   (with props)
    lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf16le.xml   (with props)
    lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8.xml
    lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test.html
Modified:
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
    lucene/tika/trunk/tika-core/src/main/resources/mime/tika-mimetypes.xml

Added: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java?rev=776859&view=auto
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java (added)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java Wed May 20 21:17:56 2009
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.InputStream;
+
+import javax.xml.namespace.QName;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Utility class that uses a namespace-aware, non-validating {@link SAXParser} to determine the
+ * namespace URI and local name of the root element of XML content held in a byte array.
+ *
+ * @since Apache Tika 0.4
+ */
+public class XmlRootExtractor {
+
+    public static QName extractRootElement(byte[] data) {
+        SAXParserFactory parserFactory = SAXParserFactory.newInstance();
+        parserFactory.setNamespaceAware(true);
+        parserFactory.setValidating(false); // NOTE(review): DTD/external-entity processing is not explicitly disabled here
+
+        ExtractorHandler handler = new ExtractorHandler();
+        try {
+            SAXParser parser = parserFactory.newSAXParser();
+            InputStream in = new java.io.ByteArrayInputStream(data);
+            parser.parse(in, handler);
+        } catch (Exception e) {
+            // ignore: the handler throws to stop at the root element; other failures leave the result null
+        }
+        return handler.rootElement; // null if no element was reported before parsing stopped
+    }
+
+    private static class ExtractorHandler extends DefaultHandler {
+
+        private QName rootElement;
+
+        /** Records the first element reported (the root) and aborts the parse by throwing. */
+        @Override
+        public void startElement(String uri, String localName, String name, Attributes attributes)
+                throws SAXException {
+            this.rootElement = new QName(uri, localName);
+            throw new SAXException("Aborting: root element received");
+        }
+
+    }
+
+}

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java?rev=776859&r1=776858&r2=776859&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java Wed May 20 21:17:56 2009
@@ -131,7 +131,7 @@
 
     /**
      * Returns the name of this media type.
-     * 
+     *
      * @return media type name (lower case)
      */
     public String getName() {
@@ -189,13 +189,13 @@
                     return true;
                 }
             }
-            return false; 
+            return false;
         }
     }
 
     /**
      * Returns the description of this media type.
-     * 
+     *
      * @return media type description
      */
     public String getDescription() {
@@ -204,7 +204,7 @@
 
     /**
      * Set the description of this media type.
-     * 
+     *
      * @param description media type description
      */
     public void setDescription(String description) {
@@ -245,7 +245,7 @@
 
     /**
      * Add some rootXML info to this mime-type
-     * 
+     *
      * @param namespaceURI
      * @param localName
      */
@@ -265,6 +265,15 @@
         return false;
     }
 
+    boolean matchesXML(String namespaceURI, String localName) { // true if any rootXML entry matches the pair
+        for (RootXML xml : rootXML) {
+            if (xml.matches(namespaceURI, localName)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
     boolean hasRootXML() {
         return (rootXML.size() > 0);
     }
@@ -353,6 +362,23 @@
             return pattern.matcher(data).matches();
         }
 
+        boolean matches(String namespaceURI, String localName) {
+            // Compare namespaces only when this rule specifies one (StringUtil.isEmpty is false)
+            if (!(StringUtil.isEmpty(this.namespaceURI))) {
+                if (!this.namespaceURI.equals(namespaceURI)) {
+                    return false;
+                }
+            }
+
+            // Likewise, compare the root element's local name only when this rule specifies one
+            if (!StringUtil.isEmpty(this.localName)) {
+                if (!this.localName.equals(localName)) {
+                    return false;
+                }
+            }
+            return true; // every constraint this rule specifies (possibly none) was satisfied
+        }
+
         MimeType getType() {
             return type;
         }

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=776859&r1=776858&r2=776859&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java Wed May 20 21:17:56 2009
@@ -23,12 +23,15 @@
 import java.io.InputStream;
 import java.net.URL;
 import java.util.Arrays;
-import java.util.Map;
 import java.util.HashMap;
+import java.util.Map;
 import java.util.SortedSet;
 import java.util.TreeSet;
 
+import javax.xml.namespace.QName;
+
 import org.apache.tika.detect.Detector;
+import org.apache.tika.detect.XmlRootExtractor;
 import org.apache.tika.metadata.Metadata;
 
 /**
@@ -127,7 +130,7 @@
 
     /**
      * Find the Mime Content Type of a file.
-     * 
+     *
      * @param file
      *            to analyze.
      * @return the Mime Content Type of the specified file, or <code>null</code>
@@ -139,7 +142,7 @@
 
     /**
      * Find the Mime Content Type of a document from its URL.
-     * 
+     *
      * @param url
      *            of the document to analyze.
      * @return the Mime Content Type of the specified document URL, or
@@ -152,7 +155,7 @@
     /**
      * Find the Mime Content Type of a document from its name.
      * Returns application/octet-stream if no better match is found.
-     * 
+     *
      * @param name of the document to analyze.
      * @return the Mime Content Type of the specified document name
      */
@@ -186,6 +189,7 @@
         }
 
         // First, check for XML descriptions (level by level)
+        // Problem: Regexp matching doesn't work for all XML encodings
         for (MimeType type : xmls) {
             if (type.matchesXML(data)) {
                 return type;
@@ -193,12 +197,32 @@
         }
 
         // Then, check for magic bytes
+        MimeType result = null;
         for (Magic magic : magics) {
             if (magic.eval(data)) {
-                return magic.getType();
+                result = magic.getType();
+                break;
+            }
+        }
+        if (result != null) {
+            // When detecting generic XML, parse XML to determine the root element
+            if ("application/xml".equals(result.getName())) {
+                QName rootElement = XmlRootExtractor.extractRootElement(data);
+                if (rootElement != null) {
+                    for (MimeType type : xmls) {
+                        if (type.matchesXML(
+                                rootElement.getNamespaceURI(),
+                                rootElement.getLocalPart())) {
+                            result = type;
+                            break;
+                        }
+                    }
+                }
             }
+            return result;
         }
 
+
         // Finally, assume plain text if no control bytes are found
         for (int i = 0; i < data.length; i++) {
             int b = data[i] & 0xFF; // prevent sign extension
@@ -302,7 +326,7 @@
      * <li>If a type is found, then return it, otherwise try to find the type
      * based on the file name</li>
      * </ol>
-     * 
+     *
      * @param name
      *            of the document to analyze.
      * @param data
@@ -389,7 +413,7 @@
     /**
      * Adds a file name pattern for the given media type. Assumes that the
      * pattern being added is <b>not</b> a JDK standard regular expression.
-     * 
+     *
      * @param type
      *            media type
      * @param pattern
@@ -408,7 +432,7 @@
      * regular expression via the <code>isRegex</code> parameter. If the value
      * is set to true, then a JDK standard regex is assumed, otherwise the
      * freedesktop glob type is assumed.
-     * 
+     *
      * @param type
      *            media type
      * @param pattern
@@ -418,7 +442,7 @@
      *            false.
      * @throws MimeTypeException
      *             if the pattern conflicts with existing ones.
-     * 
+     *
      */
     public void addPattern(MimeType type, String pattern, boolean isRegex)
             throws MimeTypeException {
@@ -428,7 +452,7 @@
     /**
      * Return the minimum length of data to provide to analyzing methods based
      * on the document's content in order to check all the known MimeTypes.
-     * 
+     *
      * @return the minimum length of data to provide.
      * @see #getMimeType(byte[])
      * @see #getMimeType(String, byte[])
@@ -440,7 +464,7 @@
 
     /**
      * Add the specified mime-type in the repository.
-     * 
+     *
      * @param type
      *            is the mime-type to add.
      */

Modified: lucene/tika/trunk/tika-core/src/main/resources/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/resources/mime/tika-mimetypes.xml?rev=776859&r1=776858&r2=776859&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/resources/mime/tika-mimetypes.xml (original)
+++ lucene/tika/trunk/tika-core/src/main/resources/mime/tika-mimetypes.xml Wed May 20 21:17:56 2009
@@ -23,7 +23,7 @@
 <mime-info>
 
   <mime-type type="text/plain">
-    <magic priority="50">
+    <magic priority="20">
       <match value="This is TeX," type="string" offset="0" />
       <match value="This is METAFONT," type="string" offset="0" />
       <match value="#!/" type="string" offset="0" />
@@ -118,9 +118,6 @@
     <glob pattern="*.xmap" />
     <glob pattern="*.xroles" />
     <glob pattern="*.xsamples" />
-    <glob pattern="*.xsd" />
-    <glob pattern="*.xsl" />
-    <glob pattern="*.xslt" />
     <glob pattern="*.xsp" />
     <glob pattern="*.xul" />
     <glob pattern="*.xweb" />
@@ -154,6 +151,7 @@
   <mime-type type="application/xhtml+xml">
     <sub-class-of type="application/xml" />
     <glob pattern="*.xhtml" />
+    <glob pattern="*.xht" />
     <root-XML namespaceURI="http://www.w3.org/1999/xhtml" localName="html" />
   </mime-type>
 
@@ -564,11 +562,15 @@
   <mime-type type="application/xml">
     <sub-class-of type="text/plain" />
     <magic priority="50">
-      <match value="\&lt;?xml" type="string" offset="0" />
-      <match value="\&lt;?XML" type="string" offset="0" />
+      <match value="&lt;?xml" type="string" offset="0" />
+      <match value="&lt;?XML" type="string" offset="0" />
+      <match value="0xFFFE3C003F0078006D006C00" type="string" offset="0" />
+      <match value="0xFEFF003C003F0078006D006C" type="string" offset="0" />
+      <!-- TODO: Add matches for the rest of the possible XML encoding schemes -->
     </magic>
     <alias type="text/xml" />
     <glob pattern="*.xml" />
+    <glob pattern="*.xsd" />
   </mime-type>
 
   <mime-type type="image/svg+xml">
@@ -579,6 +581,16 @@
     <glob pattern="*.svg" />
   </mime-type>
 
+  <mime-type type="application/xslt+xml">
+    <sub-class-of type="application/xml" />
+    <acronym>XSLT</acronym>
+    <comment>XSL Transformations</comment>
+    <root-XML localName="stylesheet" namespaceURI="http://www.w3.org/1999/XSL/Transform" />
+    <alias type="text/xsl" />
+    <glob pattern="*.xsl" />
+    <glob pattern="*.xslt" />
+  </mime-type>
+  
   <mime-type type="application/x-mif">
     <magic priority="50">
       <match value="\&lt;MakerFile" type="string" offset="0" />
@@ -669,10 +681,6 @@
     <alias type="application/x-sh" />
   </mime-type>
 
-  <mime-type type="application/xhtml+xml">
-    <glob pattern="*.xht" />
-  </mime-type>
-
   <mime-type type="audio/midi">
     <acronym>MIDI</acronym>
     <comment>Musical Instrument Digital Interface</comment>

Added: lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java?rev=776859&view=auto
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java (added)
+++ lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java Wed May 20 21:17:56 2009
@@ -0,0 +1,56 @@
+package org.apache.tika.mime;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+
+public class MimeDetectionTest extends TestCase {
+
+    private MimeTypes mimeTypes;
+
+    /** Obtains the MIME type repository from the default Tika configuration. */
+    @Override
+    protected void setUp() throws Exception {
+        super.setUp();
+        this.mimeTypes = TikaConfig.getDefaultConfig().getMimeRepository();
+        //this.mimeTypes = MimeTypesFactory.create("/org/apache/tika/mime/tika-mimetypes-minimal.xml");
+    }
+
+    public void testDetection() throws Exception { // each call: expected media type, sample resource name
+        testFile("image/svg+xml", "circles.svg");
+        testFile("image/svg+xml", "circles-with-prefix.svg");
+        testFile("image/png", "datamatrix.png");
+        testFile("text/html", "test.html");
+        testFile("application/xml", "test-iso-8859-1.xml");
+        testFile("application/xml", "test-utf8.xml");
+        testFile("application/xml", "test-utf16le.xml");
+        testFile("application/xml", "test-utf16be.xml");
+        testFile("application/xml", "test-long-comment.xml");
+        testFile("application/xslt+xml", "stylesheet.xsl");
+    }
+
+    private void testFile(String expected, String filename) throws IOException {
+        InputStream in = getClass().getResourceAsStream(filename);
+        assertNotNull("Test file not found: " + filename, in);
+        if (!in.markSupported()) { // wrap so that mark/reset is available on the stream
+            in = new java.io.BufferedInputStream(in);
+        }
+        try {
+            Metadata metadata = new Metadata();
+            String mime = this.mimeTypes.detect(in, metadata).toString();
+            assertEquals(filename + " is not properly detected.", expected, mime);
+
+            // Add the resource name and detect again on the same stream (presumably detect() resets it; confirm)
+            metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
+            mime = this.mimeTypes.detect(in, metadata).toString();
+            assertEquals(filename + " is not properly detected.", expected, mime);
+        } finally {
+            in.close();
+        }
+    }
+
+}

Added: lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/circles-with-prefix.svg
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/circles-with-prefix.svg?rev=776859&view=auto
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/circles-with-prefix.svg (added)
+++ lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/circles-with-prefix.svg Wed May 20 21:17:56 2009
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg:svg xmlns:svg="http://www.w3.org/2000/svg" width="12cm" height="12cm">
+  <svg:g style="fill-opacity:0.7; stroke:black; stroke-width:0.1cm;">
+    <svg:circle cx="6cm" cy="2cm" r="100" style="fill:red;" transform="translate(0,50)" />
+    <svg:circle cx="6cm" cy="2cm" r="100" style="fill:blue;" transform="translate(70,150)" />
+    <svg:circle cx="6cm" cy="2cm" r="100" style="fill:green;" transform="translate(-70,150)"/>
+  </svg:g>
+</svg:svg>
\ No newline at end of file

Added: lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/circles.svg
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/circles.svg?rev=776859&view=auto
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/circles.svg (added)
+++ lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/circles.svg Wed May 20 21:17:56 2009
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg xmlns="http://www.w3.org/2000/svg" width="12cm" height="12cm">
+  <g style="fill-opacity:0.7; stroke:black; stroke-width:0.1cm;">
+    <circle cx="6cm" cy="2cm" r="100" style="fill:red;" transform="translate(0,50)" />
+    <circle cx="6cm" cy="2cm" r="100" style="fill:blue;" transform="translate(70,150)" />
+    <circle cx="6cm" cy="2cm" r="100" style="fill:green;" transform="translate(-70,150)"/>
+  </g>
+</svg>
\ No newline at end of file

Added: lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/datamatrix.png
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/datamatrix.png?rev=776859&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/datamatrix.png
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/stylesheet.xsl
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/stylesheet.xsl?rev=776859&view=auto
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/stylesheet.xsl (added)
+++ lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/stylesheet.xsl Wed May 20 21:17:56 2009
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="utf-8"?>
+<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+
+  <xsl:output method="xml" indent="yes"/>
+
+  <xsl:template match="/">
+    <test hello="world"/>
+  </xsl:template>
+</xsl:stylesheet>

Added: lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-iso-8859-1.xml
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-iso-8859-1.xml?rev=776859&view=auto
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-iso-8859-1.xml (added)
+++ lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-iso-8859-1.xml Wed May 20 21:17:56 2009
@@ -0,0 +1,2 @@
+<?xml version="1.0" encoding="ISO-8859-1"?>
+<test hello="world"/>
\ No newline at end of file

Added: lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-long-comment.xml
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-long-comment.xml?rev=776859&view=auto
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-long-comment.xml (added)
+++ lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-long-comment.xml Wed May 20 21:17:56 2009
@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="ISO-8859-1"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<?somepi blahblah test="ignore-me.xml" ?>
+<test hello="world"/>
\ No newline at end of file

Added: lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf16be.xml
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf16be.xml?rev=776859&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf16be.xml
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf16le.xml
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf16le.xml?rev=776859&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf16le.xml
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8.xml
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8.xml?rev=776859&view=auto
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8.xml (added)
+++ lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8.xml Wed May 20 21:17:56 2009
@@ -0,0 +1,2 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<test hello="world"/>
\ No newline at end of file

Added: lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test.html
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test.html?rev=776859&view=auto
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test.html (added)
+++ lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test.html Wed May 20 21:17:56 2009
@@ -0,0 +1,10 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
+<title>Hello World</title>
+</head>
+<body>
+  <p>Hello World!<p/>
+</body>
+</html>
\ No newline at end of file