You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/05/20 23:17:56 UTC
svn commit: r776859 - in /lucene/tika/trunk/tika-core/src:
main/java/org/apache/tika/detect/ main/java/org/apache/tika/mime/
main/resources/mime/ test/java/org/apache/tika/mime/ test/resources/org/
test/resources/org/apache/ test/resources/org/apache/t...
Author: jukka
Date: Wed May 20 21:17:56 2009
New Revision: 776859
URL: http://svn.apache.org/viewvc?rev=776859&view=rev
Log:
TIKA-225: [PATCH] Various bugfixes for MIME detection
Applied the patch contributed by Jeremias Maerki
Added:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
lucene/tika/trunk/tika-core/src/test/resources/org/
lucene/tika/trunk/tika-core/src/test/resources/org/apache/
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/circles-with-prefix.svg
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/circles.svg
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/datamatrix.png (with props)
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/stylesheet.xsl
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-iso-8859-1.xml
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-long-comment.xml
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf16be.xml (with props)
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf16le.xml (with props)
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8.xml
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test.html
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
lucene/tika/trunk/tika-core/src/main/resources/mime/tika-mimetypes.xml
Added: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java?rev=776859&view=auto
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java (added)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java Wed May 20 21:17:56 2009
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.InputStream;
+
+import javax.xml.namespace.QName;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Utility class that uses a {@link SAXParser} to determine the namespace URI and local name of
+ * the root element of an XML file.
+ *
+ * @since Apache Tika 0.4
+ */
+public class XmlRootExtractor {
+
+    /**
+     * Parses just enough of the given XML document to learn the qualified
+     * name of its root element. Parsing is aborted as soon as the first
+     * start tag has been reported, so the data may be a truncated prefix
+     * of a larger document.
+     * <p>
+     * The data is treated as untrusted: external DTDs and external entities
+     * are never fetched (see {@link ExtractorHandler#resolveEntity}).
+     *
+     * @param data raw bytes of the XML document; may be {@code null} or empty
+     * @return the namespace URI and local name of the root element, or
+     *         {@code null} if no root element could be read
+     */
+    public static QName extractRootElement(byte[] data) {
+        if (data == null || data.length == 0) {
+            return null;
+        }
+
+        SAXParserFactory parserFactory = SAXParserFactory.newInstance();
+        parserFactory.setNamespaceAware(true);
+        parserFactory.setValidating(false);
+
+        ExtractorHandler handler = new ExtractorHandler();
+        try {
+            SAXParser parser = parserFactory.newSAXParser();
+            InputStream in = new java.io.ByteArrayInputStream(data);
+            parser.parse(in, handler);
+        } catch (Exception ignored) {
+            // Deliberately best-effort: the handler aborts parsing with a
+            // SAXException once the root element is known, and malformed or
+            // truncated input simply leaves handler.rootElement as null.
+        }
+        return handler.rootElement;
+    }
+
+    /**
+     * SAX handler that records the first element it sees and then aborts
+     * the parse.
+     */
+    private static class ExtractorHandler extends DefaultHandler {
+
+        /** Root element name, or {@code null} until a start tag is seen. */
+        private QName rootElement;
+
+        /**
+         * Resolves every external entity (including the external DTD
+         * subset) to an empty document so that the parser never opens a
+         * file or network connection on behalf of the input.
+         *
+         * @param publicId public identifier of the entity, ignored
+         * @param systemId system identifier of the entity, ignored
+         * @return an empty input source
+         */
+        @Override
+        public org.xml.sax.InputSource resolveEntity(String publicId, String systemId) {
+            return new org.xml.sax.InputSource(new java.io.StringReader(""));
+        }
+
+        /** {@inheritDoc} */
+        @Override
+        public void startElement(String uri, String localName, String name, Attributes attributes)
+                throws SAXException {
+            this.rootElement = new QName(uri, localName);
+            // Nothing after the root start tag is needed; stop the parser.
+            throw new SAXException("Aborting: root element received");
+        }
+
+    }
+
+}
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java?rev=776859&r1=776858&r2=776859&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java Wed May 20 21:17:56 2009
@@ -131,7 +131,7 @@
/**
* Returns the name of this media type.
- *
+ *
* @return media type name (lower case)
*/
public String getName() {
@@ -189,13 +189,13 @@
return true;
}
}
- return false;
+ return false;
}
}
/**
* Returns the description of this media type.
- *
+ *
* @return media type description
*/
public String getDescription() {
@@ -204,7 +204,7 @@
/**
* Set the description of this media type.
- *
+ *
* @param description media type description
*/
public void setDescription(String description) {
@@ -245,7 +245,7 @@
/**
* Add some rootXML info to this mime-type
- *
+ *
* @param namespaceURI
* @param localName
*/
@@ -265,6 +265,15 @@
return false;
}
+ /**
+  * Tells whether any root-XML rule registered for this type accepts the
+  * given root element.
+  *
+  * @param namespaceURI namespace URI of the document's root element
+  * @param localName local name of the document's root element
+  * @return true as soon as one rule matches, false if none does
+  */
+ boolean matchesXML(String namespaceURI, String localName) {
+     boolean matched = false;
+     for (RootXML candidate : rootXML) {
+         matched = candidate.matches(namespaceURI, localName);
+         if (matched) {
+             break;
+         }
+     }
+     return matched;
+ }
+
boolean hasRootXML() {
return (rootXML.size() > 0);
}
@@ -353,6 +362,23 @@
return pattern.matcher(data).matches();
}
+ /**
+  * Checks a parsed root element against this rule. A rule component
+  * (namespace or local name) that is empty acts as a wildcard; a non-empty
+  * component must equal the corresponding argument exactly.
+  *
+  * @param namespaceURI namespace URI of the root element
+  * @param localName local name of the root element
+  * @return true if both the namespace and the local name are accepted
+  */
+ boolean matches(String namespaceURI, String localName) {
+     // Namespace first: reject immediately on a mismatch
+     boolean namespaceAccepted = StringUtil.isEmpty(this.namespaceURI)
+             || this.namespaceURI.equals(namespaceURI);
+     if (!namespaceAccepted) {
+         return false;
+     }
+
+     // Then the root element's local name decides the outcome
+     return StringUtil.isEmpty(this.localName)
+             || this.localName.equals(localName);
+ }
+
MimeType getType() {
return type;
}
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=776859&r1=776858&r2=776859&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java Wed May 20 21:17:56 2009
@@ -23,12 +23,15 @@
import java.io.InputStream;
import java.net.URL;
import java.util.Arrays;
-import java.util.Map;
import java.util.HashMap;
+import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
+import javax.xml.namespace.QName;
+
import org.apache.tika.detect.Detector;
+import org.apache.tika.detect.XmlRootExtractor;
import org.apache.tika.metadata.Metadata;
/**
@@ -127,7 +130,7 @@
/**
* Find the Mime Content Type of a file.
- *
+ *
* @param file
* to analyze.
* @return the Mime Content Type of the specified file, or <code>null</code>
@@ -139,7 +142,7 @@
/**
* Find the Mime Content Type of a document from its URL.
- *
+ *
* @param url
* of the document to analyze.
* @return the Mime Content Type of the specified document URL, or
@@ -152,7 +155,7 @@
/**
* Find the Mime Content Type of a document from its name.
* Returns application/octet-stream if no better match is found.
- *
+ *
* @param name of the document to analyze.
* @return the Mime Content Type of the specified document name
*/
@@ -186,6 +189,7 @@
}
// First, check for XML descriptions (level by level)
+ // Problem: Regexp matching doesn't work for all XML encodings
for (MimeType type : xmls) {
if (type.matchesXML(data)) {
return type;
@@ -193,12 +197,32 @@
}
// Then, check for magic bytes
+ MimeType result = null;
for (Magic magic : magics) {
if (magic.eval(data)) {
- return magic.getType();
+ result = magic.getType();
+ break;
+ }
+ }
+ if (result != null) {
+ // When detecting generic XML, parse XML to determine the root element
+ if ("application/xml".equals(result.getName())) {
+ QName rootElement = XmlRootExtractor.extractRootElement(data);
+ if (rootElement != null) {
+ for (MimeType type : xmls) {
+ if (type.matchesXML(
+ rootElement.getNamespaceURI(),
+ rootElement.getLocalPart())) {
+ result = type;
+ break;
+ }
+ }
+ }
}
+ return result;
}
+
// Finally, assume plain text if no control bytes are found
for (int i = 0; i < data.length; i++) {
int b = data[i] & 0xFF; // prevent sign extension
@@ -302,7 +326,7 @@
* <li>If a type is found, then return it, otherwise try to find the type
* based on the file name</li>
* </ol>
- *
+ *
* @param name
* of the document to analyze.
* @param data
@@ -389,7 +413,7 @@
/**
* Adds a file name pattern for the given media type. Assumes that the
* pattern being added is <b>not</b> a JDK standard regular expression.
- *
+ *
* @param type
* media type
* @param pattern
@@ -408,7 +432,7 @@
* regular expression via the <code>isRegex</code> parameter. If the value
* is set to true, then a JDK standard regex is assumed, otherwise the
* freedesktop glob type is assumed.
- *
+ *
* @param type
* media type
* @param pattern
@@ -418,7 +442,7 @@
* false.
* @throws MimeTypeException
* if the pattern conflicts with existing ones.
- *
+ *
*/
public void addPattern(MimeType type, String pattern, boolean isRegex)
throws MimeTypeException {
@@ -428,7 +452,7 @@
/**
* Return the minimum length of data to provide to analyzing methods based
* on the document's content in order to check all the known MimeTypes.
- *
+ *
* @return the minimum length of data to provide.
* @see #getMimeType(byte[])
* @see #getMimeType(String, byte[])
@@ -440,7 +464,7 @@
/**
* Add the specified mime-type in the repository.
- *
+ *
* @param type
* is the mime-type to add.
*/
Modified: lucene/tika/trunk/tika-core/src/main/resources/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/resources/mime/tika-mimetypes.xml?rev=776859&r1=776858&r2=776859&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/resources/mime/tika-mimetypes.xml (original)
+++ lucene/tika/trunk/tika-core/src/main/resources/mime/tika-mimetypes.xml Wed May 20 21:17:56 2009
@@ -23,7 +23,7 @@
<mime-info>
<mime-type type="text/plain">
- <magic priority="50">
+ <magic priority="20">
<match value="This is TeX," type="string" offset="0" />
<match value="This is METAFONT," type="string" offset="0" />
<match value="#!/" type="string" offset="0" />
@@ -118,9 +118,6 @@
<glob pattern="*.xmap" />
<glob pattern="*.xroles" />
<glob pattern="*.xsamples" />
- <glob pattern="*.xsd" />
- <glob pattern="*.xsl" />
- <glob pattern="*.xslt" />
<glob pattern="*.xsp" />
<glob pattern="*.xul" />
<glob pattern="*.xweb" />
@@ -154,6 +151,7 @@
<mime-type type="application/xhtml+xml">
<sub-class-of type="application/xml" />
<glob pattern="*.xhtml" />
+ <glob pattern="*.xht" />
<root-XML namespaceURI="http://www.w3.org/1999/xhtml" localName="html" />
</mime-type>
@@ -564,11 +562,15 @@
<mime-type type="application/xml">
<sub-class-of type="text/plain" />
<magic priority="50">
- <match value="\<?xml" type="string" offset="0" />
- <match value="\<?XML" type="string" offset="0" />
+ <match value="<?xml" type="string" offset="0" />
+ <match value="<?XML" type="string" offset="0" />
+ <match value="0xFFFE3C003F0078006D006C00" type="string" offset="0" />
+ <match value="0xFEFF003C003F0078006D006C" type="string" offset="0" />
+ <!-- TODO: Add matches for the rest of the possible XML encoding schemes -->
</magic>
<alias type="text/xml" />
<glob pattern="*.xml" />
+ <glob pattern="*.xsd" />
</mime-type>
<mime-type type="image/svg+xml">
@@ -579,6 +581,16 @@
<glob pattern="*.svg" />
</mime-type>
+ <mime-type type="application/xslt+xml">
+ <sub-class-of type="application/xml" />
+ <acronym>XSLT</acronym>
+ <comment>XSL Transformations</comment>
+ <root-XML localName="stylesheet" namespaceURI="http://www.w3.org/1999/XSL/Transform" />
+ <alias type="text/xsl" />
+ <glob pattern="*.xsl" />
+ <glob pattern="*.xslt" />
+ </mime-type>
+
<mime-type type="application/x-mif">
<magic priority="50">
<match value="\<MakerFile" type="string" offset="0" />
@@ -669,10 +681,6 @@
<alias type="application/x-sh" />
</mime-type>
- <mime-type type="application/xhtml+xml">
- <glob pattern="*.xht" />
- </mime-type>
-
<mime-type type="audio/midi">
<acronym>MIDI</acronym>
<comment>Musical Instrument Digital Interface</comment>
Added: lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java?rev=776859&view=auto
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java (added)
+++ lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java Wed May 20 21:17:56 2009
@@ -0,0 +1,56 @@
+package org.apache.tika.mime;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+
+public class MimeDetectionTest extends TestCase {
+
+ private MimeTypes mimeTypes;
+
+ /** @inheritDoc */
+ @Override
+ protected void setUp() throws Exception {
+ super.setUp();
+ this.mimeTypes = TikaConfig.getDefaultConfig().getMimeRepository();
+ //this.mimeTypes = MimeTypesFactory.create("/org/apache/tika/mime/tika-mimetypes-minimal.xml");
+ }
+
+ public void testDetection() throws Exception {
+ testFile("image/svg+xml", "circles.svg");
+ testFile("image/svg+xml", "circles-with-prefix.svg");
+ testFile("image/png", "datamatrix.png");
+ testFile("text/html", "test.html");
+ testFile("application/xml", "test-iso-8859-1.xml");
+ testFile("application/xml", "test-utf8.xml");
+ testFile("application/xml", "test-utf16le.xml");
+ testFile("application/xml", "test-utf16be.xml");
+ testFile("application/xml", "test-long-comment.xml");
+ testFile("application/xslt+xml", "stylesheet.xsl");
+ }
+
+ private void testFile(String expected, String filename) throws IOException {
+ InputStream in = getClass().getResourceAsStream(filename);
+ assertNotNull("Test file not found: " + filename, in);
+ if (!in.markSupported()) {
+ in = new java.io.BufferedInputStream(in);
+ }
+ try {
+ Metadata metadata = new Metadata();
+ String mime = this.mimeTypes.detect(in, metadata).toString();
+ assertEquals(filename + " is not properly detected.", expected, mime);
+
+ //Add resource name and test again
+ metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
+ mime = this.mimeTypes.detect(in, metadata).toString();
+ assertEquals(filename + " is not properly detected.", expected, mime);
+ } finally {
+ in.close();
+ }
+ }
+
+}
Added: lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/circles-with-prefix.svg
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/circles-with-prefix.svg?rev=776859&view=auto
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/circles-with-prefix.svg (added)
+++ lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/circles-with-prefix.svg Wed May 20 21:17:56 2009
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg:svg xmlns:svg="http://www.w3.org/2000/svg" width="12cm" height="12cm">
+ <svg:g style="fill-opacity:0.7; stroke:black; stroke-width:0.1cm;">
+ <svg:circle cx="6cm" cy="2cm" r="100" style="fill:red;" transform="translate(0,50)" />
+ <svg:circle cx="6cm" cy="2cm" r="100" style="fill:blue;" transform="translate(70,150)" />
+ <svg:circle cx="6cm" cy="2cm" r="100" style="fill:green;" transform="translate(-70,150)"/>
+ </svg:g>
+</svg:svg>
\ No newline at end of file
Added: lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/circles.svg
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/circles.svg?rev=776859&view=auto
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/circles.svg (added)
+++ lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/circles.svg Wed May 20 21:17:56 2009
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg xmlns="http://www.w3.org/2000/svg" width="12cm" height="12cm">
+ <g style="fill-opacity:0.7; stroke:black; stroke-width:0.1cm;">
+ <circle cx="6cm" cy="2cm" r="100" style="fill:red;" transform="translate(0,50)" />
+ <circle cx="6cm" cy="2cm" r="100" style="fill:blue;" transform="translate(70,150)" />
+ <circle cx="6cm" cy="2cm" r="100" style="fill:green;" transform="translate(-70,150)"/>
+ </g>
+</svg>
\ No newline at end of file
Added: lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/datamatrix.png
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/datamatrix.png?rev=776859&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/datamatrix.png
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/stylesheet.xsl
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/stylesheet.xsl?rev=776859&view=auto
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/stylesheet.xsl (added)
+++ lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/stylesheet.xsl Wed May 20 21:17:56 2009
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="utf-8"?>
+<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+
+ <xsl:output method="xml" indent="yes"/>
+
+ <xsl:template match="/">
+ <test hello="world"/>
+ </xsl:template>
+</xsl:stylesheet>
Added: lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-iso-8859-1.xml
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-iso-8859-1.xml?rev=776859&view=auto
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-iso-8859-1.xml (added)
+++ lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-iso-8859-1.xml Wed May 20 21:17:56 2009
@@ -0,0 +1,2 @@
+<?xml version="1.0" encoding="ISO-8859-1"?>
+<test hello="world"/>
\ No newline at end of file
Added: lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-long-comment.xml
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-long-comment.xml?rev=776859&view=auto
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-long-comment.xml (added)
+++ lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-long-comment.xml Wed May 20 21:17:56 2009
@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="ISO-8859-1"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<?somepi blahblah test="ignore-me.xml" ?>
+<test hello="world"/>
\ No newline at end of file
Added: lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf16be.xml
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf16be.xml?rev=776859&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf16be.xml
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf16le.xml
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf16le.xml?rev=776859&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf16le.xml
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8.xml
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8.xml?rev=776859&view=auto
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8.xml (added)
+++ lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test-utf8.xml Wed May 20 21:17:56 2009
@@ -0,0 +1,2 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<test hello="world"/>
\ No newline at end of file
Added: lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test.html
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test.html?rev=776859&view=auto
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test.html (added)
+++ lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/mime/test.html Wed May 20 21:17:56 2009
@@ -0,0 +1,10 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
+<title>Hello World</title>
+</head>
+<body>
+ <p>Hello World!<p/>
+</body>
+</html>
\ No newline at end of file