You are viewing a plain text version of this content. The canonical link (originally a hyperlink on the word "here") was not preserved in this plain text version.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/03/31 23:50:03 UTC
svn commit: r1087450 - in /tika/trunk:
tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
Author: nick
Date: Thu Mar 31 21:50:02 2011
New Revision: 1087450
URL: http://svn.apache.org/viewvc?rev=1087450&view=rev
Log:
TIKA-629 - Add detection for .asf, .wmv and .wma (including tests)
Adds support for unicodeLE and unicodeBE strings in the mimetypes reader
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java?rev=1087450&r1=1087449&r2=1087450&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java Thu Mar 31 21:50:02 2011
@@ -16,17 +16,7 @@
*/
package org.apache.tika.mime;
-import org.apache.tika.detect.MagicDetector;
-import org.w3c.dom.Attr;
-import org.w3c.dom.Node;
-import org.w3c.dom.Element;
-import org.w3c.dom.Document;
-import org.w3c.dom.NodeList;
-import org.w3c.dom.NamedNodeMap;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
-
-import java.io.ByteArrayOutputStream;
+import java.io.CharArrayWriter;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
@@ -36,6 +26,16 @@ import javax.xml.parsers.DocumentBuilder
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
+import org.apache.tika.detect.MagicDetector;
+import org.w3c.dom.Attr;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
/**
* A reader for XML files compliant with the freedesktop MIME-info DTD.
*
@@ -285,9 +285,9 @@ final class MimeTypesReader implements M
radix = 8;
}
- if (type.equals("string")) {
- decoded = decodeString(value);
-
+ if (type.equals("string") || type.equals("unicodeLE") || type.equals("unicodeBE")) {
+ decoded = decodeString(value, type);
+
} else if (type.equals("byte")) {
decoded = tmpVal.getBytes();
@@ -315,18 +315,18 @@ final class MimeTypesReader implements M
return decoded;
}
- private byte[] decodeString(String value) throws MimeTypeException {
+ private byte[] decodeString(String value, String type) throws MimeTypeException {
if (value.startsWith("0x")) {
- byte[] bytes = new byte[(value.length() - 2) / 2];
- for (int i = 0; i < bytes.length; i++) {
- bytes[i] = (byte)
+ byte[] vals = new byte[(value.length() - 2) / 2];
+ for (int i = 0; i < vals.length; i++) {
+ vals[i] = (byte)
Integer.parseInt(value.substring(2 + i * 2, 4 + i * 2), 16);
}
- return bytes;
+ return vals;
}
try {
- ByteArrayOutputStream decoded = new ByteArrayOutputStream();
+ CharArrayWriter decoded = new CharArrayWriter();
for (int i = 0; i < value.length(); i++) {
if (value.charAt(i) == '\\') {
@@ -351,7 +351,33 @@ final class MimeTypesReader implements M
decoded.write(value.charAt(i));
}
}
- return decoded.toByteArray();
+
+ // Now turn the chars into bytes
+ char[] chars = decoded.toCharArray();
+ byte[] bytes;
+ if("unicodeLE".equals(type)) {
+ bytes = new byte[chars.length*2];
+ for(int i=0; i<chars.length; i++) {
+ bytes[i*2] = (byte)(chars[i] & 0xff);
+ bytes[i*2+1] = (byte)(chars[i] >> 8);
+ }
+ }
+ else if("unicodeBE".equals(type)) {
+ bytes = new byte[chars.length*2];
+ for(int i=0; i<chars.length; i++) {
+ bytes[i*2] = (byte)(chars[i] >> 8);
+ bytes[i*2+1] = (byte)(chars[i] & 0xff);
+ }
+ }
+ else {
+ // Copy with truncation
+ bytes = new byte[chars.length];
+ for(int i=0; i<bytes.length; i++) {
+ bytes[i] = (byte)chars[i];
+ }
+ }
+
+ return bytes;
} catch (NumberFormatException e) {
throw new MimeTypeException("Invalid string value: " + value, e);
}
Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1087450&r1=1087449&r2=1087450&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Thu Mar 31 21:50:02 2011
@@ -3081,7 +3081,11 @@
<glob pattern="*.wax"/>
</mime-type>
<mime-type type="audio/x-ms-wma">
+ <sub-class-of type="video/x-ms-asf" />
<glob pattern="*.wma"/>
+ <magic priority="50">
+ <match value="Windows Media Audio" type="unicodeLE" offset="0:8192" />
+ </magic>
</mime-type>
<mime-type type="audio/x-pn-realaudio">
@@ -4109,12 +4113,19 @@
<mime-type type="video/x-ms-asf">
<glob pattern="*.asf"/>
<glob pattern="*.asx"/>
+ <magic>
+ <match value="0x3026b275" type="big32" offset="0" />
+ </magic>
</mime-type>
<mime-type type="video/x-ms-wm">
<glob pattern="*.wm"/>
</mime-type>
<mime-type type="video/x-ms-wmv">
+ <sub-class-of type="video/x-ms-asf" />
<glob pattern="*.wmv"/>
+ <magic priority="60">
+ <match value="Windows Media Video" type="unicodeLE" offset="0:8192" />
+ </magic>
</mime-type>
<mime-type type="video/x-ms-wmx">
<glob pattern="*.wmx"/>
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1087450&r1=1087449&r2=1087450&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Thu Mar 31 21:50:02 2011
@@ -334,6 +334,16 @@ public class TestMimeTypes extends TestC
assertTypeByName("application/postscript", "x.epsf");
assertTypeByName("application/postscript", "x.epsi");
}
+
+ public void testMicrosoftMultiMedia() throws Exception {
+ assertTypeByName("video/x-ms-asf", "x.asf");
+ assertTypeByName("video/x-ms-wmv", "x.wmv");
+ assertTypeByName("audio/x-ms-wma", "x.wma");
+
+ assertTypeByData("video/x-ms-asf", "testASF.asf");
+ assertTypeByData("video/x-ms-wmv", "testWMV.wmv");
+ assertTypeByData("audio/x-ms-wma", "testWMA.wma");
+ }
/**
* @since TIKA-194
@@ -439,6 +449,7 @@ public class TestMimeTypes extends TestC
throws IOException {
InputStream stream = TestMimeTypes.class.getResourceAsStream(
"/test-documents/" + filename);
+ assertNotNull("Test file not found: " + filename, stream);
try {
Metadata metadata = new Metadata();
assertEquals(expected, repo.detect(stream, metadata).toString());