You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/03/31 23:50:03 UTC

svn commit: r1087450 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java

Author: nick
Date: Thu Mar 31 21:50:02 2011
New Revision: 1087450

URL: http://svn.apache.org/viewvc?rev=1087450&view=rev
Log:
TIKA-629 - Add detection for .asf, .wmv and .wma (including tests)
Add support for unicodeLE and unicodeBE match types (strings encoded as two bytes per char, little- or big-endian) in the mimetypes reader

Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
    tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java?rev=1087450&r1=1087449&r2=1087450&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java Thu Mar 31 21:50:02 2011
@@ -16,17 +16,7 @@
  */
 package org.apache.tika.mime;
 
-import org.apache.tika.detect.MagicDetector;
-import org.w3c.dom.Attr;
-import org.w3c.dom.Node;
-import org.w3c.dom.Element;
-import org.w3c.dom.Document;
-import org.w3c.dom.NodeList;
-import org.w3c.dom.NamedNodeMap;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
-
-import java.io.ByteArrayOutputStream;
+import java.io.CharArrayWriter;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
@@ -36,6 +26,16 @@ import javax.xml.parsers.DocumentBuilder
 import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.parsers.ParserConfigurationException;
 
+import org.apache.tika.detect.MagicDetector;
+import org.w3c.dom.Attr;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
 /**
  * A reader for XML files compliant with the freedesktop MIME-info DTD.
  * 
@@ -285,9 +285,9 @@ final class MimeTypesReader implements M
             radix = 8;
         }
 
-        if (type.equals("string")) {
-            decoded = decodeString(value);
-
+        if (type.equals("string") || type.equals("unicodeLE") || type.equals("unicodeBE")) {
+            decoded = decodeString(value, type);
+            
         } else if (type.equals("byte")) {
             decoded = tmpVal.getBytes();
 
@@ -315,18 +315,18 @@ final class MimeTypesReader implements M
         return decoded;
     }
 
-    private byte[] decodeString(String value) throws MimeTypeException {
+    private byte[] decodeString(String value, String type) throws MimeTypeException {
         if (value.startsWith("0x")) {
-            byte[] bytes = new byte[(value.length() - 2) / 2];
-            for (int i = 0; i < bytes.length; i++) {
-                bytes[i] = (byte)
+            byte[] vals = new byte[(value.length() - 2) / 2];
+            for (int i = 0; i < vals.length; i++) {
+                vals[i] = (byte)
                 Integer.parseInt(value.substring(2 + i * 2, 4 + i * 2), 16);
             }
-            return bytes;
+            return vals;
         }
 
         try {
-            ByteArrayOutputStream decoded = new ByteArrayOutputStream();
+            CharArrayWriter decoded = new CharArrayWriter();
 
             for (int i = 0; i < value.length(); i++) {
                 if (value.charAt(i) == '\\') {
@@ -351,7 +351,33 @@ final class MimeTypesReader implements M
                     decoded.write(value.charAt(i));
                 }
             }
-            return decoded.toByteArray();
+            
+            // Now turn the chars into bytes
+            char[] chars = decoded.toCharArray();
+            byte[] bytes;
+            if("unicodeLE".equals(type)) {
+               bytes = new byte[chars.length*2];
+               for(int i=0; i<chars.length; i++) {
+                  bytes[i*2] = (byte)(chars[i] & 0xff);
+                  bytes[i*2+1] = (byte)(chars[i] >> 8);
+               }
+            }
+            else if("unicodeBE".equals(type)) {
+               bytes = new byte[chars.length*2];
+               for(int i=0; i<chars.length; i++) {
+                  bytes[i*2] = (byte)(chars[i] >> 8);
+                  bytes[i*2+1] = (byte)(chars[i] & 0xff);
+               }
+            }
+            else {
+               // Copy with truncation
+               bytes = new byte[chars.length];
+               for(int i=0; i<bytes.length; i++) {
+                  bytes[i] = (byte)chars[i];
+               }
+            }
+            
+            return bytes;
         } catch (NumberFormatException e) {
             throw new MimeTypeException("Invalid string value: " + value, e);
         }

Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1087450&r1=1087449&r2=1087450&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Thu Mar 31 21:50:02 2011
@@ -3081,7 +3081,11 @@
     <glob pattern="*.wax"/>
   </mime-type>
   <mime-type type="audio/x-ms-wma">
+    <sub-class-of type="video/x-ms-asf" />
     <glob pattern="*.wma"/>
+    <magic priority="50">
+       <match value="Windows Media Audio" type="unicodeLE" offset="0:8192" />
+    </magic>
   </mime-type>
 
   <mime-type type="audio/x-pn-realaudio">
@@ -4109,12 +4113,19 @@
   <mime-type type="video/x-ms-asf">
     <glob pattern="*.asf"/>
     <glob pattern="*.asx"/>
+    <magic>
+       <match value="0x3026b275" type="big32" offset="0" />
+    </magic>
   </mime-type>
   <mime-type type="video/x-ms-wm">
     <glob pattern="*.wm"/>
   </mime-type>
   <mime-type type="video/x-ms-wmv">
+    <sub-class-of type="video/x-ms-asf" />
     <glob pattern="*.wmv"/>
+    <magic priority="60">
+       <match value="Windows Media Video" type="unicodeLE" offset="0:8192" />
+    </magic>
   </mime-type>
   <mime-type type="video/x-ms-wmx">
     <glob pattern="*.wmx"/>

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=1087450&r1=1087449&r2=1087450&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Thu Mar 31 21:50:02 2011
@@ -334,6 +334,16 @@ public class TestMimeTypes extends TestC
         assertTypeByName("application/postscript", "x.epsf");
         assertTypeByName("application/postscript", "x.epsi");
     }
+    
+    public void testMicrosoftMultiMedia() throws Exception {
+       assertTypeByName("video/x-ms-asf", "x.asf");
+       assertTypeByName("video/x-ms-wmv", "x.wmv");
+       assertTypeByName("audio/x-ms-wma", "x.wma");
+       
+       assertTypeByData("video/x-ms-asf", "testASF.asf");
+       assertTypeByData("video/x-ms-wmv", "testWMV.wmv");
+       assertTypeByData("audio/x-ms-wma", "testWMA.wma");
+    }
 
     /**
      * @since TIKA-194
@@ -439,6 +449,7 @@ public class TestMimeTypes extends TestC
             throws IOException {
         InputStream stream = TestMimeTypes.class.getResourceAsStream(
                 "/test-documents/" + filename);
+        assertNotNull("Test file not found: " + filename, stream);
         try {
             Metadata metadata = new Metadata();
             assertEquals(expected, repo.detect(stream, metadata).toString());