You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2010/09/10 21:25:58 UTC

svn commit: r995951 - in /tika/trunk: tika-core/src/main/resources/org/apache/tika/mime/ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ tika-parsers/src/test/java/org/apache/tik...

Author: nick
Date: Fri Sep 10 19:25:57 2010
New Revision: 995951

URL: http://svn.apache.org/viewvc?rev=995951&view=rev
Log:
Make the emf/wmf mimetypes returned for the OLE2 office files match those stored in the OOXML files, and refactor the container tests to reduce duplication (TIKA-509)

Added:
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java
Modified:
    tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java

Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=995951&r1=995950&r2=995951&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Fri Sep 10 19:25:57 2010
@@ -2488,6 +2488,8 @@
     <glob pattern="*.m14"/>
   </mime-type>
   <mime-type type="application/x-msmetafile">
+    <alias type="image/x-emf"/>
+    <alias type="image/x-wmf"/>
     <acronym>WMF</acronym>
     <comment>Windows Metafile</comment>
     <glob pattern="*.wmf"/>

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=995951&r1=995950&r2=995951&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Fri Sep 10 19:25:57 2010
@@ -539,10 +539,10 @@ public class ExcelExtractor extends Abst
                     String mimeType = "";
                     switch (blip.getRecordId()) {
                     case EscherMetafileBlip.RECORD_ID_WMF:
-                       mimeType =  "application/x-wmf";
+                       mimeType =  "image/x-wmf";
                        break;
                     case EscherMetafileBlip.RECORD_ID_EMF:
-                       mimeType =  "application/x-emf";
+                       mimeType =  "image/x-emf";
                        break;
                     case EscherMetafileBlip.RECORD_ID_PICT:
                        mimeType =  "image/x-pict";

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=995951&r1=995950&r2=995951&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Fri Sep 10 19:25:57 2010
@@ -92,10 +92,10 @@ public class WordExtractor extends Abstr
                  mimeType =  "image/tiff";
               }
               if("wmf".equals(extension)) {
-                 mimeType =  "application/x-wmf";
+                 mimeType =  "image/x-wmf";
               }
               if("emf".equals(extension)) {
-                 mimeType =  "application/x-emf";
+                 mimeType =  "image/x-emf";
               }
               
               TikaInputStream stream = TikaInputStream.get(picture.getContent());

Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java?rev=995951&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java Fri Sep 10 19:25:57 2010
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.EmbeddedResourceHandler;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Base class for tests checking that the various POI powered parsers
+ *  are able to extract their embedded contents.
+ */
+public abstract class AbstractPOIContainerExtractionTest extends TestCase {
+    public static final MediaType TYPE_DOC = MediaType.application("msword");
+    public static final MediaType TYPE_PPT = MediaType.application("vnd.ms-powerpoint");
+    public static final MediaType TYPE_XLS = MediaType.application("vnd.ms-excel");
+    public static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
+    public static final MediaType TYPE_PPTX = MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation");
+    public static final MediaType TYPE_XLSX = MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+    
+    public static final MediaType TYPE_JPG = MediaType.image("jpeg");
+    public static final MediaType TYPE_PNG = MediaType.image("png");
+    public static final MediaType TYPE_EMF = MediaType.image("x-emf");
+    
+    protected TrackingHandler process(String filename, ContainerExtractor extractor, boolean recurse) throws Exception {
+        InputStream input = AbstractPOIContainerExtractionTest.class.getResourceAsStream(
+             "/test-documents/" + filename);
+        assertNotNull(filename + " not found", input);
+        
+        TikaInputStream stream = TikaInputStream.get(input);
+        assertNotNull(stream);
+        
+        assertEquals(true, extractor.isSupported(stream));
+        
+        // Run the extraction, collecting every reported resource in a fresh handler
+        TrackingHandler handler = new TrackingHandler();
+        if(recurse) {
+           extractor.extract(stream, extractor, handler); // extractor itself is passed as the second argument
+        } else {
+           extractor.extract(stream, null, handler); // null is passed as the second argument
+        }
+        
+        // Return the handler so the calling test can check what was recorded
+        return handler;
+    }
+    
+    protected static class TrackingHandler implements EmbeddedResourceHandler { // records each handle() call
+       public List<String> filenames = new ArrayList<String>();
+       public List<MediaType> mediaTypes = new ArrayList<MediaType>();
+       
+       public void handle(String filename, MediaType mediaType,
+            InputStream stream) { // stream is not read here
+          filenames.add(filename);
+          mediaTypes.add(mediaType);
+      }
+    }
+}

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java?rev=995951&r1=995950&r2=995951&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java Fri Sep 10 19:25:57 2010
@@ -16,33 +16,15 @@
  */
 package org.apache.tika.parser.microsoft;
 
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.List;
-
-import junit.framework.TestCase;
-
 import org.apache.tika.extractor.ContainerExtractor;
-import org.apache.tika.extractor.EmbeddedResourceHandler;
 import org.apache.tika.extractor.ParserContainerExtractor;
-import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.mime.MediaType;
 
 /**
  * Tests that the various POI powered parsers are
  *  able to extract their embedded contents.
  */
-public class POIContainerExtractionTest extends TestCase {
-    private static final MediaType TYPE_DOC = MediaType.application("msword");
-    private static final MediaType TYPE_PPT = MediaType.application("vnd.ms-powerpoint");
-    private static final MediaType TYPE_XLS = MediaType.application("vnd.ms-excel");
-    private static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
-    private static final MediaType TYPE_PPTX = MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation");
-    private static final MediaType TYPE_XLSX = MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet");
-    
-    private static final MediaType TYPE_JPG = MediaType.image("jpeg");
-    private static final MediaType TYPE_PNG = MediaType.image("png");
-    private static final MediaType TYPE_EMF = MediaType.application("x-emf");
+public class POIContainerExtractionTest extends AbstractPOIContainerExtractionTest {
    
     /**
      * For office files which don't have anything embedded in them
@@ -222,35 +204,4 @@ public class POIContainerExtractionTest 
        // Outlook with a pdf and another outlook message
        // TODO
     }
-    
-    private TrackingHandler process(String filename, ContainerExtractor extractor, boolean recurse) throws Exception {
-        InputStream input = POIContainerExtractionTest.class.getResourceAsStream(
-             "/test-documents/" + filename);
-        assertNotNull(filename + " not found", input);
-        TikaInputStream stream = TikaInputStream.get(input);
-        
-        assertEquals(true, extractor.isSupported(stream));
-        
-        // Process it
-        TrackingHandler handler = new TrackingHandler();
-        if(recurse) {
-           extractor.extract(stream, extractor, handler);
-        } else {
-           extractor.extract(stream, null, handler);
-        }
-        
-        // So they can check what happened
-        return handler;
-    }
-    
-    private static class TrackingHandler implements EmbeddedResourceHandler {
-       private List<String> filenames = new ArrayList<String>();
-       private List<MediaType> mediaTypes = new ArrayList<MediaType>();
-       
-       public void handle(String filename, MediaType mediaType,
-            InputStream stream) {
-          filenames.add(filename);
-          mediaTypes.add(mediaType);
-      }
-    }
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java?rev=995951&r1=995950&r2=995951&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java Fri Sep 10 19:25:57 2010
@@ -16,37 +16,18 @@
  */
 package org.apache.tika.parser.microsoft.ooxml;
 
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.List;
-
-import junit.framework.TestCase;
-
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.ContainerAwareDetector;
 import org.apache.tika.extractor.ContainerExtractor;
-import org.apache.tika.extractor.EmbeddedResourceHandler;
 import org.apache.tika.extractor.ParserContainerExtractor;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.microsoft.AbstractPOIContainerExtractionTest;
 
 /**
  * Tests that the various POI OOXML powered parsers are
  *  able to extract their embedded contents.
  */
-public class OOXMLContainerExtractionTest extends TestCase {
-    private static final MediaType TYPE_DOC = MediaType.application("msword");
-    private static final MediaType TYPE_PPT = MediaType.application("vnd.ms-powerpoint");
-    private static final MediaType TYPE_XLS = MediaType.application("vnd.ms-excel");
-    private static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
-    private static final MediaType TYPE_PPTX = MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation");
-    private static final MediaType TYPE_XLSX = MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet");
-    
-    private static final MediaType TYPE_JPG = MediaType.image("jpeg");
-    private static final MediaType TYPE_PNG = MediaType.image("png");
-    private static final MediaType TYPE_EMF = MediaType.image("x-emf");
-    
+public class OOXMLContainerExtractionTest extends AbstractPOIContainerExtractionTest {
     private ContainerExtractor extractor;
     
     @Override
@@ -220,46 +201,5 @@ public class OOXMLContainerExtractionTes
        
        // PowerPoint with excel and word
        // TODO
-       
-       
-       // Outlook with a text file and a word document
-       // TODO
-       
-       
-       // Outlook with a pdf and another outlook message
-       // TODO
-    }
-    
-    private TrackingHandler process(String filename, ContainerExtractor extractor, boolean recurse) throws Exception {
-        InputStream input = OOXMLContainerExtractionTest.class.getResourceAsStream(
-             "/test-documents/" + filename);
-        assertNotNull(filename + " not found", input);
-        
-        TikaInputStream stream = TikaInputStream.get(input);
-        assertNotNull(stream);
-        
-        assertEquals(true, extractor.isSupported(stream));
-        
-        // Process it
-        TrackingHandler handler = new TrackingHandler();
-        if(recurse) {
-           extractor.extract(stream, extractor, handler);
-        } else {
-           extractor.extract(stream, null, handler);
-        }
-        
-        // So they can check what happened
-        return handler;
-    }
-    
-    private static class TrackingHandler implements EmbeddedResourceHandler {
-       private List<String> filenames = new ArrayList<String>();
-       private List<MediaType> mediaTypes = new ArrayList<MediaType>();
-       
-       public void handle(String filename, MediaType mediaType,
-            InputStream stream) {
-          filenames.add(filename);
-          mediaTypes.add(mediaType);
-      }
     }
 }