You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2010/09/10 21:25:58 UTC
svn commit: r995951 - in /tika/trunk:
tika-core/src/main/resources/org/apache/tika/mime/
tika-parsers/src/main/java/org/apache/tika/parser/microsoft/
tika-parsers/src/test/java/org/apache/tika/parser/microsoft/
tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/
Author: nick
Date: Fri Sep 10 19:25:57 2010
New Revision: 995951
URL: http://svn.apache.org/viewvc?rev=995951&view=rev
Log:
Make the emf/wmf mimetypes returned for the OLE2 office files match those stored in the OOXML files, and refactor the container tests to reduce duplication (TIKA-509)
Added:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java
Modified:
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=995951&r1=995950&r2=995951&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Fri Sep 10 19:25:57 2010
@@ -2488,6 +2488,8 @@
<glob pattern="*.m14"/>
</mime-type>
<mime-type type="application/x-msmetafile">
+ <alias type="image/x-emf"/>
+ <alias type="image/x-wmf"/>
<acronym>WMF</acronym>
<comment>Windows Metafile</comment>
<glob pattern="*.wmf"/>
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=995951&r1=995950&r2=995951&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Fri Sep 10 19:25:57 2010
@@ -539,10 +539,10 @@ public class ExcelExtractor extends Abst
String mimeType = "";
switch (blip.getRecordId()) {
case EscherMetafileBlip.RECORD_ID_WMF:
- mimeType = "application/x-wmf";
+ mimeType = "image/x-wmf";
break;
case EscherMetafileBlip.RECORD_ID_EMF:
- mimeType = "application/x-emf";
+ mimeType = "image/x-emf";
break;
case EscherMetafileBlip.RECORD_ID_PICT:
mimeType = "image/x-pict";
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=995951&r1=995950&r2=995951&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Fri Sep 10 19:25:57 2010
@@ -92,10 +92,10 @@ public class WordExtractor extends Abstr
mimeType = "image/tiff";
}
if("wmf".equals(extension)) {
- mimeType = "application/x-wmf";
+ mimeType = "image/x-wmf";
}
if("emf".equals(extension)) {
- mimeType = "application/x-emf";
+ mimeType = "image/x-emf";
}
TikaInputStream stream = TikaInputStream.get(picture.getContent());
Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java?rev=995951&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java Fri Sep 10 19:25:57 2010
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.EmbeddedResourceHandler;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Parent class of tests that the various POI powered parsers are
+ * able to extract their embedded contents.
+ */
+public abstract class AbstractPOIContainerExtractionTest extends TestCase {
+ public static final MediaType TYPE_DOC = MediaType.application("msword");
+ public static final MediaType TYPE_PPT = MediaType.application("vnd.ms-powerpoint");
+ public static final MediaType TYPE_XLS = MediaType.application("vnd.ms-excel");
+ public static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
+ public static final MediaType TYPE_PPTX = MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation");
+ public static final MediaType TYPE_XLSX = MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+
+ public static final MediaType TYPE_JPG = MediaType.image("jpeg");
+ public static final MediaType TYPE_PNG = MediaType.image("png");
+ public static final MediaType TYPE_EMF = MediaType.image("x-emf");
+
+ protected TrackingHandler process(String filename, ContainerExtractor extractor, boolean recurse) throws Exception {
+ InputStream input = AbstractPOIContainerExtractionTest.class.getResourceAsStream(
+ "/test-documents/" + filename);
+ assertNotNull(filename + " not found", input);
+
+ TikaInputStream stream = TikaInputStream.get(input);
+ assertNotNull(stream);
+
+ assertEquals(true, extractor.isSupported(stream));
+
+ // Process it
+ TrackingHandler handler = new TrackingHandler();
+ if(recurse) {
+ extractor.extract(stream, extractor, handler);
+ } else {
+ extractor.extract(stream, null, handler);
+ }
+
+ // So they can check what happened
+ return handler;
+ }
+
+ protected static class TrackingHandler implements EmbeddedResourceHandler {
+ public List<String> filenames = new ArrayList<String>();
+ public List<MediaType> mediaTypes = new ArrayList<MediaType>();
+
+ public void handle(String filename, MediaType mediaType,
+ InputStream stream) {
+ filenames.add(filename);
+ mediaTypes.add(mediaType);
+ }
+ }
+}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java?rev=995951&r1=995950&r2=995951&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java Fri Sep 10 19:25:57 2010
@@ -16,33 +16,15 @@
*/
package org.apache.tika.parser.microsoft;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.List;
-
-import junit.framework.TestCase;
-
import org.apache.tika.extractor.ContainerExtractor;
-import org.apache.tika.extractor.EmbeddedResourceHandler;
import org.apache.tika.extractor.ParserContainerExtractor;
-import org.apache.tika.io.TikaInputStream;
import org.apache.tika.mime.MediaType;
/**
* Tests that the various POI powered parsers are
* able to extract their embedded contents.
*/
-public class POIContainerExtractionTest extends TestCase {
- private static final MediaType TYPE_DOC = MediaType.application("msword");
- private static final MediaType TYPE_PPT = MediaType.application("vnd.ms-powerpoint");
- private static final MediaType TYPE_XLS = MediaType.application("vnd.ms-excel");
- private static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
- private static final MediaType TYPE_PPTX = MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation");
- private static final MediaType TYPE_XLSX = MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet");
-
- private static final MediaType TYPE_JPG = MediaType.image("jpeg");
- private static final MediaType TYPE_PNG = MediaType.image("png");
- private static final MediaType TYPE_EMF = MediaType.application("x-emf");
+public class POIContainerExtractionTest extends AbstractPOIContainerExtractionTest {
/**
* For office files which don't have anything embedded in them
@@ -222,35 +204,4 @@ public class POIContainerExtractionTest
// Outlook with a pdf and another outlook message
// TODO
}
-
- private TrackingHandler process(String filename, ContainerExtractor extractor, boolean recurse) throws Exception {
- InputStream input = POIContainerExtractionTest.class.getResourceAsStream(
- "/test-documents/" + filename);
- assertNotNull(filename + " not found", input);
- TikaInputStream stream = TikaInputStream.get(input);
-
- assertEquals(true, extractor.isSupported(stream));
-
- // Process it
- TrackingHandler handler = new TrackingHandler();
- if(recurse) {
- extractor.extract(stream, extractor, handler);
- } else {
- extractor.extract(stream, null, handler);
- }
-
- // So they can check what happened
- return handler;
- }
-
- private static class TrackingHandler implements EmbeddedResourceHandler {
- private List<String> filenames = new ArrayList<String>();
- private List<MediaType> mediaTypes = new ArrayList<MediaType>();
-
- public void handle(String filename, MediaType mediaType,
- InputStream stream) {
- filenames.add(filename);
- mediaTypes.add(mediaType);
- }
- }
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java?rev=995951&r1=995950&r2=995951&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java Fri Sep 10 19:25:57 2010
@@ -16,37 +16,18 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.List;
-
-import junit.framework.TestCase;
-
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.ContainerAwareDetector;
import org.apache.tika.extractor.ContainerExtractor;
-import org.apache.tika.extractor.EmbeddedResourceHandler;
import org.apache.tika.extractor.ParserContainerExtractor;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.microsoft.AbstractPOIContainerExtractionTest;
/**
* Tests that the various POI OOXML powered parsers are
* able to extract their embedded contents.
*/
-public class OOXMLContainerExtractionTest extends TestCase {
- private static final MediaType TYPE_DOC = MediaType.application("msword");
- private static final MediaType TYPE_PPT = MediaType.application("vnd.ms-powerpoint");
- private static final MediaType TYPE_XLS = MediaType.application("vnd.ms-excel");
- private static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
- private static final MediaType TYPE_PPTX = MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation");
- private static final MediaType TYPE_XLSX = MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet");
-
- private static final MediaType TYPE_JPG = MediaType.image("jpeg");
- private static final MediaType TYPE_PNG = MediaType.image("png");
- private static final MediaType TYPE_EMF = MediaType.image("x-emf");
-
+public class OOXMLContainerExtractionTest extends AbstractPOIContainerExtractionTest {
private ContainerExtractor extractor;
@Override
@@ -220,46 +201,5 @@ public class OOXMLContainerExtractionTes
// PowerPoint with excel and word
// TODO
-
-
- // Outlook with a text file and a word document
- // TODO
-
-
- // Outlook with a pdf and another outlook message
- // TODO
- }
-
- private TrackingHandler process(String filename, ContainerExtractor extractor, boolean recurse) throws Exception {
- InputStream input = OOXMLContainerExtractionTest.class.getResourceAsStream(
- "/test-documents/" + filename);
- assertNotNull(filename + " not found", input);
-
- TikaInputStream stream = TikaInputStream.get(input);
- assertNotNull(stream);
-
- assertEquals(true, extractor.isSupported(stream));
-
- // Process it
- TrackingHandler handler = new TrackingHandler();
- if(recurse) {
- extractor.extract(stream, extractor, handler);
- } else {
- extractor.extract(stream, null, handler);
- }
-
- // So they can check what happened
- return handler;
- }
-
- private static class TrackingHandler implements EmbeddedResourceHandler {
- private List<String> filenames = new ArrayList<String>();
- private List<MediaType> mediaTypes = new ArrayList<MediaType>();
-
- public void handle(String filename, MediaType mediaType,
- InputStream stream) {
- filenames.add(filename);
- mediaTypes.add(mediaType);
- }
}
}