You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/11/25 16:26:02 UTC
svn commit: r1206213 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/microsoft/OfficeParser.java
test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java
Author: nick
Date: Fri Nov 25 15:26:02 2011
New Revision: 1206213
URL: http://svn.apache.org/viewvc?rev=1206213&view=rev
Log:
TIKA-789 Add (metadata only) Project support to OfficeParser, and add a unit test that checks we correctly get Project metadata back from our sample files
Added:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=1206213&r1=1206212&r2=1206213&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Fri Nov 25 15:26:02 2011
@@ -81,6 +81,7 @@ public class OfficeParser extends Abstra
ENCRYPTED("ole", MediaType.application("x-tika-msoffice")),
POWERPOINT("ppt", MediaType.application("vnd.ms-powerpoint")),
PUBLISHER("pub", MediaType.application("x-mspublisher")),
+ PROJECT("mpp", MediaType.application("vnd.ms-project")),
VISIO("vsd", MediaType.application("vnd.visio")),
WORKS("wps", MediaType.application("vnd.ms-works")),
OUTLOOK("msg", MediaType.application("vnd.ms-outlook"));
@@ -119,6 +120,7 @@ public class OfficeParser extends Abstra
return UNKNOWN;
}
+ // TODO Avoid this duplication with POIFSContainerDetector (TIKA-790)
private final static Map<String,POIFSDocumentType> typeMap = new HashMap<String,POIFSDocumentType>();
static {
typeMap.put("Workbook", WORKBOOK);
@@ -129,6 +131,9 @@ public class OfficeParser extends Abstra
typeMap.put("VisioDocument", VISIO);
typeMap.put("CONTENTS", WORKS);
typeMap.put("\u0001Ole10Native", POIFSDocumentType.OLE10_NATIVE);
+ typeMap.put("Props", PROJECT); // Project 8
+ typeMap.put("Props9", PROJECT); // Project 9, 10, 11
+ typeMap.put("Props12", PROJECT); // Project 12+
}
public static POIFSDocumentType detectType(Entry entry) {
@@ -210,6 +215,9 @@ public class OfficeParser extends Abstra
Locale locale = context.get(Locale.class, Locale.getDefault());
new ExcelExtractor(context).parse(root, xhtml, locale);
break;
+ case PROJECT:
+ // We currently can't do anything beyond the metadata
+ break;
case VISIO:
VisioTextExtractor visioTextExtractor =
new VisioTextExtractor(root);
Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java?rev=1206213&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java Fri Nov 25 15:26:02 2011
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests for Microsoft Project (MPP) files.
+ *
+ * Note - we don't currently have a dedicated Project parser; all we
+ * get back is the common office metadata, with no textual content.
+ */
+public class ProjectParserTest extends TestCase {
+    public void testProject2003() throws Exception {
+        doTestProject("/test-documents/testPROJECT2003.mpp");
+    }
+
+    public void testProject2007() throws Exception {
+        doTestProject("/test-documents/testPROJECT2007.mpp");
+    }
+
+    /**
+     * Parses the given classpath resource with OfficeParser and checks
+     * the metadata and (empty) text content expected of the samples.
+     *
+     * @param resource classpath location of the sample MPP file
+     * @throws Exception if the sample cannot be read or parsed
+     */
+    private void doTestProject(String resource) throws Exception {
+        InputStream input = ProjectParserTest.class.getResourceAsStream(resource);
+        // Fail clearly, not with a masked NPE, if the sample file is missing
+        assertNotNull("Test file not found: " + resource, input);
+
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        try {
+            new OfficeParser().parse(input, handler, metadata, new ParseContext());
+        } finally {
+            input.close();
+        }
+
+        assertEquals("application/vnd.ms-project", metadata.get(Metadata.CONTENT_TYPE));
+
+        assertEquals("The quick brown fox jumps over the lazy dog", metadata.get(Metadata.TITLE));
+        assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(Metadata.SUBJECT));
+        assertEquals("Nevin Nollop", metadata.get(Metadata.AUTHOR));
+        assertEquals("", metadata.get(Metadata.LAST_AUTHOR));
+        assertEquals("Pangram, fox, dog", metadata.get(Metadata.KEYWORDS));
+        assertEquals("Comment Vulpes vulpes comment", metadata.get(Metadata.COMMENTS));
+
+        assertEquals("Category1", metadata.get(Metadata.CATEGORY));
+        assertEquals("Mr Burns", metadata.get(Metadata.MANAGER));
+        assertEquals("CompanyA", metadata.get(Metadata.COMPANY));
+
+        assertEquals("2011-11-24T10:58:00Z", metadata.get(Metadata.CREATION_DATE));
+        assertEquals("2011-11-24T11:31:00Z", metadata.get(Metadata.LAST_SAVED));
+
+        // Custom Project metadata is present with prefix
+        assertEquals("0%", metadata.get("custom:% Complete"));
+        assertEquals("0%", metadata.get("custom:% Work Complete"));
+        assertEquals("\u00a3"+"0.00", metadata.get("custom:Cost"));
+        assertEquals("2d?", metadata.get("custom:Duration"));
+        assertEquals("16h", metadata.get("custom:Work"));
+
+        // Currently, we don't do textual contents of the file
+        assertEquals("", handler.toString());
+    }
+}