You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/11/25 16:26:02 UTC

svn commit: r1206213 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/microsoft/OfficeParser.java test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java

Author: nick
Date: Fri Nov 25 15:26:02 2011
New Revision: 1206213

URL: http://svn.apache.org/viewvc?rev=1206213&view=rev
Log:
TIKA-789 Add (metadata only) Project support to OfficeParser, and add a unit test that checks we correctly get Project metadata back from our sample files

Added:
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=1206213&r1=1206212&r2=1206213&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Fri Nov 25 15:26:02 2011
@@ -81,6 +81,7 @@ public class OfficeParser extends Abstra
         ENCRYPTED("ole", MediaType.application("x-tika-msoffice")),
         POWERPOINT("ppt", MediaType.application("vnd.ms-powerpoint")),
         PUBLISHER("pub", MediaType.application("x-mspublisher")),
+        PROJECT("mpp", MediaType.application("vnd.ms-project")),
         VISIO("vsd", MediaType.application("vnd.visio")),
         WORKS("wps", MediaType.application("vnd.ms-works")),
         OUTLOOK("msg", MediaType.application("vnd.ms-outlook"));
@@ -119,6 +120,7 @@ public class OfficeParser extends Abstra
             return UNKNOWN;
         }
 
+        // TODO Avoid this duplication with POIFSContainerDetector (TIKA-790)
         private final static Map<String,POIFSDocumentType> typeMap = new HashMap<String,POIFSDocumentType>();
         static {
             typeMap.put("Workbook", WORKBOOK);
@@ -129,6 +131,9 @@ public class OfficeParser extends Abstra
             typeMap.put("VisioDocument", VISIO);
             typeMap.put("CONTENTS", WORKS);
             typeMap.put("\u0001Ole10Native", POIFSDocumentType.OLE10_NATIVE);
+            typeMap.put("Props", PROJECT);  // Project 8
+            typeMap.put("Props9", PROJECT); // Project 9, 10, 11
+            typeMap.put("Props12", PROJECT); // Project 12+
         }
 
         public static POIFSDocumentType detectType(Entry entry) {
@@ -210,6 +215,9 @@ public class OfficeParser extends Abstra
                     Locale locale = context.get(Locale.class, Locale.getDefault());
                     new ExcelExtractor(context).parse(root, xhtml, locale);
                     break;
+                case PROJECT:
+                    // We currently can't do anything beyond the metadata
+                    break;
                 case VISIO:
                     VisioTextExtractor visioTextExtractor =
                         new VisioTextExtractor(root);

Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java?rev=1206213&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java Fri Nov 25 15:26:02 2011
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests for Microsoft Project (MPP) files. Each test parses a sample
+ * file with OfficeParser and checks the metadata that comes back.
+ * Note: there is currently no dedicated Project parser, so all that
+ * is available (and checked here) is the common office metadata.
+ */
+public class ProjectParserTest extends TestCase {
+    public void testProject2003() throws Exception {
+       InputStream input = ProjectParserTest.class.getResourceAsStream(
+             "/test-documents/testPROJECT2003.mpp");
+       try {
+          doTestProject(input);
+       } finally {
+          input.close();
+       }
+    }
+
+    public void testProject2007() throws Exception {
+        InputStream input = ProjectParserTest.class.getResourceAsStream(
+                "/test-documents/testPROJECT2007.mpp");
+        try {
+            doTestProject(input);
+        } finally {
+            input.close();
+        }
+    }
+
+    private void doTestProject(InputStream input) throws Exception {
+       Metadata metadata = new Metadata();
+       ContentHandler handler = new BodyContentHandler();
+       new OfficeParser().parse(input, handler, metadata, new ParseContext());
+
+       assertEquals(
+               "application/vnd.ms-project",
+               metadata.get(Metadata.CONTENT_TYPE));
+       // Document properties; both sample files are checked against the same values
+       assertEquals("The quick brown fox jumps over the lazy dog", metadata.get(Metadata.TITLE));
+       assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(Metadata.SUBJECT));
+       assertEquals("Nevin Nollop", metadata.get(Metadata.AUTHOR));
+       assertEquals("", metadata.get(Metadata.LAST_AUTHOR)); // expected as an empty string, not null
+       assertEquals("Pangram, fox, dog", metadata.get(Metadata.KEYWORDS));
+       assertEquals("Comment Vulpes vulpes comment", metadata.get(Metadata.COMMENTS));
+       
+       assertEquals("Category1", metadata.get(Metadata.CATEGORY));
+       assertEquals("Mr Burns", metadata.get(Metadata.MANAGER));
+       assertEquals("CompanyA", metadata.get(Metadata.COMPANY));
+       // Creation and last-saved timestamps, compared as ISO 8601 UTC strings
+       assertEquals("2011-11-24T10:58:00Z", metadata.get(Metadata.CREATION_DATE));
+       assertEquals("2011-11-24T11:31:00Z", metadata.get(Metadata.LAST_SAVED));
+       
+       // Project-specific custom properties are expected under the "custom:" key prefix
+       assertEquals("0%", metadata.get("custom:% Complete"));
+       assertEquals("0%", metadata.get("custom:% Work Complete"));
+       assertEquals("\u00a3"+"0.00", metadata.get("custom:Cost")); // U+00A3 is the pound sign
+       assertEquals("2d?", metadata.get("custom:Duration"));
+       assertEquals("16h", metadata.get("custom:Work"));
+       
+       // No textual content is extracted for Project files yet, so the handler output is empty
+       String content = handler.toString();
+       assertEquals("", content);
+    }
+}