You are viewing a plain-text version of this content. (The canonical link was a hyperlink on the original archive page and is not preserved in this text version.)
Posted to commits@tika.apache.org by ni...@apache.org on 2011/11/25 16:24:09 UTC

svn commit: r1206212 - in /tika/trunk: tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java

Author: nick
Date: Fri Nov 25 15:24:09 2011
New Revision: 1206212

URL: http://svn.apache.org/viewvc?rev=1206212&view=rev
Log:
TIKA-789 Improve MPP detection based on info from Alex Ott

Modified:
    tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java

Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1206212&r1=1206211&r2=1206212&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Fri Nov 25 15:24:09 2011
@@ -1327,7 +1327,7 @@
   <mime-type type="application/vnd.ms-project">
     <glob pattern="*.mpp"/>
     <glob pattern="*.mpt"/>
-    <sub-class-of type="application/x-tika-ooxml"/>
+    <sub-class-of type="application/x-tika-msoffice"/>
   </mime-type>
 
   <mime-type type="application/vnd.ms-tnef">

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1206212&r1=1206211&r2=1206212&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Fri Nov 25 15:24:09 2011
@@ -72,9 +72,8 @@ public class POIFSContainerDetector impl
     /** Microsoft Project */
     public static final MediaType MPP = application("vnd.ms-project");
 
-    /** Regexp for matching the MPP Project Properties stream */
+    /** Regexp for matching the MPP Project Data stream */
     private static final Pattern mppDataMatch = Pattern.compile("\\s\\s\\s\\d+");
-    private static final Pattern mppPropsMatch = Pattern.compile("Props\\d+");
     
     public MediaType detect(InputStream input, Metadata metadata)
              throws IOException {
@@ -142,16 +141,13 @@ public class POIFSContainerDetector impl
                //  of embedded non-office file inside an OLE2 document
                // This is most commonly triggered on nested directories
                return OLE;
-            } else if (names.contains("\u0001CompObj")) {
+            } else if (names.contains("\u0001CompObj") &&
+                  (names.contains("Props") || names.contains("Props9") || names.contains("Props12"))) {
                // Could be Project, look for common name patterns
-               boolean matchedProps = false;
-               boolean matchedData = false;
                for (String name : names) {
-                  if (mppDataMatch.matcher(name).matches()) matchedData = true;
-                  if (mppPropsMatch.matcher(name).matches()) matchedProps = true;
-               }
-               if (matchedProps && matchedData) {
-                  return MPP;
+                  if (mppDataMatch.matcher(name).matches()) {
+                     return MPP;
+                  }
                }
             } else if (names.contains("\u0001Ole10Native")) {
                 return OLE;