You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain text copy.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/11/25 16:24:09 UTC
svn commit: r1206212 - in /tika/trunk:
tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
Author: nick
Date: Fri Nov 25 15:24:09 2011
New Revision: 1206212
URL: http://svn.apache.org/viewvc?rev=1206212&view=rev
Log:
TIKA-789 Improve MPP detection based on info from Alex Ott
Modified:
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1206212&r1=1206211&r2=1206212&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Fri Nov 25 15:24:09 2011
@@ -1327,7 +1327,7 @@
<mime-type type="application/vnd.ms-project">
<glob pattern="*.mpp"/>
<glob pattern="*.mpt"/>
- <sub-class-of type="application/x-tika-ooxml"/>
+ <sub-class-of type="application/x-tika-msoffice"/>
</mime-type>
<mime-type type="application/vnd.ms-tnef">
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1206212&r1=1206211&r2=1206212&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Fri Nov 25 15:24:09 2011
@@ -72,9 +72,8 @@ public class POIFSContainerDetector impl
/** Microsoft Project */
public static final MediaType MPP = application("vnd.ms-project");
- /** Regexp for matching the MPP Project Properties stream */
+ /** Regexp for matching the MPP Project Data stream */
private static final Pattern mppDataMatch = Pattern.compile("\\s\\s\\s\\d+");
- private static final Pattern mppPropsMatch = Pattern.compile("Props\\d+");
public MediaType detect(InputStream input, Metadata metadata)
throws IOException {
@@ -142,16 +141,13 @@ public class POIFSContainerDetector impl
// of embedded non-office file inside an OLE2 document
// This is most commonly triggered on nested directories
return OLE;
- } else if (names.contains("\u0001CompObj")) {
+ } else if (names.contains("\u0001CompObj") &&
+ (names.contains("Props") || names.contains("Props9") || names.contains("Props12"))) {
// Could be Project, look for common name patterns
- boolean matchedProps = false;
- boolean matchedData = false;
for (String name : names) {
- if (mppDataMatch.matcher(name).matches()) matchedData = true;
- if (mppPropsMatch.matcher(name).matches()) matchedProps = true;
- }
- if (matchedProps && matchedData) {
- return MPP;
+ if (mppDataMatch.matcher(name).matches()) {
+ return MPP;
+ }
}
} else if (names.contains("\u0001Ole10Native")) {
return OLE;