You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/11/15 10:41:47 UTC

svn commit: r1202109 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java test/java/org/apache/tika/detect/TestContainerAwareDetector.java test/resources/test-documents/testWORKS2000.wps

Author: nick
Date: Tue Nov 15 09:41:46 2011
New Revision: 1202109

URL: http://svn.apache.org/viewvc?rev=1202109&view=rev
Log:
TIKA-779: Add container-aware detection for Works 2000 files, plus a test

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testWORKS2000.wps   (with props)
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1202109&r1=1202108&r2=1202109&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Tue Nov 15 09:41:46 2011
@@ -124,10 +124,14 @@ public class POIFSContainerDetector impl
             } else if (names.contains("VisioDocument")) {
                 return VSD;
             } else if (names.contains("CONTENTS") && names.contains("SPELLING")) {
+               // Newer Works files
+               return WPS;
+            } else if (names.contains("CONTENTS") && names.contains("\u0001CompObj")) {
+               // Normally an older Works file
                return WPS;
             } else if (names.contains("CONTENTS")) {
-               // CONTENTS without SPELLING normally means some sort of
-               //  embedded non-office file inside an OLE2 document
+               // CONTENTS without SPELLING nor CompObj normally means some sort
+               //  of embedded non-office file inside an OLE2 document
                // This is most commonly triggered on nested directories
                return OLE;
             } else if (names.contains("\u0001Ole10Native")) {

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1202109&r1=1202108&r2=1202109&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Tue Nov 15 09:41:46 2011
@@ -56,6 +56,7 @@ public class TestContainerAwareDetector 
 
         // Try some ones that POI doesn't handle, that are still OLE2 based
         assertDetect("testWORKS.wps", "application/vnd.ms-works");
+        assertDetect("testWORKS2000.wps", "application/vnd.ms-works");
         assertDetect("testCOREL.shw", "application/x-corelpresentations");
         assertDetect("testQUATTRO.qpw", "application/x-quattro-pro");
         assertDetect("testQUATTRO.wb3", "application/x-quattro-pro");

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORKS2000.wps
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORKS2000.wps?rev=1202109&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORKS2000.wps
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream