You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2012/12/17 05:22:20 UTC

svn commit: r1422750 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/microsoft/ test/java/org/apache/tika/detect/ test/java/org/apache/tika/parser/microsoft/ test/resources/test-documents/

Author: nick
Date: Mon Dec 17 04:22:19 2012
New Revision: 1422750

URL: http://svn.apache.org/viewvc?rev=1422750&view=rev
Log:
TIKA-976 Excel95 files should be correctly detected, but as POI HSSF does not support them, they should not generate exceptions if you try to parse one

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_95.xls   (with props)
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=1422750&r1=1422749&r2=1422750&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Mon Dec 17 04:22:19 2012
@@ -94,6 +94,8 @@ public class ExcelExtractor extends Abst
      * records.
      */
     private boolean listenForAllRecords = false;
+    
+    private static final String WORKBOOK_ENTRY = "Workbook";
 
     public ExcelExtractor(ParseContext context) {
         super(context);
@@ -140,6 +142,11 @@ public class ExcelExtractor extends Abst
     protected void parse(
             DirectoryNode root, XHTMLContentHandler xhtml,
             Locale locale) throws IOException, SAXException, TikaException {
+        if (! root.hasEntry(WORKBOOK_ENTRY)) {
+           // Corrupt file / very old file, just skip
+           return;
+        }
+       
         TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale, this);
         listener.processFile(root, isListenForAllRecords());
         listener.throwStoredException();
@@ -286,7 +293,7 @@ public class ExcelExtractor extends Abst
             }
 
             // Create event factory and process Workbook (fire events)
-            DocumentInputStream documentInputStream = root.createDocumentInputStream("Workbook");
+            DocumentInputStream documentInputStream = root.createDocumentInputStream(WORKBOOK_ENTRY);
             HSSFEventFactory eventFactory = new HSSFEventFactory();
             try {
                 eventFactory.processEvents(hssfRequest, documentInputStream);

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1422750&r1=1422749&r2=1422750&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Mon Dec 17 04:22:19 2012
@@ -223,8 +223,11 @@ public class POIFSContainerDetector impl
                 // Works 7.0 spreadsheet files contain both
                 // we want to avoid classifying this as Excel
                 return XLR; 
-            } else if (names.contains("Workbook")) {
+            } else if (names.contains("Workbook") || names.contains("WORKBOOK")) {
                 return XLS;
+            } else if (names.contains("Book")) {
+               // Excel 95 or older, we won't be able to parse this....
+               return XLS;
             } else if (names.contains("EncryptedPackage") && 
                     names.contains("EncryptionInfo") &&
                     names.contains("\u0006DataSpaces")) {

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1422750&r1=1422749&r2=1422750&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Mon Dec 17 04:22:19 2012
@@ -82,6 +82,8 @@ public class TestContainerAwareDetector 
         assertTypeByData("testWORKSSpreadsheet7.0.xlr", "application/x-tika-msworks-spreadsheet");
         assertTypeByData("testPROJECT2003.mpp", "application/vnd.ms-project");
         assertTypeByData("testPROJECT2007.mpp", "application/vnd.ms-project");
+        // Excel95 can be detected but not parsed
+        assertTypeByData("testEXCEL_95.xls", "application/vnd.ms-excel");
 
         // Try some ones that POI doesn't handle, that are still OLE2 based
         assertTypeByData("testCOREL.shw", "application/x-corelpresentations");

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java?rev=1422750&r1=1422749&r2=1422750&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java Mon Dec 17 04:22:19 2012
@@ -265,6 +265,50 @@ public class ExcelParserTest extends Tes
           input.close();
        }
     }
+
+    /**
+     * We don't currently support the old Excel 95 .xls file format, 
+     *  but we shouldn't break on these files either (TIKA-976)  
+     */
+    public void testExcel95() throws Exception {
+       Detector detector = new DefaultDetector();
+       AutoDetectParser parser = new AutoDetectParser();
+       
+       InputStream input = ExcelParserTest.class.getResourceAsStream(
+             "/test-documents/testEXCEL_95.xls");
+       Metadata m = new Metadata();
+       m.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");
+       
+       // Should be detected correctly
+       MediaType type = null;
+       try {
+          type = detector.detect(input, m);
+          assertEquals("application/vnd.ms-excel", type.toString());
+       } finally {
+          input.close();
+       }
+       
+       // OfficeParser will claim to handle it
+       assertEquals(true, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
+       
+       // OOXMLParser won't handle it
+       assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
+       
+       // AutoDetectParser doesn't break on it
+       input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls");
+
+       try {
+          ContentHandler handler = new BodyContentHandler(-1);
+          ParseContext context = new ParseContext();
+          context.set(Locale.class, Locale.US);
+          parser.parse(input, handler, m, context);
+
+          String content = handler.toString();
+          assertEquals("", content);
+       } finally {
+          input.close();
+       }
+    }
     
     /**
      * Ensures that custom OLE2 (HPSF) properties are extracted

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_95.xls
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_95.xls?rev=1422750&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_95.xls
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream