You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2012/12/17 05:22:20 UTC
svn commit: r1422750 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/microsoft/ test/java/org/apache/tika/detect/
test/java/org/apache/tika/parser/microsoft/ test/resources/test-documents/
Author: nick
Date: Mon Dec 17 04:22:19 2012
New Revision: 1422750
URL: http://svn.apache.org/viewvc?rev=1422750&view=rev
Log:
TIKA-976 Excel95 files should be correctly detected, but as POI HSSF does not support them, trying to parse one should not generate an exception
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_95.xls (with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=1422750&r1=1422749&r2=1422750&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Mon Dec 17 04:22:19 2012
@@ -94,6 +94,8 @@ public class ExcelExtractor extends Abst
* records.
*/
private boolean listenForAllRecords = false;
+
+ private static final String WORKBOOK_ENTRY = "Workbook";
public ExcelExtractor(ParseContext context) {
super(context);
@@ -140,6 +142,11 @@ public class ExcelExtractor extends Abst
protected void parse(
DirectoryNode root, XHTMLContentHandler xhtml,
Locale locale) throws IOException, SAXException, TikaException {
+ if (! root.hasEntry(WORKBOOK_ENTRY)) {
+ // Corrupt file / very old file, just skip
+ return;
+ }
+
TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale, this);
listener.processFile(root, isListenForAllRecords());
listener.throwStoredException();
@@ -286,7 +293,7 @@ public class ExcelExtractor extends Abst
}
// Create event factory and process Workbook (fire events)
- DocumentInputStream documentInputStream = root.createDocumentInputStream("Workbook");
+ DocumentInputStream documentInputStream = root.createDocumentInputStream(WORKBOOK_ENTRY);
HSSFEventFactory eventFactory = new HSSFEventFactory();
try {
eventFactory.processEvents(hssfRequest, documentInputStream);
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1422750&r1=1422749&r2=1422750&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Mon Dec 17 04:22:19 2012
@@ -223,8 +223,11 @@ public class POIFSContainerDetector impl
// Works 7.0 spreadsheet files contain both
// we want to avoid classifying this as Excel
return XLR;
- } else if (names.contains("Workbook")) {
+ } else if (names.contains("Workbook") || names.contains("WORKBOOK")) {
return XLS;
+ } else if (names.contains("Book")) {
+ // Excel 95 or older, we won't be able to parse this....
+ return XLS;
} else if (names.contains("EncryptedPackage") &&
names.contains("EncryptionInfo") &&
names.contains("\u0006DataSpaces")) {
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1422750&r1=1422749&r2=1422750&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Mon Dec 17 04:22:19 2012
@@ -82,6 +82,8 @@ public class TestContainerAwareDetector
assertTypeByData("testWORKSSpreadsheet7.0.xlr", "application/x-tika-msworks-spreadsheet");
assertTypeByData("testPROJECT2003.mpp", "application/vnd.ms-project");
assertTypeByData("testPROJECT2007.mpp", "application/vnd.ms-project");
+ // Excel95 can be detected but not parsed
+ assertTypeByData("testEXCEL_95.xls", "application/vnd.ms-excel");
// Try some ones that POI doesn't handle, that are still OLE2 based
assertTypeByData("testCOREL.shw", "application/x-corelpresentations");
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java?rev=1422750&r1=1422749&r2=1422750&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java Mon Dec 17 04:22:19 2012
@@ -265,6 +265,50 @@ public class ExcelParserTest extends Tes
input.close();
}
}
+
+ /**
+ * We don't currently support the old Excel 95 .xls file format,
+ * but we shouldn't break on these files either (TIKA-976)
+ */
+ public void testExcel95() throws Exception {
+ Detector detector = new DefaultDetector();
+ AutoDetectParser parser = new AutoDetectParser();
+
+ InputStream input = ExcelParserTest.class.getResourceAsStream(
+ "/test-documents/testEXCEL_95.xls");
+ Metadata m = new Metadata();
+ m.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");
+
+ // Should be detected correctly
+ MediaType type = null;
+ try {
+ type = detector.detect(input, m);
+ assertEquals("application/vnd.ms-excel", type.toString());
+ } finally {
+ input.close();
+ }
+
+ // OfficeParser will claim to handle it
+ assertEquals(true, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
+
+ // OOXMLParser won't handle it
+ assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
+
+ // AutoDetectParser doesn't break on it
+ input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls");
+
+ try {
+ ContentHandler handler = new BodyContentHandler(-1);
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+ parser.parse(input, handler, m, context);
+
+ String content = handler.toString();
+ assertEquals("", content);
+ } finally {
+ input.close();
+ }
+ }
/**
* Ensures that custom OLE2 (HPSF) properties are extracted
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_95.xls
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_95.xls?rev=1422750&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_95.xls
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream