You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/04/19 14:11:42 UTC
[tika] 02/02: TIKA-1195 and TIKA-2329
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 67612b8f805ad5d1085db14922d3b3b6ddce19bf
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Apr 19 10:11:29 2017 -0400
TIKA-1195 and TIKA-2329
---
CHANGES.txt | 14 +-
tika-parsers/pom.xml | 2 +-
.../microsoft/ooxml/OOXMLExtractorFactory.java | 5 +-
.../tika/parser/microsoft/ooxml/OOXMLParser.java | 9 +-
.../ooxml/XSSFBExcelExtractorDecorator.java | 282 +++++++++++++++++++++
.../ooxml/XSSFExcelExtractorDecorator.java | 10 +-
.../tika/parser/microsoft/ExcelParserTest.java | 38 ---
.../parser/microsoft/ooxml/OOXMLParserTest.java | 61 +++--
8 files changed, 347 insertions(+), 74 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 1fe98a7..610c186 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,12 @@
Release 1.15 - ??
+ * Change default behavior to parse embedded documents even if the user
+ forgets to specify a Parser.class in the ParseContext (TIKA-2096).
+ Users who wish to parse only the container document should set
+ an EmptyParser as the Parser.class in the ParseContext.
+
+ * Add support for the XLSB format (TIKA-1195).
+
* Change default behavior of Office Parsers to _not_ extract
Macros. User needs to setExtractMacros to "true" (TIKA-2302).
@@ -64,14 +71,9 @@ Release 1.15 - ??
* Added experimental SAX parser for .docx files. To select this parser,
set useSAXDocxExtractor(true) on OfficeParserConfig (TIKA-1321, TIKA-2191).
- * Change default behavior to parse embedded documents even if the user
- forgets to specify a Parser.class in the ParseContext (TIKA-2096).
- Users who wish to parse only the container document should set
- an EmptyParser as the Parser.class in the ParseContext.
-
* Add mime detection and parser for Word 2006ML format (TIKA-2179).
- * Upgrade to POI 3.16-beta2 (TIKA-2116, TIKA-2181).
+ * Upgrade to POI 3.16 (TIKA-2116, TIKA-2181, TIKA-2329).
* Allow configuration of timeout for ForkParser (TIKA-2170).
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index e4d04ca..58ac745 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -35,7 +35,7 @@
<url>http://tika.apache.org/</url>
<properties>
- <poi.version>3.16-beta2</poi.version>
+ <poi.version>3.16</poi.version>
<!-- NOTE: sync codec version with POI -->
<codec.version>1.10</codec.version>
<!-- NOTE: sync tukaani version with commons-compress in tika-parent-->
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 86d74df..92963a8 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -32,6 +32,7 @@ import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFRelation;
+import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
@@ -104,8 +105,10 @@ public class OOXMLExtractorFactory {
}
POIXMLDocument document = poiExtractor.getDocument();
+ if (poiExtractor instanceof XSSFBEventBasedExcelExtractor) {
+ extractor = new XSSFBExcelExtractorDecorator(context, poiExtractor, locale);
- if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
+ } else if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
extractor = new XSSFExcelExtractorDecorator(
context, poiExtractor, locale);
} else if (poiExtractor instanceof XWPFEventBasedWordExtractor) {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
index 10af01c..fbc0f93 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
@@ -74,6 +74,8 @@ public class OOXMLParser extends AbstractOfficeParser {
MediaType.application("vnd.ms-visio.drawing"),
MediaType.application("vnd.ms-xpsdocument"),
MediaType.parse("model/vnd.dwfx+xps")
+ // MediaType.application("x-tika-ooxml")
+
)));
/**
* We claim to support all OOXML files, but we actually don't support a small
@@ -82,10 +84,9 @@ public class OOXMLParser extends AbstractOfficeParser {
* by Tika and/or POI.
*/
protected static final Set<MediaType> UNSUPPORTED_OOXML_TYPES =
- Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- MediaType.application("vnd.ms-xpsdocument"),
- MediaType.application("vnd.ms-excel.sheet.binary.macroenabled.12")
- )));
+ Collections.singleton(
+ MediaType.application("vnd.ms-xpsdocument")
+ );
/**
* Serial version UID
*/
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
new file mode 100644
index 0000000..374fcb6
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
@@ -0,0 +1,282 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackagePartName;
+import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackagingURIHelper;
+import org.apache.poi.openxml4j.opc.TargetMode;
+import org.apache.poi.xssf.binary.XSSFBCommentsTable;
+import org.apache.poi.xssf.binary.XSSFBSharedStringsTable;
+import org.apache.poi.xssf.binary.XSSFBSheetHandler;
+import org.apache.poi.xssf.binary.XSSFBStylesTable;
+import org.apache.poi.xssf.eventusermodel.XSSFBReader;
+import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
+import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
+import org.apache.poi.xssf.usermodel.XSSFRelation;
+import org.apache.poi.xssf.usermodel.XSSFShape;
+import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.xmlbeans.XmlException;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTHyperlink;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTNonVisualDrawingProps;
+import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShape;
+import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShapeNonVisual;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class XSSFBExcelExtractorDecorator extends XSSFExcelExtractorDecorator {
+
+ /**
+  * Builds the xlsb decorator by handing all three arguments, unchanged,
+  * to the {@link XSSFExcelExtractorDecorator} constructor.
+  */
+ public XSSFBExcelExtractorDecorator(ParseContext context,
+                                     POIXMLTextExtractor extractor,
+                                     Locale locale) {
+     super(context, extractor, locale);
+ }
+
+ @Override
+ protected void configureExtractor(POIXMLTextExtractor extractor, Locale locale) {
+ //Overrides the parent's configuration step so that only the locale is set:
+ //setFormulasNotResults is not yet available for xlsb.
+ //Call to restore once it is: ((XSSFBEventBasedExcelExtractor)extractor).setFormulasNotResults(false);
+ ((XSSFBEventBasedExcelExtractor)extractor).setLocale(locale);
+ }
+
+ /**
+  * Records the metadata and parse context on this instance, sets
+  * {@link TikaMetadataKeys#PROTECTED} to "false", and then lets the
+  * parent class produce the XHTML.
+  */
+ @Override
+ public void getXHTML(ContentHandler handler, Metadata metadata, ParseContext context)
+         throws SAXException, XmlException, IOException, TikaException {
+     this.metadata = metadata;
+     this.parseContext = context;
+
+     metadata.set(TikaMetadataKeys.PROTECTED, "false");
+     super.getXHTML(handler, metadata, context);
+ }
+
+ /**
+  * Writes every sheet of the xlsb workbook to {@code xhtml}. Each sheet becomes
+  * a {@code div} holding, in order: the sheet name ({@code h1}), the cell
+  * contents ({@code table}), any headers and footers, the text of the sheet's
+  * shapes and, last, the sheet's hyperlinks.
+  *
+  * @param xhtml handler that receives the generated XHTML
+  * @throws SAXException if the handler rejects an event
+  * @throws XmlException if POI reports an OpenXML4J error while the workbook
+  *                      reader, styles or shared strings are being set up
+  * @throws IOException if a package part cannot be read or a sheet stream
+  *                     cannot be closed
+  * @see org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor#getText()
+  */
+ @Override
+ protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
+         XmlException, IOException {
+     OPCPackage container = extractor.getPackage();
+
+     XSSFBSharedStringsTable strings;
+     XSSFBReader.SheetIterator iter;
+     XSSFBReader xssfReader;
+     XSSFBStylesTable styles;
+     try {
+         xssfReader = new XSSFBReader(container);
+         styles = xssfReader.getXSSFBStylesTable();
+         iter = (XSSFBReader.SheetIterator) xssfReader.getSheetsData();
+         strings = new XSSFBSharedStringsTable(container);
+     } catch (OpenXML4JException e) {
+         // InvalidFormatException is an OpenXML4JException, so this single
+         // handler covers both of the cases that used to be caught separately
+         throw new XmlException(e);
+     }
+
+     while (iter.hasNext()) {
+         // try-with-resources: the sheet stream was previously never closed
+         try (InputStream stream = iter.next()) {
+             PackagePart sheetPart = iter.getSheetPart();
+             addDrawingHyperLinks(sheetPart);
+             sheetParts.add(sheetPart);
+
+             SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(xhtml);
+             XSSFBCommentsTable comments = iter.getXSSFBSheetComments();
+
+             // Start, and output the sheet name
+             xhtml.startElement("div");
+             xhtml.element("h1", iter.getSheetName());
+
+             // Extract the main sheet contents
+             xhtml.startElement("table");
+             xhtml.startElement("tbody");
+
+             processSheet(sheetExtractor, comments, styles, strings, stream);
+
+             xhtml.endElement("tbody");
+             xhtml.endElement("table");
+
+             // Output any headers and footers
+             // (Need to process the sheet to get them, so we can't
+             // do the headers before the contents)
+             for (String header : sheetExtractor.headers) {
+                 extractHeaderFooter(header, xhtml);
+             }
+             for (String footer : sheetExtractor.footers) {
+                 extractHeaderFooter(footer, xhtml);
+             }
+             List<XSSFShape> shapes = iter.getShapes();
+             processShapes(shapes, xhtml);
+
+             //for now dump sheet hyperlinks at bottom of page
+             //consider a double-pass of the inputstream to reunite hyperlinks with cells/textboxes
+             //step 1: extract hyperlink info from bottom of page
+             //step 2: process as we do now, but with cached hyperlink relationship info
+             extractHyperLinks(sheetPart, xhtml);
+             // All done with this sheet
+             xhtml.endElement("div");
+         }
+     }
+ }
+
+ /**
+  * Emits a header or footer string as a {@code p} element; an empty
+  * string produces no output.
+  */
+ @Override
+ protected void extractHeaderFooter(String hf, XHTMLContentHandler xhtml)
+         throws SAXException {
+     if (hf.isEmpty()) {
+         return;
+     }
+     xhtml.element("p", hf);
+ }
+
+ /**
+  * Writes one {@code a} element per sheet-hyperlink relationship of
+  * {@code sheetPart}; the relationship's target URI is used both as the
+  * {@code href} value and as the link text.
+  */
+ private void extractHyperLinks(PackagePart sheetPart, XHTMLContentHandler xhtml) throws SAXException {
+     try {
+         String hyperlinkType = XSSFRelation.SHEET_HYPERLINKS.getRelation();
+         for (PackageRelationship rel : sheetPart.getRelationshipsByType(hyperlinkType)) {
+             String target = rel.getTargetURI().toString();
+             xhtml.startElement("a", "href", target);
+             xhtml.characters(target);
+             xhtml.endElement("a");
+         }
+     } catch (InvalidFormatException e) {
+         //swallow: deliberately ignored, the exception is not reported or rethrown
+     }
+ }
+
+ /**
+  * For every {@link XSSFSimpleShape} in {@code shapes}, writes its non-empty
+  * text as a {@code p} element and then any hyperlinks attached to the shape.
+  * Other shape types are skipped; a {@code null} list is a no-op.
+  */
+ private void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml) throws SAXException {
+     if (shapes == null) {
+         return;
+     }
+     for (XSSFShape shape : shapes) {
+         if (!(shape instanceof XSSFSimpleShape)) {
+             continue;
+         }
+         XSSFSimpleShape simpleShape = (XSSFSimpleShape) shape;
+         String shapeText = simpleShape.getText();
+         if (shapeText != null && !shapeText.isEmpty()) {
+             xhtml.element("p", shapeText);
+         }
+         extractHyperLinksFromShape(simpleShape.getCTShape(), xhtml);
+     }
+ }
+
+ /**
+  * Writes the click and hover hyperlinks of a shape, when present, as
+  * {@code a} elements. The two links are handled independently, so a shape
+  * that has only a hover link still gets it written (previously a missing
+  * click link ended processing before the hover link was looked at).
+  *
+  * @param ctShape shape to inspect; {@code null}, or a shape without
+  *                non-visual drawing properties, produces no output
+  * @param xhtml   handler that receives the links
+  * @throws SAXException if the handler rejects an event
+  */
+ private void extractHyperLinksFromShape(CTShape ctShape, XHTMLContentHandler xhtml) throws SAXException {
+     if (ctShape == null) {
+         return;
+     }
+
+     CTShapeNonVisual nvSpPR = ctShape.getNvSpPr();
+     if (nvSpPR == null) {
+         return;
+     }
+
+     CTNonVisualDrawingProps cNvPr = nvSpPR.getCNvPr();
+     if (cNvPr == null) {
+         return;
+     }
+
+     // Same output order as before: click link first, then hover link
+     writeShapeHyperlink(cNvPr.getHlinkClick(), xhtml);
+     writeShapeHyperlink(cNvPr.getHlinkHover(), xhtml);
+ }
+
+ /**
+  * Looks up the hyperlink's id in {@code drawingHyperlinks} and, if a URL is
+  * registered for it, writes the URL as an {@code a} element whose text is
+  * the URL itself. A {@code null} hyperlink or an unknown id writes nothing.
+  */
+ private void writeShapeHyperlink(CTHyperlink ctHyperlink, XHTMLContentHandler xhtml) throws SAXException {
+     if (ctHyperlink == null) {
+         return;
+     }
+     String url = drawingHyperlinks.get(ctHyperlink.getId());
+     if (url != null) {
+         xhtml.startElement("a", "href", url);
+         xhtml.characters(url);
+         xhtml.endElement("a");
+     }
+ }
+
+ /**
+  * Parses one sheet stream with POI's {@link XSSFBSheetHandler}, passing it
+  * the given comments, styles and shared strings together with this
+  * decorator's {@code formatter} and {@code sheetContentsExtractor}.
+  */
+ private void processSheet(
+         SheetContentsHandler sheetContentsExtractor,
+         XSSFBCommentsTable comments,
+         XSSFBStylesTable styles,
+         XSSFBSharedStringsTable strings,
+         InputStream sheetInputStream)
+         throws IOException, SAXException {
+
+     // NOTE(review): the trailing 'false' is presumably formulasNotResults;
+     // confirm against the XSSFBSheetHandler constructor
+     new XSSFBSheetHandler(sheetInputStream, styles, comments, strings,
+             sheetContentsExtractor, formatter, false).parse();
+ }
+
+ /**
+  * Collects the parts that make up the main document: each sheet part seen
+  * so far, followed by the internal targets of its drawing and then its VML
+  * drawing relationships, and finally the package's office-document parts.
+  *
+  * @throws TikaException if a relationship of a sheet is malformed
+  */
+ @Override
+ protected List<PackagePart> getMainDocumentParts() throws TikaException {
+     // Order matters: drawings are listed before VML drawings for each sheet
+     XSSFRelation[] drawingRelations = {XSSFRelation.DRAWINGS, XSSFRelation.VML_DRAWINGS};
+
+     List<PackagePart> parts = new ArrayList<PackagePart>();
+     for (PackagePart sheet : sheetParts) {
+         // The sheet itself comes first
+         parts.add(sheet);
+
+         // Then whatever drawings it points at inside the package
+         try {
+             for (XSSFRelation relation : drawingRelations) {
+                 for (PackageRelationship rel : sheet.getRelationshipsByType(relation.getRelation())) {
+                     if (rel.getTargetMode() != TargetMode.INTERNAL) {
+                         continue;
+                     }
+                     PackagePartName targetName = PackagingURIHelper.createPartName(rel.getTargetURI());
+                     parts.add(rel.getPackage().getPart(targetName));
+                 }
+             }
+         } catch (InvalidFormatException e) {
+             throw new TikaException("Broken OOXML file", e);
+         }
+     }
+
+     //add main document so that macros can be extracted
+     //by AbstractOOXMLExtractor
+     parts.addAll(extractor.getPackage().getPartsByRelationshipType(RELATION_OFFICE_DOCUMENT));
+
+     return parts;
+ }
+}
\ No newline at end of file
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index a8bee1e..dbf21d1 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -71,7 +71,6 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
* Allows access to headers/footers from raw xml strings
*/
protected static HeaderFooterHelper hfHelper = new HeaderFooterHelper();
- private final XSSFEventBasedExcelExtractor extractor;
protected final DataFormatter formatter;
protected final List<PackagePart> sheetParts = new ArrayList<PackagePart>();
protected final Map<String, String> drawingHyperlinks = new HashMap<>();
@@ -84,9 +83,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
this.parseContext = context;
this.extractor = (XSSFEventBasedExcelExtractor)extractor;
- // not yet supported in POI-3.16-beta3
- // this.extractor.setFormulasNotResults(false);
- this.extractor.setLocale(locale);
+ configureExtractor(this.extractor, locale);
if (locale == null) {
formatter = new TikaExcelDataFormatter();
@@ -95,6 +92,11 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
}
}
+ /**
+  * Applies the xlsx extractor settings: formulas-not-results is switched
+  * off and the locale is set. Subclasses may override this to configure a
+  * different extractor type.
+  */
+ protected void configureExtractor(POIXMLTextExtractor extractor, Locale locale) {
+     XSSFEventBasedExcelExtractor xssfExtractor = (XSSFEventBasedExcelExtractor) extractor;
+     xssfExtractor.setFormulasNotResults(false);
+     xssfExtractor.setLocale(locale);
+ }
+
@Override
public void getXHTML(
ContentHandler handler, Metadata metadata, ParseContext context)
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index cea5e9f..fc31958 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -267,44 +267,6 @@ public class ExcelParserTest extends TikaTest {
}
}
- /**
- * We don't currently support the .xlsb file format
- * (an OOXML container with binary blobs), but we
- * shouldn't break on these files either (TIKA-826)
- */
- @Test
- public void testExcelXLSB() throws Exception {
- Detector detector = new DefaultDetector();
- AutoDetectParser parser = new AutoDetectParser();
-
- Metadata m = new Metadata();
- m.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");
-
- // Should be detected correctly
- MediaType type;
- try (InputStream input = ExcelParserTest.class.getResourceAsStream(
- "/test-documents/testEXCEL.xlsb")) {
- type = detector.detect(input, m);
- assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString());
- }
-
- // OfficeParser won't handle it
- assertEquals(false, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
-
- // OOXMLParser will (soon) handle it
- assertTrue((new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
-
- // AutoDetectParser doesn't break on it
- try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb")) {
- ContentHandler handler = new BodyContentHandler(-1);
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- parser.parse(input, handler, m, context);
-
- String content = handler.toString();
- assertEquals("", content);
- }
- }
/**
* Excel 5 and 95 are older formats, and only get basic support
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index af1ba27..6420545 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -27,7 +27,6 @@ import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import java.io.ByteArrayOutputStream;
-import java.io.EOFException;
import java.io.File;
import java.io.InputStream;
import java.io.PrintStream;
@@ -43,8 +42,9 @@ import java.util.Map;
import org.apache.poi.util.LocaleUtil;
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.detect.Detector;
import org.apache.tika.exception.EncryptedDocumentException;
-import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
@@ -52,19 +52,21 @@ import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.parser.microsoft.ExcelParserTest;
+import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.microsoft.WordParserTest;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Ignore;
import org.junit.Test;
import org.xml.sax.ContentHandler;
-import org.xml.sax.helpers.DefaultHandler;
public class OOXMLParserTest extends TikaTest {
@@ -1430,10 +1432,43 @@ public class OOXMLParserTest extends TikaTest {
}
@Test
- @Ignore("until poi-3.16-beta3")
+ public void testExcelXLSB() throws Exception {
+ Detector detector = new DefaultDetector();
+ AutoDetectParser parser = new AutoDetectParser();
+
+ Metadata m = new Metadata();
+ m.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");
+
+ // The detector should report the xlsb media type
+ MediaType type;
+ try (InputStream input = ExcelParserTest.class.getResourceAsStream(
+ "/test-documents/testEXCEL.xlsb")) {
+ type = detector.detect(input, m);
+ assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString());
+ }
+
+ // OfficeParser does not list this type as supported
+ assertEquals(false, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
+
+ // OOXMLParser lists this type as supported
+ assertTrue((new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
+
+ // AutoDetectParser parses it and the body text contains the sheet content
+ try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb")) {
+ ContentHandler handler = new BodyContentHandler(-1);
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+ parser.parse(input, handler, m, context);
+
+ String content = handler.toString();
+ assertContains("This is an example spreadsheet", content);
+ }
+ }
+
+ @Test
public void testXLSBVarious() throws Exception {
- //make sure to turn MACROs on, after we turn them off by default
OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setExtractMacros(true);
ParseContext parseContext = new ParseContext();
parseContext.set(OfficeParserConfig.class, officeParserConfig);
List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_various.xlsb", parseContext);
@@ -1473,22 +1508,8 @@ public class OOXMLParserTest extends TikaTest {
assertContains("OddLeftFooter OddCenterFooter OddRightFooter", xml);
assertContains("EvenLeftFooter EvenCenterFooter EvenRightFooter", xml);
assertContains("FirstPageLeftFooter FirstPageCenterFooter FirstPageRightFooter", xml);
- }
- @Test
- public void testTruncated() throws Exception {
- Parser p = new AutoDetectParser();
- ContentHandler handler = new DefaultHandler();
- Metadata metadata = new Metadata();
- ParseContext parseContext = new ParseContext();
- try (InputStream is = getTestDocument("testWORD_truncated.docx")) {
- p.parse(is, handler, metadata, parseContext);
- fail("should have thrown an EOF exception?!");
- } catch (TikaException e) {
- Throwable cause = e.getCause();
- assertTrue(cause instanceof EOFException);
- assertEquals("application/x-tika-ooxml", metadata.get(Metadata.CONTENT_TYPE));
- }
+
}
}
--
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.