You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/04/19 14:11:40 UTC

[tika] branch master updated (3aab15f -> 67612b8)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git.

      from  3aab15f   TIKA-2311 -- maintain mime information for truncated ooxml
       new  a970303   mock parser's uninterruptible sleep can happen to pause for exactly 3000 millis
       new  67612b8   TIKA-1195 and TIKA-2329

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "adds" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CHANGES.txt                                        |  14 +-
 tika-parsers/pom.xml                               |   2 +-
 .../microsoft/ooxml/OOXMLExtractorFactory.java     |   5 +-
 .../tika/parser/microsoft/ooxml/OOXMLParser.java   |   9 +-
 .../ooxml/XSSFBExcelExtractorDecorator.java        | 282 +++++++++++++++++++++
 .../ooxml/XSSFExcelExtractorDecorator.java         |  10 +-
 .../tika/parser/microsoft/ExcelParserTest.java     |  38 ---
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |  61 +++--
 .../apache/tika/parser/mock/MockParserTest.java    |   2 +-
 9 files changed, 348 insertions(+), 75 deletions(-)
 create mode 100644 tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java

-- 
To stop receiving notification emails like this one, please contact
['"commits@tika.apache.org" <co...@tika.apache.org>'].

[tika] 02/02: TIKA-1195 and TIKA-2329

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 67612b8f805ad5d1085db14922d3b3b6ddce19bf
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Apr 19 10:11:29 2017 -0400

    TIKA-1195 and TIKA-2329
---
 CHANGES.txt                                        |  14 +-
 tika-parsers/pom.xml                               |   2 +-
 .../microsoft/ooxml/OOXMLExtractorFactory.java     |   5 +-
 .../tika/parser/microsoft/ooxml/OOXMLParser.java   |   9 +-
 .../ooxml/XSSFBExcelExtractorDecorator.java        | 282 +++++++++++++++++++++
 .../ooxml/XSSFExcelExtractorDecorator.java         |  10 +-
 .../tika/parser/microsoft/ExcelParserTest.java     |  38 ---
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |  61 +++--
 8 files changed, 347 insertions(+), 74 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 1fe98a7..610c186 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,12 @@
 Release 1.15 - ??
 
+  * Change default behavior to parse embedded documents even if the user
+    forgets to specify a Parser.class in the ParseContext (TIKA-2096).
+    Users who wish to parse only the container document should set
+    an EmptyParser as the Parser.class in the ParseContext.
+
+  * Add support for the XLSB format (TIKA-1195).
+
   * Change default behavior of Office Parsers to _not_ extract
     Macros.  User needs to setExtractMacros to "true" (TIKA-2302).
 
@@ -64,14 +71,9 @@ Release 1.15 - ??
   * Added experimental SAX parser for .docx files. To select this parser,
     set useSAXDocxExtractor(true) on OfficeParserConfig (TIKA-1321, TIKA-2191).
 
-  * Change default behavior to parse embedded documents even if the user
-    forgets to specify a Parser.class in the ParseContext (TIKA-2096).
-    Users who wish to parse only the container document should set
-    an EmptyParser as the Parser.class in the ParseContext.
-
   * Add mime detection and parser for Word 2006ML format (TIKA-2179).
 
-  * Upgrade to POI 3.16-beta2 (TIKA-2116, TIKA-2181).
+  * Upgrade to POI 3.16 (TIKA-2116, TIKA-2181, TIKA-2329).
 
   * Allow configuration of timeout for ForkParser (TIKA-2170).
 
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index e4d04ca..58ac745 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -35,7 +35,7 @@
   <url>http://tika.apache.org/</url>
 
   <properties>
-    <poi.version>3.16-beta2</poi.version>
+    <poi.version>3.16</poi.version>
     <!-- NOTE: sync codec version with POI -->
     <codec.version>1.10</codec.version>
     <!-- NOTE: sync tukaani version with commons-compress in tika-parent-->
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 86d74df..92963a8 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -32,6 +32,7 @@ import org.apache.poi.openxml4j.opc.PackagePart;
 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
 import org.apache.poi.xslf.usermodel.XMLSlideShow;
 import org.apache.poi.xslf.usermodel.XSLFRelation;
+import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
 import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
 import org.apache.poi.xwpf.usermodel.XWPFDocument;
@@ -104,8 +105,10 @@ public class OOXMLExtractorFactory {
             }
 
             POIXMLDocument document = poiExtractor.getDocument();
+            if (poiExtractor instanceof XSSFBEventBasedExcelExtractor) {
+                extractor = new XSSFBExcelExtractorDecorator(context, poiExtractor, locale);
 
-            if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
+            } else if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
                 extractor = new XSSFExcelExtractorDecorator(
                         context, poiExtractor, locale);
             } else if (poiExtractor instanceof XWPFEventBasedWordExtractor) {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
index 10af01c..fbc0f93 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
@@ -74,6 +74,8 @@ public class OOXMLParser extends AbstractOfficeParser {
                     MediaType.application("vnd.ms-visio.drawing"),
                     MediaType.application("vnd.ms-xpsdocument"),
                     MediaType.parse("model/vnd.dwfx+xps")
+                          //  MediaType.application("x-tika-ooxml")
+
                     )));
     /**
      * We claim to support all OOXML files, but we actually don't support a small
@@ -82,10 +84,9 @@ public class OOXMLParser extends AbstractOfficeParser {
      * by Tika and/or POI.
      */
     protected static final Set<MediaType> UNSUPPORTED_OOXML_TYPES =
-            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
-                    MediaType.application("vnd.ms-xpsdocument"),
-                    MediaType.application("vnd.ms-excel.sheet.binary.macroenabled.12")
-            )));
+            Collections.singleton(
+                    MediaType.application("vnd.ms-xpsdocument")
+            );
     /**
      * Serial version UID
      */
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
new file mode 100644
index 0000000..374fcb6
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
@@ -0,0 +1,282 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackagePartName;
+import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackagingURIHelper;
+import org.apache.poi.openxml4j.opc.TargetMode;
+import org.apache.poi.xssf.binary.XSSFBCommentsTable;
+import org.apache.poi.xssf.binary.XSSFBSharedStringsTable;
+import org.apache.poi.xssf.binary.XSSFBSheetHandler;
+import org.apache.poi.xssf.binary.XSSFBStylesTable;
+import org.apache.poi.xssf.eventusermodel.XSSFBReader;
+import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
+import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
+import org.apache.poi.xssf.usermodel.XSSFRelation;
+import org.apache.poi.xssf.usermodel.XSSFShape;
+import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.xmlbeans.XmlException;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTHyperlink;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTNonVisualDrawingProps;
+import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShape;
+import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShapeNonVisual;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class XSSFBExcelExtractorDecorator extends XSSFExcelExtractorDecorator {
+
+    public XSSFBExcelExtractorDecorator(
+            ParseContext context, POIXMLTextExtractor extractor, Locale locale) {
+        super(context, extractor, locale);
+    }
+
+    @Override
+    protected void configureExtractor(POIXMLTextExtractor extractor, Locale locale) {
+        //need to override this because setFormulasNotResults is not yet available
+        //for xlsb
+       //((XSSFBEventBasedExcelExtractor)extractor).setFormulasNotResults(false);
+        ((XSSFBEventBasedExcelExtractor)extractor).setLocale(locale);
+    }
+
+    @Override
+    public void getXHTML(
+            ContentHandler handler, Metadata metadata, ParseContext context)
+            throws SAXException, XmlException, IOException, TikaException {
+
+        this.metadata = metadata;
+        this.parseContext = context;
+        metadata.set(TikaMetadataKeys.PROTECTED, "false");
+
+        super.getXHTML(handler, metadata, context);
+    }
+
+    /**
+     * @see org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor#getText()
+     */
+    @Override
+    protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
+            XmlException, IOException {
+        OPCPackage container = extractor.getPackage();
+
+        XSSFBSharedStringsTable strings;
+        XSSFBReader.SheetIterator iter;
+        XSSFBReader xssfReader;
+        XSSFBStylesTable styles;
+        try {
+            xssfReader = new XSSFBReader(container);
+            styles = xssfReader.getXSSFBStylesTable();
+            iter = (XSSFBReader.SheetIterator) xssfReader.getSheetsData();
+            strings = new XSSFBSharedStringsTable(container);
+        } catch (InvalidFormatException e) {
+            throw new XmlException(e);
+        } catch (OpenXML4JException oe) {
+            throw new XmlException(oe);
+        }
+
+        while (iter.hasNext()) {
+            InputStream stream = iter.next();
+            PackagePart sheetPart = iter.getSheetPart();
+            addDrawingHyperLinks(sheetPart);
+            sheetParts.add(sheetPart);
+
+            SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(xhtml);
+            XSSFBCommentsTable comments = iter.getXSSFBSheetComments();
+
+            // Start, and output the sheet name
+            xhtml.startElement("div");
+            xhtml.element("h1", iter.getSheetName());
+
+            // Extract the main sheet contents
+            xhtml.startElement("table");
+            xhtml.startElement("tbody");
+
+            processSheet(sheetExtractor, comments, styles, strings, stream);
+
+            xhtml.endElement("tbody");
+            xhtml.endElement("table");
+
+            // Output any headers and footers
+            // (Need to process the sheet to get them, so we can't
+            //  do the headers before the contents)
+            for (String header : sheetExtractor.headers) {
+                extractHeaderFooter(header, xhtml);
+            }
+            for (String footer : sheetExtractor.footers) {
+                extractHeaderFooter(footer, xhtml);
+            }
+            List<XSSFShape> shapes = iter.getShapes();
+            processShapes(shapes, xhtml);
+
+            //for now dump sheet hyperlinks at bottom of page
+            //consider a double-pass of the inputstream to reunite hyperlinks with cells/textboxes
+            //step 1: extract hyperlink info from bottom of page
+            //step 2: process as we do now, but with cached hyperlink relationship info
+            extractHyperLinks(sheetPart, xhtml);
+            // All done with this sheet
+            xhtml.endElement("div");
+        }
+    }
+
+    @Override
+    protected void extractHeaderFooter(String hf, XHTMLContentHandler xhtml)
+            throws SAXException {
+        if (hf.length() > 0) {
+            xhtml.element("p", hf);
+        }
+    }
+
+    private void extractHyperLinks(PackagePart sheetPart, XHTMLContentHandler xhtml) throws SAXException {
+        try {
+            for (PackageRelationship rel : sheetPart.getRelationshipsByType(XSSFRelation.SHEET_HYPERLINKS.getRelation())) {
+                xhtml.startElement("a", "href", rel.getTargetURI().toString());
+                xhtml.characters(rel.getTargetURI().toString());
+                xhtml.endElement("a");
+            }
+        } catch (InvalidFormatException e) {
+            //swallow
+        }
+    }
+
+    private void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml) throws SAXException {
+        if (shapes == null) {
+            return;
+        }
+        for (XSSFShape shape : shapes) {
+            if (shape instanceof XSSFSimpleShape) {
+                String sText = ((XSSFSimpleShape) shape).getText();
+                if (sText != null && sText.length() > 0) {
+                    xhtml.element("p", sText);
+                }
+                extractHyperLinksFromShape(((XSSFSimpleShape)shape).getCTShape(), xhtml);
+            }
+        }
+    }
+
+    private void extractHyperLinksFromShape(CTShape ctShape, XHTMLContentHandler xhtml) throws SAXException {
+
+        if (ctShape == null)
+            return;
+
+        CTShapeNonVisual nvSpPR = ctShape.getNvSpPr();
+        if (nvSpPR == null)
+            return;
+
+        CTNonVisualDrawingProps cNvPr = nvSpPR.getCNvPr();
+        if (cNvPr == null)
+            return;
+
+        CTHyperlink ctHyperlink = cNvPr.getHlinkClick();
+        if (ctHyperlink == null)
+            return;
+
+        String url = drawingHyperlinks.get(ctHyperlink.getId());
+        if (url != null) {
+            xhtml.startElement("a", "href", url);
+            xhtml.characters(url);
+            xhtml.endElement("a");
+        }
+
+        CTHyperlink ctHoverHyperlink = cNvPr.getHlinkHover();
+        if (ctHoverHyperlink == null)
+            return;
+
+        url = drawingHyperlinks.get(ctHoverHyperlink.getId());
+        if (url != null) {
+            xhtml.startElement("a", "href", url);
+            xhtml.characters(url);
+            xhtml.endElement("a");
+        }
+
+    }
+
+    private void processSheet(
+            SheetContentsHandler sheetContentsExtractor,
+            XSSFBCommentsTable comments,
+            XSSFBStylesTable styles,
+            XSSFBSharedStringsTable strings,
+            InputStream sheetInputStream)
+            throws IOException, SAXException {
+
+        XSSFBSheetHandler xssfbSheetHandler = new XSSFBSheetHandler(
+                sheetInputStream,
+                styles,
+                comments,
+                strings,
+                sheetContentsExtractor,
+                formatter,
+                false
+        );
+        xssfbSheetHandler.parse();
+    }
+
+    /**
+     * In Excel files, sheets have things embedded in them,
+     * and sheet drawings which have the images
+     */
+    @Override
+    protected List<PackagePart> getMainDocumentParts() throws TikaException {
+        List<PackagePart> parts = new ArrayList<PackagePart>();
+        for (PackagePart part : sheetParts) {
+            // Add the sheet
+            parts.add(part);
+
+            // If it has drawings, return those too
+            try {
+                for (PackageRelationship rel : part.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) {
+                    if (rel.getTargetMode() == TargetMode.INTERNAL) {
+                        PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
+                        parts.add(rel.getPackage().getPart(relName));
+                    }
+                }
+                for (PackageRelationship rel : part.getRelationshipsByType(XSSFRelation.VML_DRAWINGS.getRelation())) {
+                    if (rel.getTargetMode() == TargetMode.INTERNAL) {
+                        PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
+                        parts.add(rel.getPackage().getPart(relName));
+                    }
+                }
+            } catch (InvalidFormatException e) {
+                throw new TikaException("Broken OOXML file", e);
+            }
+        }
+
+        //add main document so that macros can be extracted
+        //by AbstractOOXMLExtractor
+        for (PackagePart part : extractor.getPackage().
+                getPartsByRelationshipType(RELATION_OFFICE_DOCUMENT)) {
+            parts.add(part);
+        }
+
+        return parts;
+    }
+}
\ No newline at end of file
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index a8bee1e..dbf21d1 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -71,7 +71,6 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
      * Allows access to headers/footers from raw xml strings
      */
     protected static HeaderFooterHelper hfHelper = new HeaderFooterHelper();
-    private final XSSFEventBasedExcelExtractor extractor;
     protected final DataFormatter formatter;
     protected final List<PackagePart> sheetParts = new ArrayList<PackagePart>();
     protected final Map<String, String> drawingHyperlinks = new HashMap<>();
@@ -84,9 +83,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
 
         this.parseContext = context;
         this.extractor = (XSSFEventBasedExcelExtractor)extractor;
-        // not yet supported in POI-3.16-beta3
-        // this.extractor.setFormulasNotResults(false);
-        this.extractor.setLocale(locale);
+        configureExtractor(this.extractor, locale);
 
         if (locale == null) {
             formatter = new TikaExcelDataFormatter();
@@ -95,6 +92,11 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
         }
     }
 
+    protected void configureExtractor(POIXMLTextExtractor extractor, Locale locale) {
+        ((XSSFEventBasedExcelExtractor)extractor).setFormulasNotResults(false);
+        ((XSSFEventBasedExcelExtractor)extractor).setLocale(locale);
+    }
+
     @Override
     public void getXHTML(
             ContentHandler handler, Metadata metadata, ParseContext context)
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index cea5e9f..fc31958 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -267,44 +267,6 @@ public class ExcelParserTest extends TikaTest {
         }
     }
 
-    /**
-     * We don't currently support the .xlsb file format 
-     *  (an OOXML container with binary blobs), but we 
-     *  shouldn't break on these files either (TIKA-826)  
-     */
-    @Test
-    public void testExcelXLSB() throws Exception {
-        Detector detector = new DefaultDetector();
-        AutoDetectParser parser = new AutoDetectParser();
-
-        Metadata m = new Metadata();
-        m.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");
-
-        // Should be detected correctly
-        MediaType type;
-        try (InputStream input = ExcelParserTest.class.getResourceAsStream(
-                "/test-documents/testEXCEL.xlsb")) {
-            type = detector.detect(input, m);
-            assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString());
-        }
-
-        // OfficeParser won't handle it
-        assertEquals(false, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
-
-        // OOXMLParser will (soon) handle it
-        assertTrue((new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
-
-        // AutoDetectParser doesn't break on it
-        try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb")) {
-            ContentHandler handler = new BodyContentHandler(-1);
-            ParseContext context = new ParseContext();
-            context.set(Locale.class, Locale.US);
-            parser.parse(input, handler, m, context);
-
-            String content = handler.toString();
-            assertEquals("", content);
-        }
-    }
 
     /**
      * Excel 5 and 95 are older formats, and only get basic support
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index af1ba27..6420545 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -27,7 +27,6 @@ import javax.xml.transform.sax.SAXTransformerFactory;
 import javax.xml.transform.sax.TransformerHandler;
 import javax.xml.transform.stream.StreamResult;
 import java.io.ByteArrayOutputStream;
-import java.io.EOFException;
 import java.io.File;
 import java.io.InputStream;
 import java.io.PrintStream;
@@ -43,8 +42,9 @@ import java.util.Map;
 import org.apache.poi.util.LocaleUtil;
 import org.apache.tika.TikaTest;
 import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.EncryptedDocumentException;
-import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
@@ -52,19 +52,21 @@ import org.apache.tika.metadata.OfficeOpenXMLCore;
 import org.apache.tika.metadata.OfficeOpenXMLExtended;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.parser.microsoft.ExcelParserTest;
+import org.apache.tika.parser.microsoft.OfficeParser;
 import org.apache.tika.parser.microsoft.OfficeParserConfig;
 import org.apache.tika.parser.microsoft.WordParserTest;
 import org.apache.tika.sax.BodyContentHandler;
 import org.junit.Ignore;
 import org.junit.Test;
 import org.xml.sax.ContentHandler;
-import org.xml.sax.helpers.DefaultHandler;
 
 public class OOXMLParserTest extends TikaTest {
 
@@ -1430,10 +1432,43 @@ public class OOXMLParserTest extends TikaTest {
     }
 
     @Test
-    @Ignore("until poi-3.16-beta3")
+    public void testExcelXLSB() throws Exception {
+        Detector detector = new DefaultDetector();
+        AutoDetectParser parser = new AutoDetectParser();
+
+        Metadata m = new Metadata();
+        m.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");
+
+        // Should be detected correctly
+        MediaType type;
+        try (InputStream input = ExcelParserTest.class.getResourceAsStream(
+                "/test-documents/testEXCEL.xlsb")) {
+            type = detector.detect(input, m);
+            assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString());
+        }
+
+        // OfficeParser won't handle it
+        assertEquals(false, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
+
+        // OOXMLParser will (soon) handle it
+        assertTrue((new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
+
+        // AutoDetectParser doesn't break on it
+        try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb")) {
+            ContentHandler handler = new BodyContentHandler(-1);
+            ParseContext context = new ParseContext();
+            context.set(Locale.class, Locale.US);
+            parser.parse(input, handler, m, context);
+
+            String content = handler.toString();
+            assertContains("This is an example spreadsheet", content);
+        }
+    }
+
+    @Test
     public void testXLSBVarious() throws Exception {
-        //make sure to turn MACROs on, after we turn them off by default
         OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+        officeParserConfig.setExtractMacros(true);
         ParseContext parseContext = new ParseContext();
         parseContext.set(OfficeParserConfig.class, officeParserConfig);
         List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_various.xlsb", parseContext);
@@ -1473,22 +1508,8 @@ public class OOXMLParserTest extends TikaTest {
         assertContains("OddLeftFooter OddCenterFooter OddRightFooter", xml);
         assertContains("EvenLeftFooter EvenCenterFooter EvenRightFooter", xml);
         assertContains("FirstPageLeftFooter FirstPageCenterFooter FirstPageRightFooter", xml);
-    }
 
-    @Test
-    public void testTruncated() throws Exception {
-        Parser p = new AutoDetectParser();
-        ContentHandler handler = new DefaultHandler();
-        Metadata metadata = new Metadata();
-        ParseContext parseContext = new ParseContext();
-        try (InputStream is = getTestDocument("testWORD_truncated.docx")) {
-            p.parse(is, handler, metadata, parseContext);
-            fail("should have thrown an EOF exception?!");
-        } catch (TikaException e) {
-            Throwable cause = e.getCause();
-            assertTrue(cause instanceof EOFException);
-            assertEquals("application/x-tika-ooxml", metadata.get(Metadata.CONTENT_TYPE));
-        }
+
     }
 
 }

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.

[tika] 01/02: mock parser's uninterruptible sleep can happen to pause for exactly 3000 millis

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit a970303966072be65a65ddf2809b47fdd0db27b6
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Apr 19 09:20:43 2017 -0400

    mock parser's uninterruptible sleep can happen to pause for exactly 3000 millis
---
 .../src/test/java/org/apache/tika/parser/mock/MockParserTest.java       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mock/MockParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mock/MockParserTest.java
index 3d58b40..4adf9e7 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/mock/MockParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/mock/MockParserTest.java
@@ -192,7 +192,7 @@ public class MockParserTest extends TikaTest {
             //swallow
         }
         long elapsed = new Date().getTime()-start;
-        boolean longEnough = elapsed > 3000;//the xml file specifies 3000, this sleeps 1000
+        boolean longEnough = elapsed >= 3000;//the xml file specifies 3000, this sleeps 1000
         assertTrue("elapsed ("+elapsed+" millis) was not long enough", longEnough);
     }
 

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.