You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/06/29 14:10:07 UTC

[tika] branch TIKA-3164-1.x updated (7aa2732 -> bc9cca8)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-3164-1.x
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from 7aa2732  TIKA-3164 -- WIP POI 5.0.0
     add 3776e48  TIKA-3244 upgrade jackcess with upgrade to most recent jackcess-crypt; fix npe while at it.
     add f64cbed  TIKA-3244 -- general upgrades for 1.26
     add 2550aac  Merge remote-tracking branch 'origin/branch_1x' into branch_1x
     add 617a33b  TIKA-3244 update jackcess to continue on null table; make build work for Java > 8.
     add de548d4  TIKA-3244: update mockito
     add 1dc1583  Merge remote-tracking branch 'origin/branch_1x' into branch_1x
     add 5a0df92  Update CHANGES for 1.26 release
     add 012c54e  Update CHANGES for 1.26 release
     add 8053f70  [maven-release-plugin] prepare release 1.26-rc1
     add 2fea0b0  [maven-release-plugin] prepare for next development iteration
     add d786d95  rollback for 1.26-SNAPSHOT development
     add faaaca7  add timeout threshold for fileprofiler
     add 2475d07  TIKA-3314
     add 9f059bd  TIKA-3315 add xerces to tika-eval
     add afea29c  TIKA-3316 -- improve processing of XPS files
     add f5a2aed  TIKA-3320 -- allow case-insensitive header request matching for pdf/ocr config in tika-server
     add f5eebe2  TIKA-3244: update snappy-java
     add 5a7a4e7  TIKA-3244: update lombok, twelvemonkeys, jackson
     add ed07a6e  TIKA-3244: update tukaani
     add 22cdffb  TIKA-3244: update zstd
     add 52b18f5  TIKA-3244: update commons-lang
     add d615710  TIKA-3244: update spring
     add 02ed830  TIKA-3244: update pax-url-aether
     add 8081e6d  TIKA-3318 Document the units of xmpDM:duration as seconds by default
     add 21b3cf8  TIKA-3318 MP3 parser should output the xmpDM:duration metadata as seconds not milliseconds
     add a4c9257  Changelog update
     add b0242ee  Backport to 1.x - TIKA-3310 Check if MP4 file's compatible brands match any of the expected values, from Peter Kronenberg
     add a1ec3fd  Changelog update
     add 7ff6a49  Merge remote-tracking branch 'origin/branch_1x' into branch_1x
     add 2e5b822  TIKA-3323 -- allow flexibility for 'file' command output on different operating systems
     add e6bf840  Minor restructure and added missing javadoc (#416)
     add 4f24ca5  TIKA-3327 Simple server metrics monitoring (#417)
     add 32caedd  TIKA-3327 Added missing locale (#418)
     add 8bf65c0  TIKA-3332 -- recursively process the embedded file tree in PDFs.
     add ecf502a  TIKA-3325 -- writeLimit is now calculated on the full file (container and embedded documents), no longer on each.
     add 88166ce  TIKA-3331 -- return a more informative exception for encrypted ODT
     add 7041f96  remove ProgressMonitorInputStream because it conceals the underlying TikaInputStream and wreaks havoc with reset and using underlying files, etc...
     add 21f428d  TIKA-3322 -- upgrade PDFBox to 2.0.23
     add 6a27f3e  TIKA-3244: update spring
     add b63072c  Merge remote-tracking branch 'origin/branch_1x' into branch_1x
     add b29cce5  TIKA-3244 -- general upgrades for 1.26
     add b1e8641  TIKA-3335 -- handle bad xml more robustly when checking for encryption
     add 2b8c9a3  TIKA-3336 -- new zip bombs detect in 1.26-SNAPSHOT compared with 1.25 -- bug, don't advance twice per call to chars/whitespace
     add da05576  TIKA-3334 -- fix thread safety bug in handling embedded docs in open office parser
     add 8c21fba  Update CHANGES.txt for 1.26 release
     add 1842758  fix rat and imports for 1.26 release
     add 2e83fd4  [maven-release-plugin] prepare release 1.26-rc1
     add 1d09e95  [maven-release-plugin] prepare for next development iteration
     add 74290b3  TIKA-3350
     add 79d5448  TIKA-3244: update commons-net
     add 7f83539  [TIKA-3344] [TIKA-3345] (#422)
     add f51a905  Merge remote-tracking branch 'origin/branch_1x' into branch_1x
     add 2006dc5  TIKA-3352: Add json output for /tika endpoint in tika-server
     add d3ed76f  TIKA-3355 -- integrate fakeload library into MockParser
     add 9f58c30  TIKA-3355 -- include dependencies in test-jar
     add f58b01f  TIKA-3196 -- multithreading issue in package parser
     add 82f7a34  [TIKA-3357] removes ambiguity by choosing handler based on produce type (#427)
     add 3fa1953  TIKA-3357 -- add unit test
     add 2704f0e  TIKA-3374 add encoding detection to zip entry names via Ryan Liu.
     add cd191ff  update changes for new development in 1.27...please may it never happen... 2.0.0 here we come!
     add 34fb775  Merge remote-tracking branch 'origin/branch_1x' into branch_1x
     add 64cdc8e  [TIKA-3353] Prometheus and JMX monitoring over micrometer (#429)
     add 6f2f373  Merge remote-tracking branch 'origin/branch_1x' into branch_1x
     add bc551b1  TIKA-3372 -- fix write limit in recursive parser handler
     add f7d5119  TIKA-3373 Add the *.yml extension for YAML, which is commonly used, along with aliases for popular alternate mimetypes for it
     add f414fe4  Merge remote-tracking branch 'origin/branch_1x' into branch_1x
     add 64b2e4b  Merge remote-tracking branch 'origin/branch_1x' into branch_1x
     add 32545d4  TIKA-3376 improve handling of write limit reached in json output from /tika endpoint
     add 509979b  fix logic in ExtractComparer
     add 2f80958  TIKA-3372 -- fix write limit handling in the PDFParser
     add 7cae30e  TIKA-3374 -- allow users to turn off charset detection of entry names in ZipArchives
     add 2c4ba60  Backport Merge branch 'TIKA-3329' of https://github.com/thammegowda/tika into main
     add b2f37ef  Forbidden API error fixes.
     add e281b4d  avoid doubling content in the RecursiveParserWrapper by sending in a new Metadata object to the embedded parser handler
     add 2186e67  TIKA-3382 -- improve writelimitreached handling in numerous parsers
     add ba72c86  TIKA-3382 -- improve writelimitreached handling in numerous parsers -- clean up PDFParser
     new ab86545  Merge remote-tracking branch 'origin/branch_1x' into TIKA-3164-1.x
     add b7e53e4  TIKA-3386 allow a times parameter in MockParser
     add 6efea4f  TIKA-3386 -- fix conflicts
     add 06c111f  TIKA-3392 -- allow insecure parsing in MimeTypesReader
     add c6998bd  TIKA-3405 -- fix and upgrade isoparser
     add 499e1c9  TIKA-3407 -- fix jaxb version and clean up related issues
     add fd98eee  TIKA-3441 -- prevent infinite loop on failure to bind to a port
     add d7fa2cd  TIKA-3441 -- improve likelihood that tesseract processes will be shutdown on crash.
     add 27d0e3b  TIKA-3433 -- extractAnnotationText should be settable via tikaconfig
     add 6a9e726  TIKA-3437 -- deprecate experimental PDFPreflightParser
     add 10c94ff  fix for change in mp4parser behavior
     add e8ec223  Merge remote-tracking branch 'origin/branch_1x' into branch_1x
     add 1224f88  TIKA-3441 -- improve likelihood that tesseract processes will be shutdown on jvm restart.
     add 90c6ea4  TIKA-3444 -- upgrade to pdfbox 2.0.24
     add 4ba5fd7  TIKA-3456 -- LanguageDetector should chunk long strings and test for hasEnoughText.
     add 57f5912  TIKA-3457 -- general upgrades for 1.27
     new 702cf3e  Merge remote-tracking branch 'origin/branch_1x' into TIKA-3164-1.x
     new bc9cca8  TIKA-3164 -- further tweaks

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CHANGES.txt                                        |   42 +-
 NOTICE.txt                                         |   24 +
 pom.xml                                            |    2 +-
 tika-app/pom.xml                                   |    2 +-
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |    2 +-
 .../src/main/java/org/apache/tika/gui/TikaGUI.java |    3 -
 tika-batch/pom.xml                                 |    2 +-
 .../batch/fs/RecursiveParserWrapperFSConsumer.java |    2 +-
 tika-bundle/pom.xml                                |   46 +-
 tika-core/pom.xml                                  |  377 ++++---
 .../main/assembly/test-jar-with-dependencies.xml   |   38 +
 ...TikaException.java => RuntimeSAXException.java} |   15 +-
 .../tika/exception/WriteLimitReachedException.java |   88 ++
 .../tika/language/detect/LanguageDetector.java     |   21 +-
 .../main/java/org/apache/tika/metadata/XMPDM.java  |    1 +
 .../java/org/apache/tika/mime/MimeTypesReader.java |   18 +-
 .../tika/parser/AbstractExternalProcessParser.java |   57 +
 .../org/apache/tika/parser/CompositeParser.java    |    4 +
 .../apache/tika/parser/RecursiveParserWrapper.java |  120 ++-
 .../sax/AbstractRecursiveParserWrapperHandler.java |   11 +-
 .../tika/sax/BasicContentHandlerFactory.java       |    3 +
 .../tika/sax/RecursiveParserWrapperHandler.java    |    9 +-
 .../org/apache/tika/sax/SecureContentHandler.java  |    2 +-
 .../org/apache/tika/mime/tika-mimetypes.xml        |    7 +
 .../src/test/java/org/apache/tika/TikaTest.java    |   15 +-
 .../tika/detect/FileCommandDetectorTest.java       |   40 +-
 .../org/apache/tika/parser/mock/MockParser.java    |  176 +++-
 .../apache/tika/parser/mock/MockParserTest.java}   |   29 +-
 .../resources/test-documents/mock_fakeload.xml     |    8 +-
 .../test/resources/test-documents/mock_times.xml   |    5 +-
 tika-dl/pom.xml                                    |   34 +-
 tika-eval/pom.xml                                  |   59 +-
 .../java/org/apache/tika/eval/ExtractComparer.java |    2 +-
 .../java/org/apache/tika/eval/FileProfiler.java    |   25 +-
 .../src/main/resources/lucene-char-mapping.txt     |    3 +-
 .../resources/tika-eval-file-profiler-config.xml   |   10 +-
 tika-example/pom.xml                               |    8 +-
 tika-fuzzing/pom.xml                               |    2 +-
 tika-java7/pom.xml                                 |    2 +-
 tika-langdetect/pom.xml                            |    8 +-
 tika-nlp/pom.xml                                   |    6 +-
 tika-parent/pom.xml                                | 1108 ++++++++++----------
 tika-parsers/pom.xml                               |   87 +-
 .../apache/tika/parser/asm/XHTMLClassVisitor.java  |   17 +-
 .../java/org/apache/tika/parser/chm/ChmParser.java |    5 +-
 .../org/apache/tika/parser/crypto/Pkcs7Parser.java |    4 +-
 .../org/apache/tika/parser/crypto/TSDParser.java   |   11 +-
 .../tika/parser/microsoft/JackcessExtractor.java   |    7 +
 .../tika/parser/microsoft/JackcessParser.java      |    2 +-
 .../tika/parser/microsoft/SummaryExtractor.java    |    1 -
 .../microsoft/ooxml/AbstractOOXMLExtractor.java    |   12 +-
 .../microsoft/ooxml/OOXMLExtractorFactory.java     |   18 +-
 .../microsoft/ooxml/OOXMLTikaBodyPartHandler.java  |  356 +++----
 .../ooxml/OOXMLWordAndPowerPointTextHandler.java   |   50 +-
 .../ooxml/XSSFExcelExtractorDecorator.java         |   11 +-
 .../microsoft/ooxml/xps/XPSExtractorDecorator.java |    5 +
 .../ooxml/xwpf/XWPFEventBasedWordExtractor.java    |    9 +
 .../microsoft/xml/AbstractXML2003Parser.java       |    5 +-
 .../java/org/apache/tika/parser/mp3/Mp3Parser.java |   13 +-
 .../java/org/apache/tika/parser/mp4/MP4Parser.java |   26 +-
 .../apache/tika/parser/ocr/TesseractOCRParser.java |   99 +-
 ...ndler.java => OpenDocumentManifestHandler.java} |   54 +-
 .../apache/tika/parser/odf/OpenDocumentParser.java |  110 +-
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |   77 +-
 .../java/org/apache/tika/parser/pdf/PDF2XHTML.java |    2 +
 .../java/org/apache/tika/parser/pdf/PDFParser.java |    3 +-
 .../apache/tika/parser/pdf/PDFPreflightParser.java |    5 +
 .../org/apache/tika/parser/pkg/PackageParser.java  |   59 +-
 .../parser/pkg/StreamingZipContainerDetector.java  |  171 +--
 .../tika/parser/pkg/ZipContainerDetector.java      |   18 +-
 .../org/apache/tika/parser/utils/ZipSalvager.java  |   96 +-
 .../tika/config/TikaEncodingDetectorTest.java      |    6 +-
 .../tika/detect/TestContainerAwareDetector.java    |    2 +-
 .../tika/parser/RecursiveParserWrapperTest.java    |   15 +-
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |    2 -
 .../parser/microsoft/ooxml/xps/XPSParserTest.java  |   45 +
 .../org/apache/tika/parser/mp3/Mp3ParserTest.java  |    6 +-
 .../org/apache/tika/parser/mp4/MP4ParserTest.java  |   13 +-
 .../org/apache/tika/parser/odf/ODFParserTest.java  |   68 ++
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |   24 +
 .../apache/tika/parser/pkg/PackageParserTest.java  |   13 +-
 .../tika/parser/pkg/ZipContainerDetectorTest.java  |    2 +-
 tika-parsers/src/test/resources/log4j.properties   |    2 +-
 .../pdf/tika-skip-annotations-config.xml}          |   10 +-
 .../src/test/resources/test-documents/gbk.zip      |  Bin 0 -> 432 bytes
 .../resources/test-documents/testODTEncrypted.odt  |  Bin 0 -> 12714 bytes
 .../testPDF_deeplyEmbeddedAttachments.pdf          |  Bin 0 -> 122221 bytes
 .../test-documents/testXPSWithDataDescriptor.xps   |  Bin 0 -> 44523 bytes
 .../test-documents/testXPSWithDataDescriptor2.xps  |  Bin 0 -> 51175 bytes
 tika-serialization/pom.xml                         |    2 +-
 tika-server/pom.xml                                |   52 +-
 .../tika/server/ProduceTypeResourceComparator.java |  145 +++
 .../java/org/apache/tika/server/TikaServerCli.java |   95 +-
 .../org/apache/tika/server/TikaServerWatchDog.java |   74 +-
 .../org/apache/tika/server/mbean/MBeanHelper.java  |   60 ++
 .../tika/server/mbean/ServerStatusExporter.java    |   83 ++
 .../server/mbean/ServerStatusExporterMBean.java    |   49 +-
 .../apache/tika/server/metrics/Log4JMetrics.java   |  212 ++++
 .../apache/tika/server/metrics/MetricsHelper.java  |  220 ++++
 .../tika/server/metrics/MetricsResource.java       |   53 +
 .../tika/server/metrics/ServerStatusMetrics.java   |   61 ++
 .../server/resource/RecursiveMetadataResource.java |    8 +-
 .../apache/tika/server/resource/TikaResource.java  |  206 +++-
 .../tika/server/resource/TikaServerStatus.java     |   21 +-
 .../java/org/apache/tika/server/CXFTestBase.java   |   20 +-
 .../apache/tika/server/MetricsResourceTest.java    |  118 +++
 .../tika/server/RecursiveMetadataResourceTest.java |  143 ++-
 .../org/apache/tika/server/StackTraceTest.java     |   17 +-
 .../server/TikaResourceMetadataFilterTest.java     |   83 ++
 .../tika/server/TikaResourceNoStackTest.java       |   98 ++
 .../org/apache/tika/server/TikaResourceTest.java   |  322 +++++-
 .../tika/server/TikaServerIntegrationTest.java     |   55 +
 .../resources/configs/metadata-filter-include.xml  |   14 +-
 .../mock/{fake_oom.xml => hello_world.xml}         |    4 +-
 .../mock/{fake_oom.xml => hello_world_long.xml}    |    8 +-
 tika-translate/pom.xml                             |   45 +-
 .../tika/language/translate/RTGTranslator.java     |  143 +++
 .../org.apache.tika.language.translate.Translator  |    1 +
 ...eTranslatorTest.java => RTGTranslatorTest.java} |   47 +-
 tika-xmp/pom.xml                                   |    2 +-
 120 files changed, 4708 insertions(+), 1730 deletions(-)
 create mode 100644 tika-core/src/main/assembly/test-jar-with-dependencies.xml
 copy tika-core/src/main/java/org/apache/tika/exception/{TikaException.java => RuntimeSAXException.java} (77%)
 create mode 100644 tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java
 create mode 100644 tika-core/src/main/java/org/apache/tika/parser/AbstractExternalProcessParser.java
 copy tika-core/src/{main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java => test/java/org/apache/tika/parser/mock/MockParserTest.java} (58%)
 copy tika-parsers/src/test/resources/test-documents/mock/fake_oom.xml => tika-core/src/test/resources/test-documents/mock_fakeload.xml (73%)
 copy tika-batch/src/test/resources/test-input/max_restarts/test1_oom.xml => tika-core/src/test/resources/test-documents/mock_times.xml (86%)
 copy tika-parsers/src/main/java/org/apache/tika/parser/odf/{OpenDocumentMacroHandler.java => OpenDocumentManifestHandler.java} (50%)
 copy tika-parsers/src/test/resources/org/apache/tika/{config/TIKA-1558-excludesub.xml => parser/pdf/tika-skip-annotations-config.xml} (78%)
 create mode 100644 tika-parsers/src/test/resources/test-documents/gbk.zip
 create mode 100644 tika-parsers/src/test/resources/test-documents/testODTEncrypted.odt
 create mode 100644 tika-parsers/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf
 create mode 100644 tika-parsers/src/test/resources/test-documents/testXPSWithDataDescriptor.xps
 create mode 100644 tika-parsers/src/test/resources/test-documents/testXPSWithDataDescriptor2.xps
 create mode 100644 tika-server/src/main/java/org/apache/tika/server/ProduceTypeResourceComparator.java
 create mode 100644 tika-server/src/main/java/org/apache/tika/server/mbean/MBeanHelper.java
 create mode 100644 tika-server/src/main/java/org/apache/tika/server/mbean/ServerStatusExporter.java
 copy tika-core/src/main/java/org/apache/tika/metadata/Geographic.java => tika-server/src/main/java/org/apache/tika/server/mbean/ServerStatusExporterMBean.java (50%)
 create mode 100644 tika-server/src/main/java/org/apache/tika/server/metrics/Log4JMetrics.java
 create mode 100644 tika-server/src/main/java/org/apache/tika/server/metrics/MetricsHelper.java
 create mode 100644 tika-server/src/main/java/org/apache/tika/server/metrics/MetricsResource.java
 create mode 100644 tika-server/src/main/java/org/apache/tika/server/metrics/ServerStatusMetrics.java
 create mode 100644 tika-server/src/test/java/org/apache/tika/server/MetricsResourceTest.java
 create mode 100644 tika-server/src/test/java/org/apache/tika/server/TikaResourceMetadataFilterTest.java
 create mode 100644 tika-server/src/test/java/org/apache/tika/server/TikaResourceNoStackTest.java
 copy tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-bad-parameters.xml => tika-server/src/test/resources/configs/metadata-filter-include.xml (68%)
 copy tika-server/src/test/resources/mock/{fake_oom.xml => hello_world.xml} (83%)
 copy tika-server/src/test/resources/mock/{fake_oom.xml => hello_world_long.xml} (58%)
 create mode 100644 tika-translate/src/main/java/org/apache/tika/language/translate/RTGTranslator.java
 copy tika-translate/src/test/java/org/apache/tika/language/translate/{GoogleTranslatorTest.java => RTGTranslatorTest.java} (62%)

[tika] 03/03: TIKA-3164 -- further tweaks

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3164-1.x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit bc9cca8761d20bd9a9f2f54ebcaf89ba093e2c82
Author: tallison <ta...@apache.org>
AuthorDate: Tue Jun 29 10:09:41 2021 -0400

    TIKA-3164 -- further tweaks
---
 tika-bundle/pom.xml                                | 23 ++++++++++++++++++-
 tika-eval/pom.xml                                  | 26 +++++++++++++++++++++-
 tika-parsers/pom.xml                               | 20 +++++++++++++++++
 .../tika/parser/microsoft/SummaryExtractor.java    |  1 -
 .../microsoft/ooxml/AbstractOOXMLExtractor.java    |  1 -
 .../microsoft/ooxml/OOXMLExtractorFactory.java     |  9 +++-----
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |  2 --
 tika-parsers/src/test/resources/log4j.properties   |  2 +-
 .../apache/tika/server/resource/TikaResource.java  |  2 +-
 9 files changed, 72 insertions(+), 14 deletions(-)

diff --git a/tika-bundle/pom.xml b/tika-bundle/pom.xml
index c90cc85..aecfb25 100644
--- a/tika-bundle/pom.xml
+++ b/tika-bundle/pom.xml
@@ -203,7 +203,8 @@
               bcpkix-jdk15on|
               poi|poi-scratchpad|
               poi-ooxml|
-              poi-ooxml-schemas|
+              poi-ooxml-lite|
+              log4j-api|
               commons-math3|
               curvesapi|
               xmlbeans|
@@ -266,6 +267,7 @@
 	      !org.apache.spark.ml.*,
 	      !org.apache.spark.mllib.*,
 	      !org.apache.spark.sql.*,
+              !com.github.javaparser.*,
 	          org.apache.tika.mime,
               org.apache.tika.fork,
               android.util;resolution:=optional,
@@ -287,11 +289,14 @@
               com.parso;resolution:=optional,
               com.sleepycat.je;resolution:=optional,
               com.sun.javadoc;resolution:=optional,
+              com.sun.org.apache.xml.internal.resolver;resolution:=optional,
+              com.sun.org.apache.xml.internal.resolver.tools;resolution:=optional,
               com.sun.xml.bind.marshaller;resolution:=optional,
               com.sun.xml.internal.bind.marshaller;resolution:=optional,
               com.sun.msv.datatype;resolution:=optional,
               com.sun.msv.datatype.xsd;resolution:=optional,
               com.sun.tools.javadoc;resolution:=optional,
+              de.rototor.pdfbox.graphics2d;resolution:=optional,
               edu.mit.ll.mitie;resolution:=optional,
               edu.stanford.nlp.*;resolution:=optional,
               edu.wisc.ssec.mcidas;resolution:=optional,
@@ -324,15 +329,22 @@
               net.didion.jwnl;resolution:=optional,
               net.sf.saxon;resolution:=optional,
               net.sf.saxon.dom;resolution:=optional,
+              net.sf.saxon.lib;resolution:=optional,
+              net.sf.saxon.ma.map;resolution:=optional,
               net.sf.saxon.om;resolution:=optional,
               net.sf.saxon.query;resolution:=optional,
               net.sf.saxon.sxpath;resolution:=optional,
+              net.sf.saxon.trans;resolution:=optional,
+              net.sf.saxon.tree.wrapper;resolution:=optional,
+              net.sf.saxon.type;resolution:=optional,
               net.sf.saxon.value;resolution:=optional,
               org.apache.batik.anim.dom;resolution:=optional,
               org.apache.batik.bridge;resolution:=optional,
+              org.apache.batik.dom;resolution:=optional,
               org.apache.batik.ext.awt;resolution:=optional,
               org.apache.batik.ext.awt.image.renderable;resolution:=optional,
               org.apache.batik.gvt;resolution:=optional,
+              org.apache.batik.svggen;resolution:=optional,
               org.apache.batik.util;resolution:=optional,
               org.apache.cxf.jaxrs.client;resolution:=optional,
               org.apache.cxf.jaxrs.ext.multipart;resolution:=optional,
@@ -349,6 +361,14 @@
               org.apache.commons.vfs2.util;resolution:=optional,
               org.apache.crimson.jaxp;resolution:=optional,
               org.apache.jcp.xml.dsig.internal.dom;resolution:=optional,
+              org.apache.logging.log4j;resolution:=optional,
+              org.apache.logging.log4j.message;resolution:=optional,
+              org.apache.logging.log4j.util;resolution:=optional,
+              org.apache.logging.log4j.util.internal;resolution:=optional,
+              org.apache.maven.model;resolution:=optional,
+              org.apache.maven.plugin;resolution:=optional,
+              org.apache.maven.plugin.logging;resolution:=optional,
+              org.apache.maven.project;resolution:=optional,
               org.apache.pdfbox.debugger;resolution:=optional,
               org.apache.pdfbox.preflight.*;resolution:=optional,
               org.apache.sis;resolution:=optional,
@@ -438,6 +458,7 @@
               org.slf4j.helpers;resolution:=optional,
               org.sqlite;resolution:=optional,
               org.w3c.dom;resolution:=optional,
+              org.w3c.dom.traversal;resolution:=optional,
               org.relaxng.datatype;resolution:=optional,
               org.xml.sax;resolution:=optional,
               org.xml.sax.ext;resolution:=optional,
diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml
index d86c365..08359bf 100644
--- a/tika-eval/pom.xml
+++ b/tika-eval/pom.xml
@@ -124,11 +124,35 @@
                     <groupId>org.apache.commons</groupId>
                     <artifactId>commons-compress</artifactId>
                 </exclusion>
+                <exclusion>
+                    <groupId>org.apache.xmlgraphics</groupId>
+                    <artifactId>batik-all</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.apache.xmlgraphics</groupId>
+                    <artifactId>batik-bridge</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.apache.xmlgraphics</groupId>
+                    <artifactId>batik-svggen</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.apache.xmlgraphics</groupId>
+                    <artifactId>batik-codec</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.apache.pdfbox</groupId>
+                    <artifactId>pdfbox</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>jakarta.xml.bind</groupId>
+                    <artifactId>jakarta.xml.bind-api</artifactId>
+                </exclusion>
             </exclusions>
         </dependency>
         <dependency>
             <groupId>org.apache.poi</groupId>
-            <artifactId>poi-ooxml-schemas</artifactId>
+            <artifactId>poi-ooxml-lite</artifactId>
             <version>${poi.version}</version>
         </dependency>
         <dependency>
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index cb0e473..693515e 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -275,6 +275,26 @@
           <groupId>org.apache.xmlgraphics</groupId>
           <artifactId>batik-all</artifactId>
         </exclusion>
+        <exclusion>
+          <groupId>org.apache.xmlgraphics</groupId>
+          <artifactId>batik-bridge</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.apache.xmlgraphics</groupId>
+          <artifactId>batik-svggen</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.apache.xmlgraphics</groupId>
+          <artifactId>batik-codec</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>org.apache.pdfbox</groupId>
+          <artifactId>pdfbox</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>jakarta.xml.bind</groupId>
+          <artifactId>jakarta.xml.bind-api</artifactId>
+        </exclusion>
       </exclusions>
     </dependency>
     <dependency>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
index ba98c0e..30c472d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
@@ -24,7 +24,6 @@ import java.util.Set;
 
 import org.apache.poi.hpsf.CustomProperties;
 import org.apache.poi.hpsf.DocumentSummaryInformation;
-import org.apache.poi.hpsf.MarkUnsupportedException;
 import org.apache.poi.hpsf.NoPropertySetStreamException;
 import org.apache.poi.hpsf.PropertySet;
 import org.apache.poi.hpsf.SummaryInformation;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index b404da7..ba43c31 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -380,7 +380,6 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
             throws SAXException, IOException {
         Metadata metadata = new Metadata();
         metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);
-
         // Get the name
         String name = part.getPartName().getName();
         metadata.set(
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index ddc607a..fd265f5 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -65,8 +65,6 @@ import org.slf4j.LoggerFactory;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
-import static org.apache.poi.ooxml.extractor.POIXMLExtractorFactory.setThreadPrefersEventExtractors;
-
 /**
  * Figures out the correct {@link OOXMLExtractor} for the supplied document and
  * returns it.
@@ -78,7 +76,7 @@ public class OOXMLExtractorFactory {
     private static POIXMLExtractorFactory EXTRACTOR_FACTORY = new POIXMLExtractorFactory();
 
     static {
-        setThreadPrefersEventExtractors(true);
+        POIXMLExtractorFactory.setAllThreadsPreferEventExtractors(true);
     }
 
     public static void parse(
@@ -176,7 +174,6 @@ public class OOXMLExtractorFactory {
             if (poiExtractor == null) {
                 poiExtractor = EXTRACTOR_FACTORY.create(pkg);
             }
-
             POIXMLDocument document = poiExtractor.getDocument();
             if (poiExtractor instanceof XSSFBEventBasedExcelExtractor) {
                 extractor = new XSSFBExcelExtractorDecorator(context, poiExtractor, locale);
@@ -212,7 +209,6 @@ public class OOXMLExtractorFactory {
             // Get the bulk of the metadata first, so that it's accessible during
             //  parsing if desired by the client (see TIKA-1109)
             extractor.getMetadataExtractor().extract(metadata);
-
             // Extract the text, along with any in-document metadata
             extractor.getXHTML(baseHandler, metadata, context);
         } catch (IllegalArgumentException e) {
@@ -291,7 +287,8 @@ public class OOXMLExtractorFactory {
         //TODO make this static...or find what happened to SUPPORTED_TYPES
         XSLFRelation[] xslfRelations = new XSLFRelation[] {
                 XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE,
-                XSLFRelation.PRESENTATIONML_TEMPLATE
+                XSLFRelation.PRESENTATIONML,
+                XSLFRelation.PRESENTATIONML_TEMPLATE, XSLFRelation.PRESENTATION_MACRO
         };
 
         for (int i = 0; i < xslfRelations.length; i++) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index bdbc9e4..3e007b9 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -31,7 +31,6 @@ import java.io.File;
 import java.io.InputStream;
 import java.io.PrintStream;
 import java.io.StringWriter;
-import java.nio.file.Path;
 import java.text.DecimalFormatSymbols;
 import java.util.Arrays;
 import java.util.HashMap;
@@ -43,7 +42,6 @@ import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
-import org.apache.ctakes.typesystem.type.syntax.O;
 import org.apache.poi.util.LocaleUtil;
 import org.apache.tika.TikaTest;
 import org.apache.tika.config.TikaConfig;
diff --git a/tika-parsers/src/test/resources/log4j.properties b/tika-parsers/src/test/resources/log4j.properties
index f2c0b92..d557c48 100644
--- a/tika-parsers/src/test/resources/log4j.properties
+++ b/tika-parsers/src/test/resources/log4j.properties
@@ -21,4 +21,4 @@ log4j.appender.stdout=org.apache.log4j.ConsoleAppender
 log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
 
 # Pattern to output the caller's file name and line number.
-log4j.appender.stdout.layout.ConversionPattern=%5p [%t] (%F:%L) - %m%n
+log4j.appender.stdout.layout.ConversionPattern=%5p [%t] (%F:%L) ----- %m%n
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
index b326f7f..c77694d 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
@@ -22,7 +22,6 @@ import org.apache.commons.lang3.StringUtils;
 import org.apache.cxf.attachment.ContentDisposition;
 import org.apache.cxf.jaxrs.ext.multipart.Attachment;
 import org.apache.cxf.jaxrs.impl.MetadataMap;
-import org.apache.poi.ooxml.extractor.ExtractorFactory;
 import org.apache.tika.Tika;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.Detector;
@@ -55,6 +54,7 @@ import org.apache.tika.server.ServerStatus;
 import org.apache.tika.server.TikaServerParseException;
 import org.apache.tika.utils.ExceptionUtils;
 
+import org.apache.poi.extractor.ExtractorFactory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.xml.sax.ContentHandler;

[tika] 02/03: Merge remote-tracking branch 'origin/branch_1x' into TIKA-3164-1.x

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3164-1.x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 702cf3e7101f500a6f176e03921573671504acd3
Merge: ab86545 57f5912
Author: tallison <ta...@apache.org>
AuthorDate: Tue Jun 29 09:26:41 2021 -0400

    Merge remote-tracking branch 'origin/branch_1x' into TIKA-3164-1.x

 CHANGES.txt                                        |  8 ++
 tika-bundle/pom.xml                                | 17 +++-
 .../tika/language/detect/LanguageDetector.java     | 21 ++++-
 .../java/org/apache/tika/mime/MimeTypesReader.java | 18 +++-
 .../tika/parser/AbstractExternalProcessParser.java | 57 +++++++++++++
 .../org/apache/tika/parser/mock/MockParser.java    | 13 ++-
 .../apache/tika/parser/mock/MockParserTest.java    | 11 +++
 .../test/resources/test-documents/mock_times.xml   | 26 ++++++
 tika-dl/pom.xml                                    | 24 ++++++
 tika-example/pom.xml                               |  6 +-
 tika-langdetect/pom.xml                            | 10 ---
 tika-parent/pom.xml                                | 41 ++++-----
 tika-parsers/pom.xml                               | 56 +++---------
 .../apache/tika/parser/ocr/TesseractOCRParser.java | 99 +++++++++++++---------
 .../java/org/apache/tika/parser/pdf/PDFParser.java |  1 +
 .../apache/tika/parser/pdf/PDFPreflightParser.java |  5 ++
 .../org/apache/tika/parser/mp4/MP4ParserTest.java  | 13 ++-
 .../org/apache/tika/parser/pdf/PDFParserTest.java  | 13 +++
 .../parser/pdf/tika-skip-annotations-config.xml    | 28 ++++++
 tika-server/pom.xml                                | 66 +--------------
 .../java/org/apache/tika/server/TikaServerCli.java | 60 +++++++++----
 .../org/apache/tika/server/TikaServerWatchDog.java | 74 ++++++++++++----
 .../tika/server/TikaServerIntegrationTest.java     | 55 ++++++++++++
 tika-translate/pom.xml                             | 47 +---------
 24 files changed, 500 insertions(+), 269 deletions(-)

diff --cc tika-parent/pom.xml
index 71a5928,1569910..6d3fadc
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@@ -334,11 -334,11 +334,11 @@@
          <maven.shade.version>3.2.4</maven.shade.version>
          <rat.version>0.13</rat.version>
          <!-- NOTE: sync tukaani version with commons-compress in tika-parsers -->
 -        <poi.version>4.1.2</poi.version>
 +        <poi.version>5.0.1-SNAPSHOT</poi.version>
          <commons.compress.version>1.20</commons.compress.version>
-         <commons.io.version>2.8.0</commons.io.version>
+         <commons.io.version>2.10.0</commons.io.version>
          <commons.lang3.version>3.12.0</commons.lang3.version>
-         <gson.version>2.8.6</gson.version>
+         <gson.version>2.8.7</gson.version>
          <guava.version>30.1.1-jre</guava.version>
          <osgi.core.version>6.0.0</osgi.core.version>
  

[tika] 01/03: Merge remote-tracking branch 'origin/branch_1x' into TIKA-3164-1.x

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3164-1.x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit ab86545afec5cef3a05e546e4e667ea843a738ae
Merge: 7aa2732 ba72c86
Author: tallison <ta...@apache.org>
AuthorDate: Wed May 5 10:32:48 2021 -0400

    Merge remote-tracking branch 'origin/branch_1x' into TIKA-3164-1.x
    
    # Conflicts:
    #	tika-parent/pom.xml
    #	tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java

 CHANGES.txt                                        |   34 +-
 NOTICE.txt                                         |   24 +
 pom.xml                                            |    2 +-
 tika-app/pom.xml                                   |    2 +-
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |    2 +-
 .../src/main/java/org/apache/tika/gui/TikaGUI.java |    3 -
 tika-batch/pom.xml                                 |    2 +-
 .../batch/fs/RecursiveParserWrapperFSConsumer.java |    2 +-
 tika-bundle/pom.xml                                |    6 +-
 tika-core/pom.xml                                  |  377 ++++---
 .../main/assembly/test-jar-with-dependencies.xml   |   38 +
 .../apache/tika/exception/RuntimeSAXException.java |   29 +
 .../tika/exception/WriteLimitReachedException.java |   88 ++
 .../main/java/org/apache/tika/metadata/XMPDM.java  |    1 +
 .../org/apache/tika/parser/CompositeParser.java    |    4 +
 .../apache/tika/parser/RecursiveParserWrapper.java |  120 ++-
 .../sax/AbstractRecursiveParserWrapperHandler.java |   11 +-
 .../tika/sax/BasicContentHandlerFactory.java       |    3 +
 .../tika/sax/RecursiveParserWrapperHandler.java    |    9 +-
 .../org/apache/tika/sax/SecureContentHandler.java  |    2 +-
 .../org/apache/tika/mime/tika-mimetypes.xml        |    7 +
 .../src/test/java/org/apache/tika/TikaTest.java    |   15 +-
 .../tika/detect/FileCommandDetectorTest.java       |   40 +-
 .../org/apache/tika/parser/mock/MockParser.java    |  163 ++-
 .../apache/tika/parser/mock/MockParserTest.java    |   30 +
 .../resources/test-documents/mock_fakeload.xml     |   29 +
 tika-dl/pom.xml                                    |   10 +-
 tika-eval/pom.xml                                  |   33 +-
 .../java/org/apache/tika/eval/ExtractComparer.java |    2 +-
 .../java/org/apache/tika/eval/FileProfiler.java    |   25 +-
 .../src/main/resources/lucene-char-mapping.txt     |    3 +-
 .../resources/tika-eval-file-profiler-config.xml   |   10 +-
 tika-example/pom.xml                               |    4 +-
 tika-fuzzing/pom.xml                               |    2 +-
 tika-java7/pom.xml                                 |    2 +-
 tika-langdetect/pom.xml                            |    6 +-
 tika-nlp/pom.xml                                   |    6 +-
 tika-parent/pom.xml                                | 1103 ++++++++++----------
 tika-parsers/pom.xml                               |   37 +-
 .../apache/tika/parser/asm/XHTMLClassVisitor.java  |   17 +-
 .../java/org/apache/tika/parser/chm/ChmParser.java |    5 +-
 .../org/apache/tika/parser/crypto/Pkcs7Parser.java |    4 +-
 .../org/apache/tika/parser/crypto/TSDParser.java   |   11 +-
 .../tika/parser/microsoft/JackcessExtractor.java   |    7 +
 .../tika/parser/microsoft/JackcessParser.java      |    2 +-
 .../microsoft/ooxml/AbstractOOXMLExtractor.java    |   11 +-
 .../microsoft/ooxml/OOXMLExtractorFactory.java     |    9 +-
 .../microsoft/ooxml/OOXMLTikaBodyPartHandler.java  |  356 +++----
 .../ooxml/OOXMLWordAndPowerPointTextHandler.java   |   50 +-
 .../ooxml/XSSFExcelExtractorDecorator.java         |   11 +-
 .../microsoft/ooxml/xps/XPSExtractorDecorator.java |    5 +
 .../ooxml/xwpf/XWPFEventBasedWordExtractor.java    |    9 +
 .../microsoft/xml/AbstractXML2003Parser.java       |    5 +-
 .../java/org/apache/tika/parser/mp3/Mp3Parser.java |   13 +-
 .../java/org/apache/tika/parser/mp4/MP4Parser.java |   26 +-
 .../parser/odf/OpenDocumentManifestHandler.java    |   54 +
 .../apache/tika/parser/odf/OpenDocumentParser.java |  110 +-
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |   77 +-
 .../java/org/apache/tika/parser/pdf/PDF2XHTML.java |    2 +
 .../java/org/apache/tika/parser/pdf/PDFParser.java |    2 +-
 .../org/apache/tika/parser/pkg/PackageParser.java  |   59 +-
 .../parser/pkg/StreamingZipContainerDetector.java  |  171 +--
 .../tika/parser/pkg/ZipContainerDetector.java      |   18 +-
 .../org/apache/tika/parser/utils/ZipSalvager.java  |   96 +-
 .../tika/config/TikaEncodingDetectorTest.java      |    6 +-
 .../tika/detect/TestContainerAwareDetector.java    |    2 +-
 .../tika/parser/RecursiveParserWrapperTest.java    |   15 +-
 .../parser/microsoft/ooxml/xps/XPSParserTest.java  |   45 +
 .../org/apache/tika/parser/mp3/Mp3ParserTest.java  |    6 +-
 .../org/apache/tika/parser/odf/ODFParserTest.java  |   68 ++
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |   11 +
 .../apache/tika/parser/pkg/PackageParserTest.java  |   13 +-
 .../tika/parser/pkg/ZipContainerDetectorTest.java  |    2 +-
 .../src/test/resources/test-documents/gbk.zip      |  Bin 0 -> 432 bytes
 .../resources/test-documents/testODTEncrypted.odt  |  Bin 0 -> 12714 bytes
 .../testPDF_deeplyEmbeddedAttachments.pdf          |  Bin 0 -> 122221 bytes
 .../test-documents/testXPSWithDataDescriptor.xps   |  Bin 0 -> 44523 bytes
 .../test-documents/testXPSWithDataDescriptor2.xps  |  Bin 0 -> 51175 bytes
 tika-serialization/pom.xml                         |    2 +-
 tika-server/pom.xml                                |   88 +-
 .../tika/server/ProduceTypeResourceComparator.java |  145 +++
 .../java/org/apache/tika/server/TikaServerCli.java |   35 +-
 .../org/apache/tika/server/mbean/MBeanHelper.java  |   60 ++
 .../tika/server/mbean/ServerStatusExporter.java    |   83 ++
 .../server/mbean/ServerStatusExporterMBean.java    |   61 ++
 .../apache/tika/server/metrics/Log4JMetrics.java   |  212 ++++
 .../apache/tika/server/metrics/MetricsHelper.java  |  220 ++++
 .../tika/server/metrics/MetricsResource.java       |   53 +
 .../tika/server/metrics/ServerStatusMetrics.java   |   61 ++
 .../server/resource/RecursiveMetadataResource.java |    8 +-
 .../apache/tika/server/resource/TikaResource.java  |  204 +++-
 .../tika/server/resource/TikaServerStatus.java     |   21 +-
 .../java/org/apache/tika/server/CXFTestBase.java   |   20 +-
 .../apache/tika/server/MetricsResourceTest.java    |  118 +++
 .../tika/server/RecursiveMetadataResourceTest.java |  143 ++-
 .../org/apache/tika/server/StackTraceTest.java     |   17 +-
 .../server/TikaResourceMetadataFilterTest.java     |   83 ++
 .../tika/server/TikaResourceNoStackTest.java       |   98 ++
 .../org/apache/tika/server/TikaResourceTest.java   |  322 +++++-
 .../resources/configs/metadata-filter-include.xml  |   30 +
 .../src/test/resources/mock/hello_world.xml        |   26 +
 .../src/test/resources/mock/hello_world_long.xml   |   30 +
 tika-translate/pom.xml                             |    8 +-
 .../tika/language/translate/RTGTranslator.java     |  143 +++
 .../org.apache.tika.language.translate.Translator  |    1 +
 .../tika/language/translate/RTGTranslatorTest.java |   64 ++
 tika-xmp/pom.xml                                   |    2 +-
 107 files changed, 4519 insertions(+), 1423 deletions(-)

diff --cc tika-parent/pom.xml
index baccf43,7d3cb93..71a5928
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@@ -20,337 -20,338 +20,338 @@@
  -->
  
  <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
-   <modelVersion>4.0.0</modelVersion>
+     <modelVersion>4.0.0</modelVersion>
  
-   <parent>
-     <groupId>org.apache</groupId>
-     <artifactId>apache</artifactId>
-     <version>17</version>
-     <relativePath />
-   </parent>
+     <parent>
+         <groupId>org.apache</groupId>
+         <artifactId>apache</artifactId>
+         <version>17</version>
+         <relativePath />
+     </parent>
  
-   <groupId>org.apache.tika</groupId>
-   <artifactId>tika-parent</artifactId>
-   <version>1.26-SNAPSHOT</version>
-   <packaging>pom</packaging>
+     <groupId>org.apache.tika</groupId>
+     <artifactId>tika-parent</artifactId>
+     <version>1.27-SNAPSHOT</version>
+     <packaging>pom</packaging>
  
-   <name>Apache Tika parent</name>
-   <description>
-     Apache Tika is a toolkit for detecting and extracting metadata and
-     structured text content from various documents using existing parser
-     libraries.
-   </description>
-   <inceptionYear>2007</inceptionYear>
+     <name>Apache Tika parent</name>
+     <description>
+         Apache Tika is a toolkit for detecting and extracting metadata and
+         structured text content from various documents using existing parser
+         libraries.
+     </description>
+     <inceptionYear>2007</inceptionYear>
  
-   <url>http://tika.apache.org/</url>
+     <url>http://tika.apache.org/</url>
  
-   <issueManagement>
-     <system>JIRA</system>
-     <url>https://issues.apache.org/jira/browse/TIKA</url>
-   </issueManagement>
+     <issueManagement>
+         <system>JIRA</system>
+         <url>https://issues.apache.org/jira/browse/TIKA</url>
+     </issueManagement>
  
-   <mailingLists>
-     <mailingList>
-       <name>Development mailing list</name>
-       <subscribe>dev-subscribe@tika.apache.org</subscribe>
-       <unsubscribe>dev-unsubscribe@tika.apache.org</unsubscribe>
-       <post>dev@tika.apache.org</post>
-       <archive>https://lists.apache.org/list.html?dev@tika.apache.org</archive>
-       <otherArchives>
-         <otherArchive>http://mail-archives.apache.org/mod_mbox/tika-dev/</otherArchive>
-         <otherArchive>http://mail-archives.apache.org/mod_mbox/lucene-tika-dev</otherArchive>
-         <otherArchive>http://www.mail-archive.com/dev@tika.apache.org</otherArchive>
-         <otherArchive>http://www.mail-archive.com/tika-dev@lucene.apache.org/</otherArchive>
-         <otherArchive>http://www.mail-archive.com/tika-dev@incubator.apache.org/</otherArchive>
-         <otherArchive>http://www.nabble.com/Apache-Tika---Development-f20913.html</otherArchive>
-         <otherArchive>http://news.gmane.org/gmane.comp.apache.tika.devel</otherArchive>
-         <otherArchive>http://tika.markmail.org/</otherArchive>
-       </otherArchives>
-     </mailingList>
-     <mailingList>
-       <name>Commit mailing list</name>
-       <subscribe>commits-subscribe@tika.apache.org</subscribe>
-       <unsubscribe>commits-unsubscribe@tika.apache.org</unsubscribe>
-       <post>commits@tika.apache.org</post>
-       <archive>https://lists.apache.org/list.html?commits@tika.apache.org</archive>
-       <otherArchives>
-         <otherArchive>http://mail-archives.apache.org/mod_mbox/tika-commits/</otherArchive>
-         <otherArchive>http://mail-archives.apache.org/mod_mbox/lucene-tika-commits/</otherArchive>
-         <otherArchive>http://www.mail-archive.com/tika-commits@lucene.apache.org/</otherArchive>
-         <otherArchive>http://www.mail-archive.com/tika-commits@incubator.apache.org/</otherArchive>
-       </otherArchives>
-     </mailingList>
-     <mailingList>
-       <name>User mailing list</name>
-       <subscribe>user-subscribe@tika.apache.org</subscribe>
-       <unsubscribe>user-unsubscribe@tika.apache.org</unsubscribe>
-       <post>user@tika.apache.org</post>
-       <archive>https://lists.apache.org/list.html?user@tika.apache.org</archive>
-       <otherArchives>
-         <otherArchive>http://mail-archives.apache.org/mod_mbox/tika-user/</otherArchive>
-         <otherArchive>http://mail-archives.apache.org/mod_mbox/lucene-tika-user/</otherArchive>
-         <otherArchive>http://www.mail-archive.com/tika-user@lucene.apache.org/</otherArchive>
-       </otherArchives>
-     </mailingList>
-   </mailingLists>
+     <mailingLists>
+         <mailingList>
+             <name>Development mailing list</name>
+             <subscribe>dev-subscribe@tika.apache.org</subscribe>
+             <unsubscribe>dev-unsubscribe@tika.apache.org</unsubscribe>
+             <post>dev@tika.apache.org</post>
+             <archive>https://lists.apache.org/list.html?dev@tika.apache.org</archive>
+             <otherArchives>
+                 <otherArchive>http://mail-archives.apache.org/mod_mbox/tika-dev/</otherArchive>
+                 <otherArchive>http://mail-archives.apache.org/mod_mbox/lucene-tika-dev</otherArchive>
+                 <otherArchive>http://www.mail-archive.com/dev@tika.apache.org</otherArchive>
+                 <otherArchive>http://www.mail-archive.com/tika-dev@lucene.apache.org/</otherArchive>
+                 <otherArchive>http://www.mail-archive.com/tika-dev@incubator.apache.org/</otherArchive>
+                 <otherArchive>http://www.nabble.com/Apache-Tika---Development-f20913.html</otherArchive>
+                 <otherArchive>http://news.gmane.org/gmane.comp.apache.tika.devel</otherArchive>
+                 <otherArchive>http://tika.markmail.org/</otherArchive>
+             </otherArchives>
+         </mailingList>
+         <mailingList>
+             <name>Commit mailing list</name>
+             <subscribe>commits-subscribe@tika.apache.org</subscribe>
+             <unsubscribe>commits-unsubscribe@tika.apache.org</unsubscribe>
+             <post>commits@tika.apache.org</post>
+             <archive>https://lists.apache.org/list.html?commits@tika.apache.org</archive>
+             <otherArchives>
+                 <otherArchive>http://mail-archives.apache.org/mod_mbox/tika-commits/</otherArchive>
+                 <otherArchive>http://mail-archives.apache.org/mod_mbox/lucene-tika-commits/</otherArchive>
+                 <otherArchive>http://www.mail-archive.com/tika-commits@lucene.apache.org/</otherArchive>
+                 <otherArchive>http://www.mail-archive.com/tika-commits@incubator.apache.org/</otherArchive>
+             </otherArchives>
+         </mailingList>
+         <mailingList>
+             <name>User mailing list</name>
+             <subscribe>user-subscribe@tika.apache.org</subscribe>
+             <unsubscribe>user-unsubscribe@tika.apache.org</unsubscribe>
+             <post>user@tika.apache.org</post>
+             <archive>https://lists.apache.org/list.html?user@tika.apache.org</archive>
+             <otherArchives>
+                 <otherArchive>http://mail-archives.apache.org/mod_mbox/tika-user/</otherArchive>
+                 <otherArchive>http://mail-archives.apache.org/mod_mbox/lucene-tika-user/</otherArchive>
+                 <otherArchive>http://www.mail-archive.com/tika-user@lucene.apache.org/</otherArchive>
+             </otherArchives>
+         </mailingList>
+     </mailingLists>
  
-   <developers>
-     <developer>
-       <name>Rida Benjelloun</name>
-       <id>ridabenjelloun</id>
-       <email>ridabenjelloun@apache.org</email>
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-     <developer>
-       <name>Keith Bennett</name>
-       <id>kbennett</id>
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-     <developer>
-       <name>Mark Harwood</name>
-       <id>mharwood</id>
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-     <developer>
-       <name>Ken Krugler</name>
-       <id>kkrugler</id>
-       <email>kkrugler@apache.org</email>
-       <url>http://ken-blog.krugler.org</url>
-       <organization>Scale Unlimited</organization>
-       <organizationUrl>http://www.scaleunlimited.com</organizationUrl>
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-     <developer>
-       <name>Chris A. Mattmann</name>
-       <id>mattmann</id>
-       <email>mattmann@apache.org</email>
-       <url>http://people.apache.org/~mattmann/</url>
-       <organization>NASA Jet Propulsion Laboratory</organization>
-       <organizationUrl>http://www.jpl.nasa.gov</organizationUrl>
-       <timezone>-8</timezone>
-       <properties />
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-     <developer>
-       <name>Michael McCandless</name>
-       <id>mikemccand</id>
-       <email>mikemccand@apache.org</email>
-       <organization>IBM</organization>
-       <properties />
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-     <developer>
-       <name>Dave Meikle</name>
-       <id>dmeikle</id>
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-     <developer>
-       <name>Sami Siren</name>
-       <id>siren</id>
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-     <developer>
-       <name>Jukka Zitting</name>
-       <id>jukka</id>
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-     <developer>
-       <name>Nick Burch</name>
-       <id>nick</id>
-       <organization>Alfresco</organization>
-       <organizationUrl>http://alfresco.com</organizationUrl>
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-     <developer>
-       <name>Maxim Valyanskiy</name>
-       <id>maxcom</id>
-       <organization>Jet Infosystems</organization>
-       <roles>
-         <role>committer</role>
-       </roles>
-       <timezone>+3</timezone>
-     </developer>
-     <developer>
-       <name>Oleg Tikhonov</name>
-       <id>oleg</id>
-       <roles>
-         <role>committer</role>
-       </roles>
-       <timezone>+2</timezone>
-     </developer>
-     <developer>
-       <name>Ray Gauss II</name>
-       <id>rgauss</id>
-       <organization>Alfresco</organization>
-       <organizationUrl>http://alfresco.com</organizationUrl>
-       <timezone>-5</timezone>
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-     <developer>
-       <name>Tyler Palsulich</name>
-       <id>tpalsulich</id>
-       <timezone>-8</timezone>
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-     <developer>
-       <name>Tim Allison</name>
-       <id>tallison</id>
-       <timezone>-5</timezone>
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-     <developer>
-       <name>Konstantin Gribov</name>
-       <id>grossws</id>
-       <timezone>+3</timezone>
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-   </developers>
-   <contributors>
-     <contributor>
-       <name>Doug Cutting</name>
-       <roles>
-         <role>mentor</role>
-       </roles>
-     </contributor>
-     <contributor>
-       <name>Bertrand Delacretaz</name>
-       <roles>
-         <role>mentor</role>
-       </roles>
-     </contributor>
-     <contributor>
-       <name>Niall Pemberton</name>
-       <roles>
-         <role>emeritus</role>
-       </roles>
-     </contributor>
-   </contributors>
+     <developers>
+         <developer>
+             <name>Rida Benjelloun</name>
+             <id>ridabenjelloun</id>
+             <email>ridabenjelloun@apache.org</email>
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+         <developer>
+             <name>Keith Bennett</name>
+             <id>kbennett</id>
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+         <developer>
+             <name>Mark Harwood</name>
+             <id>mharwood</id>
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+         <developer>
+             <name>Ken Krugler</name>
+             <id>kkrugler</id>
+             <email>kkrugler@apache.org</email>
+             <url>http://ken-blog.krugler.org</url>
+             <organization>Scale Unlimited</organization>
+             <organizationUrl>http://www.scaleunlimited.com</organizationUrl>
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+         <developer>
+             <name>Chris A. Mattmann</name>
+             <id>mattmann</id>
+             <email>mattmann@apache.org</email>
+             <url>http://people.apache.org/~mattmann/</url>
+             <organization>NASA Jet Propulsion Laboratory</organization>
+             <organizationUrl>http://www.jpl.nasa.gov</organizationUrl>
+             <timezone>-8</timezone>
+             <properties />
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+         <developer>
+             <name>Michael McCandless</name>
+             <id>mikemccand</id>
+             <email>mikemccand@apache.org</email>
+             <organization>IBM</organization>
+             <properties />
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+         <developer>
+             <name>Dave Meikle</name>
+             <id>dmeikle</id>
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+         <developer>
+             <name>Sami Siren</name>
+             <id>siren</id>
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+         <developer>
+             <name>Jukka Zitting</name>
+             <id>jukka</id>
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+         <developer>
+             <name>Nick Burch</name>
+             <id>nick</id>
+             <organization>Alfresco</organization>
+             <organizationUrl>http://alfresco.com</organizationUrl>
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+         <developer>
+             <name>Maxim Valyanskiy</name>
+             <id>maxcom</id>
+             <organization>Jet Infosystems</organization>
+             <roles>
+                 <role>committer</role>
+             </roles>
+             <timezone>+3</timezone>
+         </developer>
+         <developer>
+             <name>Oleg Tikhonov</name>
+             <id>oleg</id>
+             <roles>
+                 <role>committer</role>
+             </roles>
+             <timezone>+2</timezone>
+         </developer>
+         <developer>
+             <name>Ray Gauss II</name>
+             <id>rgauss</id>
+             <organization>Alfresco</organization>
+             <organizationUrl>http://alfresco.com</organizationUrl>
+             <timezone>-5</timezone>
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+         <developer>
+             <name>Tyler Palsulich</name>
+             <id>tpalsulich</id>
+             <timezone>-8</timezone>
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+         <developer>
+             <name>Tim Allison</name>
+             <id>tallison</id>
+             <timezone>-5</timezone>
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+         <developer>
+             <name>Konstantin Gribov</name>
+             <id>grossws</id>
+             <timezone>+3</timezone>
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+     </developers>
+     <contributors>
+         <contributor>
+             <name>Doug Cutting</name>
+             <roles>
+                 <role>mentor</role>
+             </roles>
+         </contributor>
+         <contributor>
+             <name>Bertrand Delacretaz</name>
+             <roles>
+                 <role>mentor</role>
+             </roles>
+         </contributor>
+         <contributor>
+             <name>Niall Pemberton</name>
+             <roles>
+                 <role>emeritus</role>
+             </roles>
+         </contributor>
+     </contributors>
  
-   <dependencyManagement>
-     <dependencies>
-       <dependency>
-         <groupId>biz.aQute</groupId>
-         <artifactId>bndlib</artifactId>
-         <version>1.50.0</version>
-       </dependency>
-       <dependency>
-         <groupId>org.apache.felix</groupId>
-         <artifactId>org.apache.felix.scr.annotations</artifactId>
-         <version>1.12.0</version>
-       </dependency>
-       <dependency>
-         <groupId>junit</groupId>
-         <artifactId>junit</artifactId>
-         <version>4.13.1</version>
-         <scope>test</scope>
-       </dependency>
-       <dependency>
-         <groupId>org.slf4j</groupId>
-         <artifactId>slf4j-api</artifactId>
-         <version>${slf4j.version}</version>
-       </dependency>
-       <dependency>
-         <groupId>org.slf4j</groupId>
-         <artifactId>slf4j-log4j12</artifactId>
-         <version>${slf4j.version}</version>
-       </dependency>
-       <dependency>
-         <groupId>org.slf4j</groupId>
-         <artifactId>slf4j-simple</artifactId>
-         <version>${slf4j.version}</version>
-       </dependency>
-       <dependency>
-         <groupId>org.slf4j</groupId>
-         <artifactId>jcl-over-slf4j</artifactId>
-         <version>${slf4j.version}</version>
-       </dependency>
-       <dependency>
-         <groupId>org.slf4j</groupId>
-         <artifactId>jul-to-slf4j</artifactId>
-         <version>${slf4j.version}</version>
-       </dependency>
+     <dependencyManagement>
+         <dependencies>
+             <dependency>
+                 <groupId>biz.aQute</groupId>
+                 <artifactId>bndlib</artifactId>
+                 <version>1.50.0</version>
+             </dependency>
+             <dependency>
+                 <groupId>org.apache.felix</groupId>
+                 <artifactId>org.apache.felix.scr.annotations</artifactId>
+                 <version>1.12.0</version>
+             </dependency>
+             <dependency>
+                 <groupId>junit</groupId>
+                 <artifactId>junit</artifactId>
+                 <version>4.13.2</version>
+                 <scope>test</scope>
+             </dependency>
+             <dependency>
+                 <groupId>org.slf4j</groupId>
+                 <artifactId>slf4j-api</artifactId>
+                 <version>${slf4j.version}</version>
+             </dependency>
+             <dependency>
+                 <groupId>org.slf4j</groupId>
+                 <artifactId>slf4j-log4j12</artifactId>
+                 <version>${slf4j.version}</version>
+             </dependency>
+             <dependency>
+                 <groupId>org.slf4j</groupId>
+                 <artifactId>slf4j-simple</artifactId>
+                 <version>${slf4j.version}</version>
+             </dependency>
+             <dependency>
+                 <groupId>org.slf4j</groupId>
+                 <artifactId>jcl-over-slf4j</artifactId>
+                 <version>${slf4j.version}</version>
+             </dependency>
+             <dependency>
+                 <groupId>org.slf4j</groupId>
+                 <artifactId>jul-to-slf4j</artifactId>
+                 <version>${slf4j.version}</version>
+             </dependency>
  
-       <dependency>
-         <groupId>javax.annotation</groupId>
-         <artifactId>javax.annotation-api</artifactId>
-         <version>1.3.2</version>
-       </dependency>
-       <dependency>
-         <groupId>javax.xml.soap</groupId>
-         <artifactId>javax.xml.soap-api</artifactId>
-         <version>1.4.0</version>
-       </dependency>
-       <dependency>
-         <groupId>org.jvnet.staxex</groupId>
-         <artifactId>stax-ex</artifactId>
-         <version>1.8.3</version>
-       </dependency>
-     </dependencies>
-   </dependencyManagement>
+             <dependency>
+                 <groupId>javax.annotation</groupId>
+                 <artifactId>javax.annotation-api</artifactId>
+                 <version>1.3.2</version>
+             </dependency>
+             <dependency>
+                 <groupId>javax.xml.soap</groupId>
+                 <artifactId>javax.xml.soap-api</artifactId>
+                 <version>1.4.0</version>
+             </dependency>
+             <dependency>
+                 <groupId>org.jvnet.staxex</groupId>
+                 <artifactId>stax-ex</artifactId>
+                 <version>2.0.0</version>
+             </dependency>
+         </dependencies>
+     </dependencyManagement>
  
-   <properties>
-     <maven.compiler.source>1.8</maven.compiler.source>
-     <maven.compiler.target>1.8</maven.compiler.target>
-     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-     <project.reporting.outputEncoding>${project.build.sourceEncoding}</project.reporting.outputEncoding>
-     <!-- plugin versions -->
-     <forbiddenapis.version>3.1</forbiddenapis.version>
-     <groovy.maven.version>2.1.1</groovy.maven.version>
-     <maven.antrun.version>1.8</maven.antrun.version>
-     <maven.assembly.version>3.3.0</maven.assembly.version>
-     <maven.bundle.version>5.1.1</maven.bundle.version>
-     <maven.failsafe.version>2.22.2</maven.failsafe.version>
-     <maven.javadoc.version>3.1.1</maven.javadoc.version>
-     <maven.scr.version>1.26.4</maven.scr.version>
-     <maven.surefire.version>3.0.0-M4</maven.surefire.version>
-     <maven.shade.version>3.2.4</maven.shade.version>
-     <rat.version>0.13</rat.version>
-     <!-- NOTE: sync tukaani version with commons-compress in tika-parsers -->
-     <poi.version>5.0.0</poi.version>
-     <commons.compress.version>1.20</commons.compress.version>
-     <commons.io.version>2.8.0</commons.io.version>
-     <commons.lang3.version>3.11</commons.lang3.version>
-     <gson.version>2.8.6</gson.version>
-     <guava.version>30.1-jre</guava.version>
-     <osgi.core.version>6.0.0</osgi.core.version>
+     <properties>
+         <maven.compiler.source>1.8</maven.compiler.source>
+         <maven.compiler.target>1.8</maven.compiler.target>
+         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+         <project.reporting.outputEncoding>${project.build.sourceEncoding}</project.reporting.outputEncoding>
+         <!-- plugin versions -->
+         <forbiddenapis.version>3.1</forbiddenapis.version>
+         <groovy.maven.version>2.1.1</groovy.maven.version>
+         <maven.antrun.version>1.8</maven.antrun.version>
+         <maven.assembly.version>3.3.0</maven.assembly.version>
+         <maven.bundle.version>5.1.1</maven.bundle.version>
+         <maven.failsafe.version>2.22.2</maven.failsafe.version>
+         <maven.javadoc.version>3.1.1</maven.javadoc.version>
+         <maven.scr.version>1.26.4</maven.scr.version>
+         <maven.surefire.version>3.0.0-M4</maven.surefire.version>
+         <maven.shade.version>3.2.4</maven.shade.version>
+         <rat.version>0.13</rat.version>
+         <!-- NOTE: sync tukaani version with commons-compress in tika-parsers -->
 -        <poi.version>4.1.2</poi.version>
++        <poi.version>5.0.1-SNAPSHOT</poi.version>
+         <commons.compress.version>1.20</commons.compress.version>
+         <commons.io.version>2.8.0</commons.io.version>
+         <commons.lang3.version>3.12.0</commons.lang3.version>
+         <gson.version>2.8.6</gson.version>
+         <guava.version>30.1.1-jre</guava.version>
+         <osgi.core.version>6.0.0</osgi.core.version>
  
-     <cxf.version>3.4.2</cxf.version>
+     <cxf.version>3.4.3</cxf.version>
      <slf4j.version>1.7.30</slf4j.version>
-     <jackson.version>2.12.1</jackson.version>
+         <log4j.version>1.2.17</log4j.version>
+     <jackson.version>2.12.2</jackson.version>
      <!-- when this is next upgraded, see if we can get rid of
           javax.activation dependency in tika-server -->
-     <jaxb.version>2.3.3</jaxb.version>
+     <jaxb.version>3.0.0</jaxb.version>
      <cli.version>1.4</cli.version>
-     <lucene.version>8.7.0</lucene.version>
-     <mockito.version>3.7.7</mockito.version>
+     <lucene.version>8.8.1</lucene.version>
+     <mockito.version>3.8.0</mockito.version>
      <opennlp.version>1.9.3</opennlp.version>
    </properties>
  
diff --cc tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index e2dc17e,51b36c2..ddc607a
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@@ -22,8 -22,10 +22,9 @@@ import java.io.IOException
  import java.io.InputStream;
  import java.util.Locale;
  
+ import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
 -import org.apache.commons.io.input.CloseShieldInputStream;
  import org.apache.poi.ooxml.POIXMLDocument;
 -import org.apache.poi.ooxml.extractor.ExtractorFactory;
 +import org.apache.poi.ooxml.extractor.POIXMLExtractorFactory;
  import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
  import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
  import org.apache.poi.openxml4j.exceptions.InvalidOperationException;
@@@ -113,10 -112,10 +116,10 @@@ public class OOXMLExtractorFactory 
                                  true, false)) {
                      try {
                          pkg = OPCPackage.open(rereadableInputStream);
-                     } catch (EOFException e) {
 -                    } catch (EOFException|UnsupportedZipFeatureException e) {
++                    } catch (EOFException| UnsupportedZipFeatureException e) {
                          rereadableInputStream.rewind();
                          tmpRepairedCopy = File.createTempFile("tika-ooxml-repair-", "");
-                         ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy);
+                         ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy, false);
                          //if there isn't enough left to be opened as a package
                          //throw an exception -- we may want to fall back to streaming
                          //parsing