You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/06/29 14:10:08 UTC

[tika] 01/03: Merge remote-tracking branch 'origin/branch_1x' into TIKA-3164-1.x

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3164-1.x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit ab86545afec5cef3a05e546e4e667ea843a738ae
Merge: 7aa2732 ba72c86
Author: tallison <ta...@apache.org>
AuthorDate: Wed May 5 10:32:48 2021 -0400

    Merge remote-tracking branch 'origin/branch_1x' into TIKA-3164-1.x
    
    # Conflicts:
    #	tika-parent/pom.xml
    #	tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java

 CHANGES.txt                                        |   34 +-
 NOTICE.txt                                         |   24 +
 pom.xml                                            |    2 +-
 tika-app/pom.xml                                   |    2 +-
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |    2 +-
 .../src/main/java/org/apache/tika/gui/TikaGUI.java |    3 -
 tika-batch/pom.xml                                 |    2 +-
 .../batch/fs/RecursiveParserWrapperFSConsumer.java |    2 +-
 tika-bundle/pom.xml                                |    6 +-
 tika-core/pom.xml                                  |  377 ++++---
 .../main/assembly/test-jar-with-dependencies.xml   |   38 +
 .../apache/tika/exception/RuntimeSAXException.java |   29 +
 .../tika/exception/WriteLimitReachedException.java |   88 ++
 .../main/java/org/apache/tika/metadata/XMPDM.java  |    1 +
 .../org/apache/tika/parser/CompositeParser.java    |    4 +
 .../apache/tika/parser/RecursiveParserWrapper.java |  120 ++-
 .../sax/AbstractRecursiveParserWrapperHandler.java |   11 +-
 .../tika/sax/BasicContentHandlerFactory.java       |    3 +
 .../tika/sax/RecursiveParserWrapperHandler.java    |    9 +-
 .../org/apache/tika/sax/SecureContentHandler.java  |    2 +-
 .../org/apache/tika/mime/tika-mimetypes.xml        |    7 +
 .../src/test/java/org/apache/tika/TikaTest.java    |   15 +-
 .../tika/detect/FileCommandDetectorTest.java       |   40 +-
 .../org/apache/tika/parser/mock/MockParser.java    |  163 ++-
 .../apache/tika/parser/mock/MockParserTest.java    |   30 +
 .../resources/test-documents/mock_fakeload.xml     |   29 +
 tika-dl/pom.xml                                    |   10 +-
 tika-eval/pom.xml                                  |   33 +-
 .../java/org/apache/tika/eval/ExtractComparer.java |    2 +-
 .../java/org/apache/tika/eval/FileProfiler.java    |   25 +-
 .../src/main/resources/lucene-char-mapping.txt     |    3 +-
 .../resources/tika-eval-file-profiler-config.xml   |   10 +-
 tika-example/pom.xml                               |    4 +-
 tika-fuzzing/pom.xml                               |    2 +-
 tika-java7/pom.xml                                 |    2 +-
 tika-langdetect/pom.xml                            |    6 +-
 tika-nlp/pom.xml                                   |    6 +-
 tika-parent/pom.xml                                | 1103 ++++++++++----------
 tika-parsers/pom.xml                               |   37 +-
 .../apache/tika/parser/asm/XHTMLClassVisitor.java  |   17 +-
 .../java/org/apache/tika/parser/chm/ChmParser.java |    5 +-
 .../org/apache/tika/parser/crypto/Pkcs7Parser.java |    4 +-
 .../org/apache/tika/parser/crypto/TSDParser.java   |   11 +-
 .../tika/parser/microsoft/JackcessExtractor.java   |    7 +
 .../tika/parser/microsoft/JackcessParser.java      |    2 +-
 .../microsoft/ooxml/AbstractOOXMLExtractor.java    |   11 +-
 .../microsoft/ooxml/OOXMLExtractorFactory.java     |    9 +-
 .../microsoft/ooxml/OOXMLTikaBodyPartHandler.java  |  356 +++----
 .../ooxml/OOXMLWordAndPowerPointTextHandler.java   |   50 +-
 .../ooxml/XSSFExcelExtractorDecorator.java         |   11 +-
 .../microsoft/ooxml/xps/XPSExtractorDecorator.java |    5 +
 .../ooxml/xwpf/XWPFEventBasedWordExtractor.java    |    9 +
 .../microsoft/xml/AbstractXML2003Parser.java       |    5 +-
 .../java/org/apache/tika/parser/mp3/Mp3Parser.java |   13 +-
 .../java/org/apache/tika/parser/mp4/MP4Parser.java |   26 +-
 .../parser/odf/OpenDocumentManifestHandler.java    |   54 +
 .../apache/tika/parser/odf/OpenDocumentParser.java |  110 +-
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |   77 +-
 .../java/org/apache/tika/parser/pdf/PDF2XHTML.java |    2 +
 .../java/org/apache/tika/parser/pdf/PDFParser.java |    2 +-
 .../org/apache/tika/parser/pkg/PackageParser.java  |   59 +-
 .../parser/pkg/StreamingZipContainerDetector.java  |  171 +--
 .../tika/parser/pkg/ZipContainerDetector.java      |   18 +-
 .../org/apache/tika/parser/utils/ZipSalvager.java  |   96 +-
 .../tika/config/TikaEncodingDetectorTest.java      |    6 +-
 .../tika/detect/TestContainerAwareDetector.java    |    2 +-
 .../tika/parser/RecursiveParserWrapperTest.java    |   15 +-
 .../parser/microsoft/ooxml/xps/XPSParserTest.java  |   45 +
 .../org/apache/tika/parser/mp3/Mp3ParserTest.java  |    6 +-
 .../org/apache/tika/parser/odf/ODFParserTest.java  |   68 ++
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |   11 +
 .../apache/tika/parser/pkg/PackageParserTest.java  |   13 +-
 .../tika/parser/pkg/ZipContainerDetectorTest.java  |    2 +-
 .../src/test/resources/test-documents/gbk.zip      |  Bin 0 -> 432 bytes
 .../resources/test-documents/testODTEncrypted.odt  |  Bin 0 -> 12714 bytes
 .../testPDF_deeplyEmbeddedAttachments.pdf          |  Bin 0 -> 122221 bytes
 .../test-documents/testXPSWithDataDescriptor.xps   |  Bin 0 -> 44523 bytes
 .../test-documents/testXPSWithDataDescriptor2.xps  |  Bin 0 -> 51175 bytes
 tika-serialization/pom.xml                         |    2 +-
 tika-server/pom.xml                                |   88 +-
 .../tika/server/ProduceTypeResourceComparator.java |  145 +++
 .../java/org/apache/tika/server/TikaServerCli.java |   35 +-
 .../org/apache/tika/server/mbean/MBeanHelper.java  |   60 ++
 .../tika/server/mbean/ServerStatusExporter.java    |   83 ++
 .../server/mbean/ServerStatusExporterMBean.java    |   61 ++
 .../apache/tika/server/metrics/Log4JMetrics.java   |  212 ++++
 .../apache/tika/server/metrics/MetricsHelper.java  |  220 ++++
 .../tika/server/metrics/MetricsResource.java       |   53 +
 .../tika/server/metrics/ServerStatusMetrics.java   |   61 ++
 .../server/resource/RecursiveMetadataResource.java |    8 +-
 .../apache/tika/server/resource/TikaResource.java  |  204 +++-
 .../tika/server/resource/TikaServerStatus.java     |   21 +-
 .../java/org/apache/tika/server/CXFTestBase.java   |   20 +-
 .../apache/tika/server/MetricsResourceTest.java    |  118 +++
 .../tika/server/RecursiveMetadataResourceTest.java |  143 ++-
 .../org/apache/tika/server/StackTraceTest.java     |   17 +-
 .../server/TikaResourceMetadataFilterTest.java     |   83 ++
 .../tika/server/TikaResourceNoStackTest.java       |   98 ++
 .../org/apache/tika/server/TikaResourceTest.java   |  322 +++++-
 .../resources/configs/metadata-filter-include.xml  |   30 +
 .../src/test/resources/mock/hello_world.xml        |   26 +
 .../src/test/resources/mock/hello_world_long.xml   |   30 +
 tika-translate/pom.xml                             |    8 +-
 .../tika/language/translate/RTGTranslator.java     |  143 +++
 .../org.apache.tika.language.translate.Translator  |    1 +
 .../tika/language/translate/RTGTranslatorTest.java |   64 ++
 tika-xmp/pom.xml                                   |    2 +-
 107 files changed, 4519 insertions(+), 1423 deletions(-)

diff --cc tika-parent/pom.xml
index baccf43,7d3cb93..71a5928
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@@ -20,337 -20,338 +20,338 @@@
  -->
  
  <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
-   <modelVersion>4.0.0</modelVersion>
+     <modelVersion>4.0.0</modelVersion>
  
-   <parent>
-     <groupId>org.apache</groupId>
-     <artifactId>apache</artifactId>
-     <version>17</version>
-     <relativePath />
-   </parent>
+     <parent>
+         <groupId>org.apache</groupId>
+         <artifactId>apache</artifactId>
+         <version>17</version>
+         <relativePath />
+     </parent>
  
-   <groupId>org.apache.tika</groupId>
-   <artifactId>tika-parent</artifactId>
-   <version>1.26-SNAPSHOT</version>
-   <packaging>pom</packaging>
+     <groupId>org.apache.tika</groupId>
+     <artifactId>tika-parent</artifactId>
+     <version>1.27-SNAPSHOT</version>
+     <packaging>pom</packaging>
  
-   <name>Apache Tika parent</name>
-   <description>
-     Apache Tika is a toolkit for detecting and extracting metadata and
-     structured text content from various documents using existing parser
-     libraries.
-   </description>
-   <inceptionYear>2007</inceptionYear>
+     <name>Apache Tika parent</name>
+     <description>
+         Apache Tika is a toolkit for detecting and extracting metadata and
+         structured text content from various documents using existing parser
+         libraries.
+     </description>
+     <inceptionYear>2007</inceptionYear>
  
-   <url>http://tika.apache.org/</url>
+     <url>http://tika.apache.org/</url>
  
-   <issueManagement>
-     <system>JIRA</system>
-     <url>https://issues.apache.org/jira/browse/TIKA</url>
-   </issueManagement>
+     <issueManagement>
+         <system>JIRA</system>
+         <url>https://issues.apache.org/jira/browse/TIKA</url>
+     </issueManagement>
  
-   <mailingLists>
-     <mailingList>
-       <name>Development mailing list</name>
-       <subscribe>dev-subscribe@tika.apache.org</subscribe>
-       <unsubscribe>dev-unsubscribe@tika.apache.org</unsubscribe>
-       <post>dev@tika.apache.org</post>
-       <archive>https://lists.apache.org/list.html?dev@tika.apache.org</archive>
-       <otherArchives>
-         <otherArchive>http://mail-archives.apache.org/mod_mbox/tika-dev/</otherArchive>
-         <otherArchive>http://mail-archives.apache.org/mod_mbox/lucene-tika-dev</otherArchive>
-         <otherArchive>http://www.mail-archive.com/dev@tika.apache.org</otherArchive>
-         <otherArchive>http://www.mail-archive.com/tika-dev@lucene.apache.org/</otherArchive>
-         <otherArchive>http://www.mail-archive.com/tika-dev@incubator.apache.org/</otherArchive>
-         <otherArchive>http://www.nabble.com/Apache-Tika---Development-f20913.html</otherArchive>
-         <otherArchive>http://news.gmane.org/gmane.comp.apache.tika.devel</otherArchive>
-         <otherArchive>http://tika.markmail.org/</otherArchive>
-       </otherArchives>
-     </mailingList>
-     <mailingList>
-       <name>Commit mailing list</name>
-       <subscribe>commits-subscribe@tika.apache.org</subscribe>
-       <unsubscribe>commits-unsubscribe@tika.apache.org</unsubscribe>
-       <post>commits@tika.apache.org</post>
-       <archive>https://lists.apache.org/list.html?commits@tika.apache.org</archive>
-       <otherArchives>
-         <otherArchive>http://mail-archives.apache.org/mod_mbox/tika-commits/</otherArchive>
-         <otherArchive>http://mail-archives.apache.org/mod_mbox/lucene-tika-commits/</otherArchive>
-         <otherArchive>http://www.mail-archive.com/tika-commits@lucene.apache.org/</otherArchive>
-         <otherArchive>http://www.mail-archive.com/tika-commits@incubator.apache.org/</otherArchive>
-       </otherArchives>
-     </mailingList>
-     <mailingList>
-       <name>User mailing list</name>
-       <subscribe>user-subscribe@tika.apache.org</subscribe>
-       <unsubscribe>user-unsubscribe@tika.apache.org</unsubscribe>
-       <post>user@tika.apache.org</post>
-       <archive>https://lists.apache.org/list.html?user@tika.apache.org</archive>
-       <otherArchives>
-         <otherArchive>http://mail-archives.apache.org/mod_mbox/tika-user/</otherArchive>
-         <otherArchive>http://mail-archives.apache.org/mod_mbox/lucene-tika-user/</otherArchive>
-         <otherArchive>http://www.mail-archive.com/tika-user@lucene.apache.org/</otherArchive>
-       </otherArchives>
-     </mailingList>
-   </mailingLists>
+     <mailingLists>
+         <mailingList>
+             <name>Development mailing list</name>
+             <subscribe>dev-subscribe@tika.apache.org</subscribe>
+             <unsubscribe>dev-unsubscribe@tika.apache.org</unsubscribe>
+             <post>dev@tika.apache.org</post>
+             <archive>https://lists.apache.org/list.html?dev@tika.apache.org</archive>
+             <otherArchives>
+                 <otherArchive>http://mail-archives.apache.org/mod_mbox/tika-dev/</otherArchive>
+                 <otherArchive>http://mail-archives.apache.org/mod_mbox/lucene-tika-dev</otherArchive>
+                 <otherArchive>http://www.mail-archive.com/dev@tika.apache.org</otherArchive>
+                 <otherArchive>http://www.mail-archive.com/tika-dev@lucene.apache.org/</otherArchive>
+                 <otherArchive>http://www.mail-archive.com/tika-dev@incubator.apache.org/</otherArchive>
+                 <otherArchive>http://www.nabble.com/Apache-Tika---Development-f20913.html</otherArchive>
+                 <otherArchive>http://news.gmane.org/gmane.comp.apache.tika.devel</otherArchive>
+                 <otherArchive>http://tika.markmail.org/</otherArchive>
+             </otherArchives>
+         </mailingList>
+         <mailingList>
+             <name>Commit mailing list</name>
+             <subscribe>commits-subscribe@tika.apache.org</subscribe>
+             <unsubscribe>commits-unsubscribe@tika.apache.org</unsubscribe>
+             <post>commits@tika.apache.org</post>
+             <archive>https://lists.apache.org/list.html?commits@tika.apache.org</archive>
+             <otherArchives>
+                 <otherArchive>http://mail-archives.apache.org/mod_mbox/tika-commits/</otherArchive>
+                 <otherArchive>http://mail-archives.apache.org/mod_mbox/lucene-tika-commits/</otherArchive>
+                 <otherArchive>http://www.mail-archive.com/tika-commits@lucene.apache.org/</otherArchive>
+                 <otherArchive>http://www.mail-archive.com/tika-commits@incubator.apache.org/</otherArchive>
+             </otherArchives>
+         </mailingList>
+         <mailingList>
+             <name>User mailing list</name>
+             <subscribe>user-subscribe@tika.apache.org</subscribe>
+             <unsubscribe>user-unsubscribe@tika.apache.org</unsubscribe>
+             <post>user@tika.apache.org</post>
+             <archive>https://lists.apache.org/list.html?user@tika.apache.org</archive>
+             <otherArchives>
+                 <otherArchive>http://mail-archives.apache.org/mod_mbox/tika-user/</otherArchive>
+                 <otherArchive>http://mail-archives.apache.org/mod_mbox/lucene-tika-user/</otherArchive>
+                 <otherArchive>http://www.mail-archive.com/tika-user@lucene.apache.org/</otherArchive>
+             </otherArchives>
+         </mailingList>
+     </mailingLists>
  
-   <developers>
-     <developer>
-       <name>Rida Benjelloun</name>
-       <id>ridabenjelloun</id>
-       <email>ridabenjelloun@apache.org</email>
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-     <developer>
-       <name>Keith Bennett</name>
-       <id>kbennett</id>
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-     <developer>
-       <name>Mark Harwood</name>
-       <id>mharwood</id>
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-     <developer>
-       <name>Ken Krugler</name>
-       <id>kkrugler</id>
-       <email>kkrugler@apache.org</email>
-       <url>http://ken-blog.krugler.org</url>
-       <organization>Scale Unlimited</organization>
-       <organizationUrl>http://www.scaleunlimited.com</organizationUrl>
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-     <developer>
-       <name>Chris A. Mattmann</name>
-       <id>mattmann</id>
-       <email>mattmann@apache.org</email>
-       <url>http://people.apache.org/~mattmann/</url>
-       <organization>NASA Jet Propulsion Laboratory</organization>
-       <organizationUrl>http://www.jpl.nasa.gov</organizationUrl>
-       <timezone>-8</timezone>
-       <properties />
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-     <developer>
-       <name>Michael McCandless</name>
-       <id>mikemccand</id>
-       <email>mikemccand@apache.org</email>
-       <organization>IBM</organization>
-       <properties />
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-     <developer>
-       <name>Dave Meikle</name>
-       <id>dmeikle</id>
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-     <developer>
-       <name>Sami Siren</name>
-       <id>siren</id>
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-     <developer>
-       <name>Jukka Zitting</name>
-       <id>jukka</id>
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-     <developer>
-       <name>Nick Burch</name>
-       <id>nick</id>
-       <organization>Alfresco</organization>
-       <organizationUrl>http://alfresco.com</organizationUrl>
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-     <developer>
-       <name>Maxim Valyanskiy</name>
-       <id>maxcom</id>
-       <organization>Jet Infosystems</organization>
-       <roles>
-         <role>committer</role>
-       </roles>
-       <timezone>+3</timezone>
-     </developer>
-     <developer>
-       <name>Oleg Tikhonov</name>
-       <id>oleg</id>
-       <roles>
-         <role>committer</role>
-       </roles>
-       <timezone>+2</timezone>
-     </developer>
-     <developer>
-       <name>Ray Gauss II</name>
-       <id>rgauss</id>
-       <organization>Alfresco</organization>
-       <organizationUrl>http://alfresco.com</organizationUrl>
-       <timezone>-5</timezone>
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-     <developer>
-       <name>Tyler Palsulich</name>
-       <id>tpalsulich</id>
-       <timezone>-8</timezone>
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-     <developer>
-       <name>Tim Allison</name>
-       <id>tallison</id>
-       <timezone>-5</timezone>
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-     <developer>
-       <name>Konstantin Gribov</name>
-       <id>grossws</id>
-       <timezone>+3</timezone>
-       <roles>
-         <role>committer</role>
-       </roles>
-     </developer>
-   </developers>
-   <contributors>
-     <contributor>
-       <name>Doug Cutting</name>
-       <roles>
-         <role>mentor</role>
-       </roles>
-     </contributor>
-     <contributor>
-       <name>Bertrand Delacretaz</name>
-       <roles>
-         <role>mentor</role>
-       </roles>
-     </contributor>
-     <contributor>
-       <name>Niall Pemberton</name>
-       <roles>
-         <role>emeritus</role>
-       </roles>
-     </contributor>
-   </contributors>
+     <developers>
+         <developer>
+             <name>Rida Benjelloun</name>
+             <id>ridabenjelloun</id>
+             <email>ridabenjelloun@apache.org</email>
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+         <developer>
+             <name>Keith Bennett</name>
+             <id>kbennett</id>
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+         <developer>
+             <name>Mark Harwood</name>
+             <id>mharwood</id>
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+         <developer>
+             <name>Ken Krugler</name>
+             <id>kkrugler</id>
+             <email>kkrugler@apache.org</email>
+             <url>http://ken-blog.krugler.org</url>
+             <organization>Scale Unlimited</organization>
+             <organizationUrl>http://www.scaleunlimited.com</organizationUrl>
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+         <developer>
+             <name>Chris A. Mattmann</name>
+             <id>mattmann</id>
+             <email>mattmann@apache.org</email>
+             <url>http://people.apache.org/~mattmann/</url>
+             <organization>NASA Jet Propulsion Laboratory</organization>
+             <organizationUrl>http://www.jpl.nasa.gov</organizationUrl>
+             <timezone>-8</timezone>
+             <properties />
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+         <developer>
+             <name>Michael McCandless</name>
+             <id>mikemccand</id>
+             <email>mikemccand@apache.org</email>
+             <organization>IBM</organization>
+             <properties />
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+         <developer>
+             <name>Dave Meikle</name>
+             <id>dmeikle</id>
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+         <developer>
+             <name>Sami Siren</name>
+             <id>siren</id>
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+         <developer>
+             <name>Jukka Zitting</name>
+             <id>jukka</id>
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+         <developer>
+             <name>Nick Burch</name>
+             <id>nick</id>
+             <organization>Alfresco</organization>
+             <organizationUrl>http://alfresco.com</organizationUrl>
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+         <developer>
+             <name>Maxim Valyanskiy</name>
+             <id>maxcom</id>
+             <organization>Jet Infosystems</organization>
+             <roles>
+                 <role>committer</role>
+             </roles>
+             <timezone>+3</timezone>
+         </developer>
+         <developer>
+             <name>Oleg Tikhonov</name>
+             <id>oleg</id>
+             <roles>
+                 <role>committer</role>
+             </roles>
+             <timezone>+2</timezone>
+         </developer>
+         <developer>
+             <name>Ray Gauss II</name>
+             <id>rgauss</id>
+             <organization>Alfresco</organization>
+             <organizationUrl>http://alfresco.com</organizationUrl>
+             <timezone>-5</timezone>
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+         <developer>
+             <name>Tyler Palsulich</name>
+             <id>tpalsulich</id>
+             <timezone>-8</timezone>
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+         <developer>
+             <name>Tim Allison</name>
+             <id>tallison</id>
+             <timezone>-5</timezone>
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+         <developer>
+             <name>Konstantin Gribov</name>
+             <id>grossws</id>
+             <timezone>+3</timezone>
+             <roles>
+                 <role>committer</role>
+             </roles>
+         </developer>
+     </developers>
+     <contributors>
+         <contributor>
+             <name>Doug Cutting</name>
+             <roles>
+                 <role>mentor</role>
+             </roles>
+         </contributor>
+         <contributor>
+             <name>Bertrand Delacretaz</name>
+             <roles>
+                 <role>mentor</role>
+             </roles>
+         </contributor>
+         <contributor>
+             <name>Niall Pemberton</name>
+             <roles>
+                 <role>emeritus</role>
+             </roles>
+         </contributor>
+     </contributors>
  
-   <dependencyManagement>
-     <dependencies>
-       <dependency>
-         <groupId>biz.aQute</groupId>
-         <artifactId>bndlib</artifactId>
-         <version>1.50.0</version>
-       </dependency>
-       <dependency>
-         <groupId>org.apache.felix</groupId>
-         <artifactId>org.apache.felix.scr.annotations</artifactId>
-         <version>1.12.0</version>
-       </dependency>
-       <dependency>
-         <groupId>junit</groupId>
-         <artifactId>junit</artifactId>
-         <version>4.13.1</version>
-         <scope>test</scope>
-       </dependency>
-       <dependency>
-         <groupId>org.slf4j</groupId>
-         <artifactId>slf4j-api</artifactId>
-         <version>${slf4j.version}</version>
-       </dependency>
-       <dependency>
-         <groupId>org.slf4j</groupId>
-         <artifactId>slf4j-log4j12</artifactId>
-         <version>${slf4j.version}</version>
-       </dependency>
-       <dependency>
-         <groupId>org.slf4j</groupId>
-         <artifactId>slf4j-simple</artifactId>
-         <version>${slf4j.version}</version>
-       </dependency>
-       <dependency>
-         <groupId>org.slf4j</groupId>
-         <artifactId>jcl-over-slf4j</artifactId>
-         <version>${slf4j.version}</version>
-       </dependency>
-       <dependency>
-         <groupId>org.slf4j</groupId>
-         <artifactId>jul-to-slf4j</artifactId>
-         <version>${slf4j.version}</version>
-       </dependency>
+     <dependencyManagement>
+         <dependencies>
+             <dependency>
+                 <groupId>biz.aQute</groupId>
+                 <artifactId>bndlib</artifactId>
+                 <version>1.50.0</version>
+             </dependency>
+             <dependency>
+                 <groupId>org.apache.felix</groupId>
+                 <artifactId>org.apache.felix.scr.annotations</artifactId>
+                 <version>1.12.0</version>
+             </dependency>
+             <dependency>
+                 <groupId>junit</groupId>
+                 <artifactId>junit</artifactId>
+                 <version>4.13.2</version>
+                 <scope>test</scope>
+             </dependency>
+             <dependency>
+                 <groupId>org.slf4j</groupId>
+                 <artifactId>slf4j-api</artifactId>
+                 <version>${slf4j.version}</version>
+             </dependency>
+             <dependency>
+                 <groupId>org.slf4j</groupId>
+                 <artifactId>slf4j-log4j12</artifactId>
+                 <version>${slf4j.version}</version>
+             </dependency>
+             <dependency>
+                 <groupId>org.slf4j</groupId>
+                 <artifactId>slf4j-simple</artifactId>
+                 <version>${slf4j.version}</version>
+             </dependency>
+             <dependency>
+                 <groupId>org.slf4j</groupId>
+                 <artifactId>jcl-over-slf4j</artifactId>
+                 <version>${slf4j.version}</version>
+             </dependency>
+             <dependency>
+                 <groupId>org.slf4j</groupId>
+                 <artifactId>jul-to-slf4j</artifactId>
+                 <version>${slf4j.version}</version>
+             </dependency>
  
-       <dependency>
-         <groupId>javax.annotation</groupId>
-         <artifactId>javax.annotation-api</artifactId>
-         <version>1.3.2</version>
-       </dependency>
-       <dependency>
-         <groupId>javax.xml.soap</groupId>
-         <artifactId>javax.xml.soap-api</artifactId>
-         <version>1.4.0</version>
-       </dependency>
-       <dependency>
-         <groupId>org.jvnet.staxex</groupId>
-         <artifactId>stax-ex</artifactId>
-         <version>1.8.3</version>
-       </dependency>
-     </dependencies>
-   </dependencyManagement>
+             <dependency>
+                 <groupId>javax.annotation</groupId>
+                 <artifactId>javax.annotation-api</artifactId>
+                 <version>1.3.2</version>
+             </dependency>
+             <dependency>
+                 <groupId>javax.xml.soap</groupId>
+                 <artifactId>javax.xml.soap-api</artifactId>
+                 <version>1.4.0</version>
+             </dependency>
+             <dependency>
+                 <groupId>org.jvnet.staxex</groupId>
+                 <artifactId>stax-ex</artifactId>
+                 <version>2.0.0</version>
+             </dependency>
+         </dependencies>
+     </dependencyManagement>
  
-   <properties>
-     <maven.compiler.source>1.8</maven.compiler.source>
-     <maven.compiler.target>1.8</maven.compiler.target>
-     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-     <project.reporting.outputEncoding>${project.build.sourceEncoding}</project.reporting.outputEncoding>
-     <!-- plugin versions -->
-     <forbiddenapis.version>3.1</forbiddenapis.version>
-     <groovy.maven.version>2.1.1</groovy.maven.version>
-     <maven.antrun.version>1.8</maven.antrun.version>
-     <maven.assembly.version>3.3.0</maven.assembly.version>
-     <maven.bundle.version>5.1.1</maven.bundle.version>
-     <maven.failsafe.version>2.22.2</maven.failsafe.version>
-     <maven.javadoc.version>3.1.1</maven.javadoc.version>
-     <maven.scr.version>1.26.4</maven.scr.version>
-     <maven.surefire.version>3.0.0-M4</maven.surefire.version>
-     <maven.shade.version>3.2.4</maven.shade.version>
-     <rat.version>0.13</rat.version>
-     <!-- NOTE: sync tukaani version with commons-compress in tika-parsers -->
-     <poi.version>5.0.0</poi.version>
-     <commons.compress.version>1.20</commons.compress.version>
-     <commons.io.version>2.8.0</commons.io.version>
-     <commons.lang3.version>3.11</commons.lang3.version>
-     <gson.version>2.8.6</gson.version>
-     <guava.version>30.1-jre</guava.version>
-     <osgi.core.version>6.0.0</osgi.core.version>
+     <properties>
+         <maven.compiler.source>1.8</maven.compiler.source>
+         <maven.compiler.target>1.8</maven.compiler.target>
+         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+         <project.reporting.outputEncoding>${project.build.sourceEncoding}</project.reporting.outputEncoding>
+         <!-- plugin versions -->
+         <forbiddenapis.version>3.1</forbiddenapis.version>
+         <groovy.maven.version>2.1.1</groovy.maven.version>
+         <maven.antrun.version>1.8</maven.antrun.version>
+         <maven.assembly.version>3.3.0</maven.assembly.version>
+         <maven.bundle.version>5.1.1</maven.bundle.version>
+         <maven.failsafe.version>2.22.2</maven.failsafe.version>
+         <maven.javadoc.version>3.1.1</maven.javadoc.version>
+         <maven.scr.version>1.26.4</maven.scr.version>
+         <maven.surefire.version>3.0.0-M4</maven.surefire.version>
+         <maven.shade.version>3.2.4</maven.shade.version>
+         <rat.version>0.13</rat.version>
+         <!-- NOTE: sync tukaani version with commons-compress in tika-parsers -->
 -        <poi.version>4.1.2</poi.version>
++        <poi.version>5.0.1-SNAPSHOT</poi.version>
+         <commons.compress.version>1.20</commons.compress.version>
+         <commons.io.version>2.8.0</commons.io.version>
+         <commons.lang3.version>3.12.0</commons.lang3.version>
+         <gson.version>2.8.6</gson.version>
+         <guava.version>30.1.1-jre</guava.version>
+         <osgi.core.version>6.0.0</osgi.core.version>
  
-     <cxf.version>3.4.2</cxf.version>
+     <cxf.version>3.4.3</cxf.version>
      <slf4j.version>1.7.30</slf4j.version>
-     <jackson.version>2.12.1</jackson.version>
+         <log4j.version>1.2.17</log4j.version>
+     <jackson.version>2.12.2</jackson.version>
      <!-- when this is next upgraded, see if we can get rid of
           javax.activation dependency in tika-server -->
-     <jaxb.version>2.3.3</jaxb.version>
+     <jaxb.version>3.0.0</jaxb.version>
      <cli.version>1.4</cli.version>
-     <lucene.version>8.7.0</lucene.version>
-     <mockito.version>3.7.7</mockito.version>
+     <lucene.version>8.8.1</lucene.version>
+     <mockito.version>3.8.0</mockito.version>
      <opennlp.version>1.9.3</opennlp.version>
    </properties>
  
diff --cc tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index e2dc17e,51b36c2..ddc607a
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@@ -22,8 -22,10 +22,9 @@@ import java.io.IOException
  import java.io.InputStream;
  import java.util.Locale;
  
+ import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
 -import org.apache.commons.io.input.CloseShieldInputStream;
  import org.apache.poi.ooxml.POIXMLDocument;
 -import org.apache.poi.ooxml.extractor.ExtractorFactory;
 +import org.apache.poi.ooxml.extractor.POIXMLExtractorFactory;
  import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
  import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
  import org.apache.poi.openxml4j.exceptions.InvalidOperationException;
@@@ -113,10 -112,10 +116,10 @@@ public class OOXMLExtractorFactory 
                                  true, false)) {
                      try {
                          pkg = OPCPackage.open(rereadableInputStream);
-                     } catch (EOFException e) {
 -                    } catch (EOFException|UnsupportedZipFeatureException e) {
++                    } catch (EOFException| UnsupportedZipFeatureException e) {
                          rereadableInputStream.rewind();
                          tmpRepairedCopy = File.createTempFile("tika-ooxml-repair-", "");
-                         ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy);
+                         ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy, false);
                          //if there isn't enough left to be opened as a package
                          //throw an exception -- we may want to fall back to streaming
                          //parsing