You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/06/29 14:10:08 UTC
[tika] 01/03: Merge remote-tracking branch 'origin/branch_1x' into TIKA-3164-1.x
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-3164-1.x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit ab86545afec5cef3a05e546e4e667ea843a738ae
Merge: 7aa2732 ba72c86
Author: tallison <ta...@apache.org>
AuthorDate: Wed May 5 10:32:48 2021 -0400
Merge remote-tracking branch 'origin/branch_1x' into TIKA-3164-1.x
# Conflicts:
# tika-parent/pom.xml
# tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
CHANGES.txt | 34 +-
NOTICE.txt | 24 +
pom.xml | 2 +-
tika-app/pom.xml | 2 +-
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 2 +-
.../src/main/java/org/apache/tika/gui/TikaGUI.java | 3 -
tika-batch/pom.xml | 2 +-
.../batch/fs/RecursiveParserWrapperFSConsumer.java | 2 +-
tika-bundle/pom.xml | 6 +-
tika-core/pom.xml | 377 ++++---
.../main/assembly/test-jar-with-dependencies.xml | 38 +
.../apache/tika/exception/RuntimeSAXException.java | 29 +
.../tika/exception/WriteLimitReachedException.java | 88 ++
.../main/java/org/apache/tika/metadata/XMPDM.java | 1 +
.../org/apache/tika/parser/CompositeParser.java | 4 +
.../apache/tika/parser/RecursiveParserWrapper.java | 120 ++-
.../sax/AbstractRecursiveParserWrapperHandler.java | 11 +-
.../tika/sax/BasicContentHandlerFactory.java | 3 +
.../tika/sax/RecursiveParserWrapperHandler.java | 9 +-
.../org/apache/tika/sax/SecureContentHandler.java | 2 +-
.../org/apache/tika/mime/tika-mimetypes.xml | 7 +
.../src/test/java/org/apache/tika/TikaTest.java | 15 +-
.../tika/detect/FileCommandDetectorTest.java | 40 +-
.../org/apache/tika/parser/mock/MockParser.java | 163 ++-
.../apache/tika/parser/mock/MockParserTest.java | 30 +
.../resources/test-documents/mock_fakeload.xml | 29 +
tika-dl/pom.xml | 10 +-
tika-eval/pom.xml | 33 +-
.../java/org/apache/tika/eval/ExtractComparer.java | 2 +-
.../java/org/apache/tika/eval/FileProfiler.java | 25 +-
.../src/main/resources/lucene-char-mapping.txt | 3 +-
.../resources/tika-eval-file-profiler-config.xml | 10 +-
tika-example/pom.xml | 4 +-
tika-fuzzing/pom.xml | 2 +-
tika-java7/pom.xml | 2 +-
tika-langdetect/pom.xml | 6 +-
tika-nlp/pom.xml | 6 +-
tika-parent/pom.xml | 1103 ++++++++++----------
tika-parsers/pom.xml | 37 +-
.../apache/tika/parser/asm/XHTMLClassVisitor.java | 17 +-
.../java/org/apache/tika/parser/chm/ChmParser.java | 5 +-
.../org/apache/tika/parser/crypto/Pkcs7Parser.java | 4 +-
.../org/apache/tika/parser/crypto/TSDParser.java | 11 +-
.../tika/parser/microsoft/JackcessExtractor.java | 7 +
.../tika/parser/microsoft/JackcessParser.java | 2 +-
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 11 +-
.../microsoft/ooxml/OOXMLExtractorFactory.java | 9 +-
.../microsoft/ooxml/OOXMLTikaBodyPartHandler.java | 356 +++----
.../ooxml/OOXMLWordAndPowerPointTextHandler.java | 50 +-
.../ooxml/XSSFExcelExtractorDecorator.java | 11 +-
.../microsoft/ooxml/xps/XPSExtractorDecorator.java | 5 +
.../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 9 +
.../microsoft/xml/AbstractXML2003Parser.java | 5 +-
.../java/org/apache/tika/parser/mp3/Mp3Parser.java | 13 +-
.../java/org/apache/tika/parser/mp4/MP4Parser.java | 26 +-
.../parser/odf/OpenDocumentManifestHandler.java | 54 +
.../apache/tika/parser/odf/OpenDocumentParser.java | 110 +-
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 77 +-
.../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 2 +
.../java/org/apache/tika/parser/pdf/PDFParser.java | 2 +-
.../org/apache/tika/parser/pkg/PackageParser.java | 59 +-
.../parser/pkg/StreamingZipContainerDetector.java | 171 +--
.../tika/parser/pkg/ZipContainerDetector.java | 18 +-
.../org/apache/tika/parser/utils/ZipSalvager.java | 96 +-
.../tika/config/TikaEncodingDetectorTest.java | 6 +-
.../tika/detect/TestContainerAwareDetector.java | 2 +-
.../tika/parser/RecursiveParserWrapperTest.java | 15 +-
.../parser/microsoft/ooxml/xps/XPSParserTest.java | 45 +
.../org/apache/tika/parser/mp3/Mp3ParserTest.java | 6 +-
.../org/apache/tika/parser/odf/ODFParserTest.java | 68 ++
.../org/apache/tika/parser/pdf/PDFParserTest.java | 11 +
.../apache/tika/parser/pkg/PackageParserTest.java | 13 +-
.../tika/parser/pkg/ZipContainerDetectorTest.java | 2 +-
.../src/test/resources/test-documents/gbk.zip | Bin 0 -> 432 bytes
.../resources/test-documents/testODTEncrypted.odt | Bin 0 -> 12714 bytes
.../testPDF_deeplyEmbeddedAttachments.pdf | Bin 0 -> 122221 bytes
.../test-documents/testXPSWithDataDescriptor.xps | Bin 0 -> 44523 bytes
.../test-documents/testXPSWithDataDescriptor2.xps | Bin 0 -> 51175 bytes
tika-serialization/pom.xml | 2 +-
tika-server/pom.xml | 88 +-
.../tika/server/ProduceTypeResourceComparator.java | 145 +++
.../java/org/apache/tika/server/TikaServerCli.java | 35 +-
.../org/apache/tika/server/mbean/MBeanHelper.java | 60 ++
.../tika/server/mbean/ServerStatusExporter.java | 83 ++
.../server/mbean/ServerStatusExporterMBean.java | 61 ++
.../apache/tika/server/metrics/Log4JMetrics.java | 212 ++++
.../apache/tika/server/metrics/MetricsHelper.java | 220 ++++
.../tika/server/metrics/MetricsResource.java | 53 +
.../tika/server/metrics/ServerStatusMetrics.java | 61 ++
.../server/resource/RecursiveMetadataResource.java | 8 +-
.../apache/tika/server/resource/TikaResource.java | 204 +++-
.../tika/server/resource/TikaServerStatus.java | 21 +-
.../java/org/apache/tika/server/CXFTestBase.java | 20 +-
.../apache/tika/server/MetricsResourceTest.java | 118 +++
.../tika/server/RecursiveMetadataResourceTest.java | 143 ++-
.../org/apache/tika/server/StackTraceTest.java | 17 +-
.../server/TikaResourceMetadataFilterTest.java | 83 ++
.../tika/server/TikaResourceNoStackTest.java | 98 ++
.../org/apache/tika/server/TikaResourceTest.java | 322 +++++-
.../resources/configs/metadata-filter-include.xml | 30 +
.../src/test/resources/mock/hello_world.xml | 26 +
.../src/test/resources/mock/hello_world_long.xml | 30 +
tika-translate/pom.xml | 8 +-
.../tika/language/translate/RTGTranslator.java | 143 +++
.../org.apache.tika.language.translate.Translator | 1 +
.../tika/language/translate/RTGTranslatorTest.java | 64 ++
tika-xmp/pom.xml | 2 +-
107 files changed, 4519 insertions(+), 1423 deletions(-)
diff --cc tika-parent/pom.xml
index baccf43,7d3cb93..71a5928
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@@ -20,337 -20,338 +20,338 @@@
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
- <modelVersion>4.0.0</modelVersion>
+ <modelVersion>4.0.0</modelVersion>
- <parent>
- <groupId>org.apache</groupId>
- <artifactId>apache</artifactId>
- <version>17</version>
- <relativePath />
- </parent>
+ <parent>
+ <groupId>org.apache</groupId>
+ <artifactId>apache</artifactId>
+ <version>17</version>
+ <relativePath />
+ </parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parent</artifactId>
- <version>1.26-SNAPSHOT</version>
- <packaging>pom</packaging>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parent</artifactId>
+ <version>1.27-SNAPSHOT</version>
+ <packaging>pom</packaging>
- <name>Apache Tika parent</name>
- <description>
- Apache Tika is a toolkit for detecting and extracting metadata and
- structured text content from various documents using existing parser
- libraries.
- </description>
- <inceptionYear>2007</inceptionYear>
+ <name>Apache Tika parent</name>
+ <description>
+ Apache Tika is a toolkit for detecting and extracting metadata and
+ structured text content from various documents using existing parser
+ libraries.
+ </description>
+ <inceptionYear>2007</inceptionYear>
- <url>http://tika.apache.org/</url>
+ <url>http://tika.apache.org/</url>
- <issueManagement>
- <system>JIRA</system>
- <url>https://issues.apache.org/jira/browse/TIKA</url>
- </issueManagement>
+ <issueManagement>
+ <system>JIRA</system>
+ <url>https://issues.apache.org/jira/browse/TIKA</url>
+ </issueManagement>
- <mailingLists>
- <mailingList>
- <name>Development mailing list</name>
- <subscribe>dev-subscribe@tika.apache.org</subscribe>
- <unsubscribe>dev-unsubscribe@tika.apache.org</unsubscribe>
- <post>dev@tika.apache.org</post>
- <archive>https://lists.apache.org/list.html?dev@tika.apache.org</archive>
- <otherArchives>
- <otherArchive>http://mail-archives.apache.org/mod_mbox/tika-dev/</otherArchive>
- <otherArchive>http://mail-archives.apache.org/mod_mbox/lucene-tika-dev</otherArchive>
- <otherArchive>http://www.mail-archive.com/dev@tika.apache.org</otherArchive>
- <otherArchive>http://www.mail-archive.com/tika-dev@lucene.apache.org/</otherArchive>
- <otherArchive>http://www.mail-archive.com/tika-dev@incubator.apache.org/</otherArchive>
- <otherArchive>http://www.nabble.com/Apache-Tika---Development-f20913.html</otherArchive>
- <otherArchive>http://news.gmane.org/gmane.comp.apache.tika.devel</otherArchive>
- <otherArchive>http://tika.markmail.org/</otherArchive>
- </otherArchives>
- </mailingList>
- <mailingList>
- <name>Commit mailing list</name>
- <subscribe>commits-subscribe@tika.apache.org</subscribe>
- <unsubscribe>commits-unsubscribe@tika.apache.org</unsubscribe>
- <post>commits@tika.apache.org</post>
- <archive>https://lists.apache.org/list.html?commits@tika.apache.org</archive>
- <otherArchives>
- <otherArchive>http://mail-archives.apache.org/mod_mbox/tika-commits/</otherArchive>
- <otherArchive>http://mail-archives.apache.org/mod_mbox/lucene-tika-commits/</otherArchive>
- <otherArchive>http://www.mail-archive.com/tika-commits@lucene.apache.org/</otherArchive>
- <otherArchive>http://www.mail-archive.com/tika-commits@incubator.apache.org/</otherArchive>
- </otherArchives>
- </mailingList>
- <mailingList>
- <name>User mailing list</name>
- <subscribe>user-subscribe@tika.apache.org</subscribe>
- <unsubscribe>user-unsubscribe@tika.apache.org</unsubscribe>
- <post>user@tika.apache.org</post>
- <archive>https://lists.apache.org/list.html?user@tika.apache.org</archive>
- <otherArchives>
- <otherArchive>http://mail-archives.apache.org/mod_mbox/tika-user/</otherArchive>
- <otherArchive>http://mail-archives.apache.org/mod_mbox/lucene-tika-user/</otherArchive>
- <otherArchive>http://www.mail-archive.com/tika-user@lucene.apache.org/</otherArchive>
- </otherArchives>
- </mailingList>
- </mailingLists>
+ <mailingLists>
+ <mailingList>
+ <name>Development mailing list</name>
+ <subscribe>dev-subscribe@tika.apache.org</subscribe>
+ <unsubscribe>dev-unsubscribe@tika.apache.org</unsubscribe>
+ <post>dev@tika.apache.org</post>
+ <archive>https://lists.apache.org/list.html?dev@tika.apache.org</archive>
+ <otherArchives>
+ <otherArchive>http://mail-archives.apache.org/mod_mbox/tika-dev/</otherArchive>
+ <otherArchive>http://mail-archives.apache.org/mod_mbox/lucene-tika-dev</otherArchive>
+ <otherArchive>http://www.mail-archive.com/dev@tika.apache.org</otherArchive>
+ <otherArchive>http://www.mail-archive.com/tika-dev@lucene.apache.org/</otherArchive>
+ <otherArchive>http://www.mail-archive.com/tika-dev@incubator.apache.org/</otherArchive>
+ <otherArchive>http://www.nabble.com/Apache-Tika---Development-f20913.html</otherArchive>
+ <otherArchive>http://news.gmane.org/gmane.comp.apache.tika.devel</otherArchive>
+ <otherArchive>http://tika.markmail.org/</otherArchive>
+ </otherArchives>
+ </mailingList>
+ <mailingList>
+ <name>Commit mailing list</name>
+ <subscribe>commits-subscribe@tika.apache.org</subscribe>
+ <unsubscribe>commits-unsubscribe@tika.apache.org</unsubscribe>
+ <post>commits@tika.apache.org</post>
+ <archive>https://lists.apache.org/list.html?commits@tika.apache.org</archive>
+ <otherArchives>
+ <otherArchive>http://mail-archives.apache.org/mod_mbox/tika-commits/</otherArchive>
+ <otherArchive>http://mail-archives.apache.org/mod_mbox/lucene-tika-commits/</otherArchive>
+ <otherArchive>http://www.mail-archive.com/tika-commits@lucene.apache.org/</otherArchive>
+ <otherArchive>http://www.mail-archive.com/tika-commits@incubator.apache.org/</otherArchive>
+ </otherArchives>
+ </mailingList>
+ <mailingList>
+ <name>User mailing list</name>
+ <subscribe>user-subscribe@tika.apache.org</subscribe>
+ <unsubscribe>user-unsubscribe@tika.apache.org</unsubscribe>
+ <post>user@tika.apache.org</post>
+ <archive>https://lists.apache.org/list.html?user@tika.apache.org</archive>
+ <otherArchives>
+ <otherArchive>http://mail-archives.apache.org/mod_mbox/tika-user/</otherArchive>
+ <otherArchive>http://mail-archives.apache.org/mod_mbox/lucene-tika-user/</otherArchive>
+ <otherArchive>http://www.mail-archive.com/tika-user@lucene.apache.org/</otherArchive>
+ </otherArchives>
+ </mailingList>
+ </mailingLists>
- <developers>
- <developer>
- <name>Rida Benjelloun</name>
- <id>ridabenjelloun</id>
- <email>ridabenjelloun@apache.org</email>
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- <developer>
- <name>Keith Bennett</name>
- <id>kbennett</id>
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- <developer>
- <name>Mark Harwood</name>
- <id>mharwood</id>
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- <developer>
- <name>Ken Krugler</name>
- <id>kkrugler</id>
- <email>kkrugler@apache.org</email>
- <url>http://ken-blog.krugler.org</url>
- <organization>Scale Unlimited</organization>
- <organizationUrl>http://www.scaleunlimited.com</organizationUrl>
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- <developer>
- <name>Chris A. Mattmann</name>
- <id>mattmann</id>
- <email>mattmann@apache.org</email>
- <url>http://people.apache.org/~mattmann/</url>
- <organization>NASA Jet Propulsion Laboratory</organization>
- <organizationUrl>http://www.jpl.nasa.gov</organizationUrl>
- <timezone>-8</timezone>
- <properties />
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- <developer>
- <name>Michael McCandless</name>
- <id>mikemccand</id>
- <email>mikemccand@apache.org</email>
- <organization>IBM</organization>
- <properties />
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- <developer>
- <name>Dave Meikle</name>
- <id>dmeikle</id>
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- <developer>
- <name>Sami Siren</name>
- <id>siren</id>
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- <developer>
- <name>Jukka Zitting</name>
- <id>jukka</id>
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- <developer>
- <name>Nick Burch</name>
- <id>nick</id>
- <organization>Alfresco</organization>
- <organizationUrl>http://alfresco.com</organizationUrl>
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- <developer>
- <name>Maxim Valyanskiy</name>
- <id>maxcom</id>
- <organization>Jet Infosystems</organization>
- <roles>
- <role>committer</role>
- </roles>
- <timezone>+3</timezone>
- </developer>
- <developer>
- <name>Oleg Tikhonov</name>
- <id>oleg</id>
- <roles>
- <role>committer</role>
- </roles>
- <timezone>+2</timezone>
- </developer>
- <developer>
- <name>Ray Gauss II</name>
- <id>rgauss</id>
- <organization>Alfresco</organization>
- <organizationUrl>http://alfresco.com</organizationUrl>
- <timezone>-5</timezone>
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- <developer>
- <name>Tyler Palsulich</name>
- <id>tpalsulich</id>
- <timezone>-8</timezone>
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- <developer>
- <name>Tim Allison</name>
- <id>tallison</id>
- <timezone>-5</timezone>
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- <developer>
- <name>Konstantin Gribov</name>
- <id>grossws</id>
- <timezone>+3</timezone>
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- </developers>
- <contributors>
- <contributor>
- <name>Doug Cutting</name>
- <roles>
- <role>mentor</role>
- </roles>
- </contributor>
- <contributor>
- <name>Bertrand Delacretaz</name>
- <roles>
- <role>mentor</role>
- </roles>
- </contributor>
- <contributor>
- <name>Niall Pemberton</name>
- <roles>
- <role>emeritus</role>
- </roles>
- </contributor>
- </contributors>
+ <developers>
+ <developer>
+ <name>Rida Benjelloun</name>
+ <id>ridabenjelloun</id>
+ <email>ridabenjelloun@apache.org</email>
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ <developer>
+ <name>Keith Bennett</name>
+ <id>kbennett</id>
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ <developer>
+ <name>Mark Harwood</name>
+ <id>mharwood</id>
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ <developer>
+ <name>Ken Krugler</name>
+ <id>kkrugler</id>
+ <email>kkrugler@apache.org</email>
+ <url>http://ken-blog.krugler.org</url>
+ <organization>Scale Unlimited</organization>
+ <organizationUrl>http://www.scaleunlimited.com</organizationUrl>
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ <developer>
+ <name>Chris A. Mattmann</name>
+ <id>mattmann</id>
+ <email>mattmann@apache.org</email>
+ <url>http://people.apache.org/~mattmann/</url>
+ <organization>NASA Jet Propulsion Laboratory</organization>
+ <organizationUrl>http://www.jpl.nasa.gov</organizationUrl>
+ <timezone>-8</timezone>
+ <properties />
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ <developer>
+ <name>Michael McCandless</name>
+ <id>mikemccand</id>
+ <email>mikemccand@apache.org</email>
+ <organization>IBM</organization>
+ <properties />
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ <developer>
+ <name>Dave Meikle</name>
+ <id>dmeikle</id>
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ <developer>
+ <name>Sami Siren</name>
+ <id>siren</id>
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ <developer>
+ <name>Jukka Zitting</name>
+ <id>jukka</id>
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ <developer>
+ <name>Nick Burch</name>
+ <id>nick</id>
+ <organization>Alfresco</organization>
+ <organizationUrl>http://alfresco.com</organizationUrl>
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ <developer>
+ <name>Maxim Valyanskiy</name>
+ <id>maxcom</id>
+ <organization>Jet Infosystems</organization>
+ <roles>
+ <role>committer</role>
+ </roles>
+ <timezone>+3</timezone>
+ </developer>
+ <developer>
+ <name>Oleg Tikhonov</name>
+ <id>oleg</id>
+ <roles>
+ <role>committer</role>
+ </roles>
+ <timezone>+2</timezone>
+ </developer>
+ <developer>
+ <name>Ray Gauss II</name>
+ <id>rgauss</id>
+ <organization>Alfresco</organization>
+ <organizationUrl>http://alfresco.com</organizationUrl>
+ <timezone>-5</timezone>
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ <developer>
+ <name>Tyler Palsulich</name>
+ <id>tpalsulich</id>
+ <timezone>-8</timezone>
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ <developer>
+ <name>Tim Allison</name>
+ <id>tallison</id>
+ <timezone>-5</timezone>
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ <developer>
+ <name>Konstantin Gribov</name>
+ <id>grossws</id>
+ <timezone>+3</timezone>
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ </developers>
+ <contributors>
+ <contributor>
+ <name>Doug Cutting</name>
+ <roles>
+ <role>mentor</role>
+ </roles>
+ </contributor>
+ <contributor>
+ <name>Bertrand Delacretaz</name>
+ <roles>
+ <role>mentor</role>
+ </roles>
+ </contributor>
+ <contributor>
+ <name>Niall Pemberton</name>
+ <roles>
+ <role>emeritus</role>
+ </roles>
+ </contributor>
+ </contributors>
- <dependencyManagement>
- <dependencies>
- <dependency>
- <groupId>biz.aQute</groupId>
- <artifactId>bndlib</artifactId>
- <version>1.50.0</version>
- </dependency>
- <dependency>
- <groupId>org.apache.felix</groupId>
- <artifactId>org.apache.felix.scr.annotations</artifactId>
- <version>1.12.0</version>
- </dependency>
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <version>4.13.1</version>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-api</artifactId>
- <version>${slf4j.version}</version>
- </dependency>
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-log4j12</artifactId>
- <version>${slf4j.version}</version>
- </dependency>
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-simple</artifactId>
- <version>${slf4j.version}</version>
- </dependency>
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>jcl-over-slf4j</artifactId>
- <version>${slf4j.version}</version>
- </dependency>
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>jul-to-slf4j</artifactId>
- <version>${slf4j.version}</version>
- </dependency>
+ <dependencyManagement>
+ <dependencies>
+ <dependency>
+ <groupId>biz.aQute</groupId>
+ <artifactId>bndlib</artifactId>
+ <version>1.50.0</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>org.apache.felix.scr.annotations</artifactId>
+ <version>1.12.0</version>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.13.2</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ <version>${slf4j.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ <version>${slf4j.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-simple</artifactId>
+ <version>${slf4j.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>jcl-over-slf4j</artifactId>
+ <version>${slf4j.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>jul-to-slf4j</artifactId>
+ <version>${slf4j.version}</version>
+ </dependency>
- <dependency>
- <groupId>javax.annotation</groupId>
- <artifactId>javax.annotation-api</artifactId>
- <version>1.3.2</version>
- </dependency>
- <dependency>
- <groupId>javax.xml.soap</groupId>
- <artifactId>javax.xml.soap-api</artifactId>
- <version>1.4.0</version>
- </dependency>
- <dependency>
- <groupId>org.jvnet.staxex</groupId>
- <artifactId>stax-ex</artifactId>
- <version>1.8.3</version>
- </dependency>
- </dependencies>
- </dependencyManagement>
+ <dependency>
+ <groupId>javax.annotation</groupId>
+ <artifactId>javax.annotation-api</artifactId>
+ <version>1.3.2</version>
+ </dependency>
+ <dependency>
+ <groupId>javax.xml.soap</groupId>
+ <artifactId>javax.xml.soap-api</artifactId>
+ <version>1.4.0</version>
+ </dependency>
+ <dependency>
+ <groupId>org.jvnet.staxex</groupId>
+ <artifactId>stax-ex</artifactId>
+ <version>2.0.0</version>
+ </dependency>
+ </dependencies>
+ </dependencyManagement>
- <properties>
- <maven.compiler.source>1.8</maven.compiler.source>
- <maven.compiler.target>1.8</maven.compiler.target>
- <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
- <project.reporting.outputEncoding>${project.build.sourceEncoding}</project.reporting.outputEncoding>
- <!-- plugin versions -->
- <forbiddenapis.version>3.1</forbiddenapis.version>
- <groovy.maven.version>2.1.1</groovy.maven.version>
- <maven.antrun.version>1.8</maven.antrun.version>
- <maven.assembly.version>3.3.0</maven.assembly.version>
- <maven.bundle.version>5.1.1</maven.bundle.version>
- <maven.failsafe.version>2.22.2</maven.failsafe.version>
- <maven.javadoc.version>3.1.1</maven.javadoc.version>
- <maven.scr.version>1.26.4</maven.scr.version>
- <maven.surefire.version>3.0.0-M4</maven.surefire.version>
- <maven.shade.version>3.2.4</maven.shade.version>
- <rat.version>0.13</rat.version>
- <!-- NOTE: sync tukaani version with commons-compress in tika-parsers -->
- <poi.version>5.0.0</poi.version>
- <commons.compress.version>1.20</commons.compress.version>
- <commons.io.version>2.8.0</commons.io.version>
- <commons.lang3.version>3.11</commons.lang3.version>
- <gson.version>2.8.6</gson.version>
- <guava.version>30.1-jre</guava.version>
- <osgi.core.version>6.0.0</osgi.core.version>
+ <properties>
+ <maven.compiler.source>1.8</maven.compiler.source>
+ <maven.compiler.target>1.8</maven.compiler.target>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ <project.reporting.outputEncoding>${project.build.sourceEncoding}</project.reporting.outputEncoding>
+ <!-- plugin versions -->
+ <forbiddenapis.version>3.1</forbiddenapis.version>
+ <groovy.maven.version>2.1.1</groovy.maven.version>
+ <maven.antrun.version>1.8</maven.antrun.version>
+ <maven.assembly.version>3.3.0</maven.assembly.version>
+ <maven.bundle.version>5.1.1</maven.bundle.version>
+ <maven.failsafe.version>2.22.2</maven.failsafe.version>
+ <maven.javadoc.version>3.1.1</maven.javadoc.version>
+ <maven.scr.version>1.26.4</maven.scr.version>
+ <maven.surefire.version>3.0.0-M4</maven.surefire.version>
+ <maven.shade.version>3.2.4</maven.shade.version>
+ <rat.version>0.13</rat.version>
+ <!-- NOTE: sync tukaani version with commons-compress in tika-parsers -->
- <poi.version>4.1.2</poi.version>
++ <poi.version>5.0.1-SNAPSHOT</poi.version>
+ <commons.compress.version>1.20</commons.compress.version>
+ <commons.io.version>2.8.0</commons.io.version>
+ <commons.lang3.version>3.12.0</commons.lang3.version>
+ <gson.version>2.8.6</gson.version>
+ <guava.version>30.1.1-jre</guava.version>
+ <osgi.core.version>6.0.0</osgi.core.version>
- <cxf.version>3.4.2</cxf.version>
+ <cxf.version>3.4.3</cxf.version>
<slf4j.version>1.7.30</slf4j.version>
- <jackson.version>2.12.1</jackson.version>
+ <log4j.version>1.2.17</log4j.version>
+ <jackson.version>2.12.2</jackson.version>
<!-- when this is next upgraded, see if we can get rid of
javax.activation dependency in tika-server -->
- <jaxb.version>2.3.3</jaxb.version>
+ <jaxb.version>3.0.0</jaxb.version>
<cli.version>1.4</cli.version>
- <lucene.version>8.7.0</lucene.version>
- <mockito.version>3.7.7</mockito.version>
+ <lucene.version>8.8.1</lucene.version>
+ <mockito.version>3.8.0</mockito.version>
<opennlp.version>1.9.3</opennlp.version>
</properties>
diff --cc tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index e2dc17e,51b36c2..ddc607a
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@@ -22,8 -22,10 +22,9 @@@ import java.io.IOException
import java.io.InputStream;
import java.util.Locale;
+ import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
-import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.poi.ooxml.POIXMLDocument;
-import org.apache.poi.ooxml.extractor.ExtractorFactory;
+import org.apache.poi.ooxml.extractor.POIXMLExtractorFactory;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.InvalidOperationException;
@@@ -113,10 -112,10 +116,10 @@@ public class OOXMLExtractorFactory
true, false)) {
try {
pkg = OPCPackage.open(rereadableInputStream);
- } catch (EOFException e) {
- } catch (EOFException|UnsupportedZipFeatureException e) {
++ } catch (EOFException| UnsupportedZipFeatureException e) {
rereadableInputStream.rewind();
tmpRepairedCopy = File.createTempFile("tika-ooxml-repair-", "");
- ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy);
+ ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy, false);
//if there isn't enough left to be opened as a package
//throw an exception -- we may want to fall back to streaming
//parsing