You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/06/29 14:10:07 UTC
[tika] branch TIKA-3164-1.x updated (7aa2732 -> bc9cca8)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a change to branch TIKA-3164-1.x
in repository https://gitbox.apache.org/repos/asf/tika.git.
from 7aa2732 TIKA-3164 -- WIP POI 5.0.0
add 3776e48 TIKA-3244 upgrade jackcess with upgrade to most recent jackcess-crypt; fix npe while at it.
add f64cbed TIKA-3244 -- general upgrades for 1.26
add 2550aac Merge remote-tracking branch 'origin/branch_1x' into branch_1x
add 617a33b TIKA-3244 update jackcess to continue on null table; make build work for Java > 8.
add de548d4 TIKA-3244: update mockito
add 1dc1583 Merge remote-tracking branch 'origin/branch_1x' into branch_1x
add 5a0df92 Update CHANGES for 1.26 release
add 012c54e Update CHANGES for 1.26 release
add 8053f70 [maven-release-plugin] prepare release 1.26-rc1
add 2fea0b0 [maven-release-plugin] prepare for next development iteration
add d786d95 rollback for 1.26-SNAPSHOT development
add faaaca7 add timeout threshold for fileprofiler
add 2475d07 TIKA-3314
add 9f059bd TIKA-3315 add xerces to tika-eval
add afea29c TIKA-3316 -- improve processing of XPS files
add f5a2aed TIKA-3320 -- allow case-insensitive header request matching for pdf/ocr config in tika-server
add f5eebe2 TIKA-3244: update snappy-java
add 5a7a4e7 TIKA-3244: update lombok, twelvemonkeys, jackson
add ed07a6e TIKA-3244: update tukaani
add 22cdffb TIKA-3244: update zstd
add 52b18f5 TIKA-3244: update commons-lang
add d615710 TIKA-3244: update spring
add 02ed830 TIKA-3244: update pax-url-aether
add 8081e6d TIKA-3318 Document the units of xmpDM:duration as seconds by default
add 21b3cf8 TIKA-3318 MP3 parser should output the xmpDM:duration metadata as seconds not milliseconds
add a4c9257 Changelog update
add b0242ee Backport to 1.x - TIKA-3310 Check if MP4 file's compatible brands match any of the expected values, from Peter Kronenberg
add a1ec3fd Changelog update
add 7ff6a49 Merge remote-tracking branch 'origin/branch_1x' into branch_1x
add 2e5b822 TIKA-3323 -- allow flexibility for 'file' command output on different operating systems
add e6bf840 Minor restructure and added missing javadoc (#416)
add 4f24ca5 TIKA-3327 Simple server metrics monitoring (#417)
add 32caedd TIKA-3327 Added missing locale (#418)
add 8bf65c0 TIKA-3332 -- recursively process the embedded file tree in PDFs.
add ecf502a TIKA-3325 -- writeLimit is now calculated on the full file (container and embedded documents), no longer on each.
add 88166ce TIKA-3331 -- return a more informative exception for encrypted ODT
add 7041f96 remove ProgressMonitorInputStream because it conceals the underlying TikaInputStream and wreaks havoc with reset and using underlying files, etc...
add 21f428d TIKA-3322 -- upgrade PDFBox to 2.0.23
add 6a27f3e TIKA-3244: update spring
add b63072c Merge remote-tracking branch 'origin/branch_1x' into branch_1x
add b29cce5 TIKA-3244 -- general upgrades for 1.26
add b1e8641 TIKA-3335 -- handle bad xml more robustly when checking for encryption
add 2b8c9a3 TIKA-3336 -- new zip bombs detect in 1.26-SNAPSHOT compared with 1.25 -- bug, don't advance twice per call to chars/whitespace
add da05576 TIKA-3334 -- fix thread safety bug in handling embedded docs in open office parser
add 8c21fba Update CHANGES.txt for 1.26 release
add 1842758 fix rat and imports for 1.26 release
add 2e83fd4 [maven-release-plugin] prepare release 1.26-rc1
add 1d09e95 [maven-release-plugin] prepare for next development iteration
add 74290b3 TIKA-3350
add 79d5448 TIKA-3244: update commons-net
add 7f83539 [TIKA-3344] [TIKA-3345] (#422)
add f51a905 Merge remote-tracking branch 'origin/branch_1x' into branch_1x
add 2006dc5 TIKA-3352: Add json output for /tika endpoint in tika-server
add d3ed76f TIKA-3355 -- integrate fakeload library into MockParser
add 9f58c30 TIKA-3355 -- include dependencies in test-jar
add f58b01f TIKA-3196 -- multithreading issue in package parser
add 82f7a34 [TIKA-3357] removes ambiguity by choosing handler based on produce type (#427)
add 3fa1953 TIKA-3357 -- add unit test
add 2704f0e TIKA-3374 add encoding detection to zip entry names via Ryan Liu.
add cd191ff update changes for new development in 1.27...please may it never happen... 2.0.0 here we come!
add 34fb775 Merge remote-tracking branch 'origin/branch_1x' into branch_1x
add 64cdc8e [TIKA-3353] Prometheus and JMX monitoring over micrometer (#429)
add 6f2f373 Merge remote-tracking branch 'origin/branch_1x' into branch_1x
add bc551b1 TIKA-3372 -- fix write limit in recursive parser handler
add f7d5119 TIKA-3373 Add the *.yml extension for YAML, which is commonly used, along with aliases for popular alternate mimetypes for it
add f414fe4 Merge remote-tracking branch 'origin/branch_1x' into branch_1x
add 64b2e4b Merge remote-tracking branch 'origin/branch_1x' into branch_1x
add 32545d4 TIKA-3376 improve handling of write limit reached in json output from /tika endpoint
add 509979b fix logic in ExtractComparer
add 2f80958 TIKA-3372 -- fix write limit handling in the PDFParser
add 7cae30e TIKA-3374 -- allow users to turn off charset detection of entry names in ZipArchives
add 2c4ba60 Backport Merge branch 'TIKA-3329' of https://github.com/thammegowda/tika into main
add b2f37ef Forbidden API error fixes.
add e281b4d avoid doubling content in the RecursiveParserWrapper by sending in a new Metadata object to the embedded parser handler
add 2186e67 TIKA-3382 -- improve writelimitreached handling in numerous parsers
add ba72c86 TIKA-3382 -- improve writelimitreached handling in numerous parsers -- clean up PDFParser
new ab86545 Merge remote-tracking branch 'origin/branch_1x' into TIKA-3164-1.x
add b7e53e4 TIKA-3386 allow a times parameter in MockParser
add 6efea4f TIKA-3386 -- fix conflicts
add 06c111f TIKA-3392 -- allow insecure parsing in MimeTypesReader
add c6998bd TIKA-3405 -- fix and upgrade isoparser
add 499e1c9 TIKA-3407 -- fix jaxb version and clean up related issues
add fd98eee TIKA-3441 -- prevent infinite loop on failure to bind to a port
add d7fa2cd TIKA-3441 -- improve likelihood that tesseract processes will be shutdown on crash.
add 27d0e3b TIKA-3433 -- extractAnnotationText should be settable via tikaconfig
add 6a9e726 TIKA-3437 -- deprecate experimental PDFPreflightParser
add 10c94ff fix for change in mp4parser behavior
add e8ec223 Merge remote-tracking branch 'origin/branch_1x' into branch_1x
add 1224f88 TIKA-3441 -- improve likelihood that tesseract processes will be shutdown on jvm restart.
add 90c6ea4 TIKA-3444 -- upgrade to pdfbox 2.0.24
add 4ba5fd7 TIKA-3456 -- LanguageDetector should chunk long strings and test for hasEnoughText.
add 57f5912 TIKA-3457 -- general upgrades for 1.27
new 702cf3e Merge remote-tracking branch 'origin/branch_1x' into TIKA-3164-1.x
new bc9cca8 TIKA-3164 -- further tweaks
The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails. The revisions
listed as "add" were already present in the repository and have only
been added to this reference.
Summary of changes:
CHANGES.txt | 42 +-
NOTICE.txt | 24 +
pom.xml | 2 +-
tika-app/pom.xml | 2 +-
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 2 +-
.../src/main/java/org/apache/tika/gui/TikaGUI.java | 3 -
tika-batch/pom.xml | 2 +-
.../batch/fs/RecursiveParserWrapperFSConsumer.java | 2 +-
tika-bundle/pom.xml | 46 +-
tika-core/pom.xml | 377 ++++---
.../main/assembly/test-jar-with-dependencies.xml | 38 +
...TikaException.java => RuntimeSAXException.java} | 15 +-
.../tika/exception/WriteLimitReachedException.java | 88 ++
.../tika/language/detect/LanguageDetector.java | 21 +-
.../main/java/org/apache/tika/metadata/XMPDM.java | 1 +
.../java/org/apache/tika/mime/MimeTypesReader.java | 18 +-
.../tika/parser/AbstractExternalProcessParser.java | 57 +
.../org/apache/tika/parser/CompositeParser.java | 4 +
.../apache/tika/parser/RecursiveParserWrapper.java | 120 ++-
.../sax/AbstractRecursiveParserWrapperHandler.java | 11 +-
.../tika/sax/BasicContentHandlerFactory.java | 3 +
.../tika/sax/RecursiveParserWrapperHandler.java | 9 +-
.../org/apache/tika/sax/SecureContentHandler.java | 2 +-
.../org/apache/tika/mime/tika-mimetypes.xml | 7 +
.../src/test/java/org/apache/tika/TikaTest.java | 15 +-
.../tika/detect/FileCommandDetectorTest.java | 40 +-
.../org/apache/tika/parser/mock/MockParser.java | 176 +++-
.../apache/tika/parser/mock/MockParserTest.java} | 29 +-
.../resources/test-documents/mock_fakeload.xml | 8 +-
.../test/resources/test-documents/mock_times.xml | 5 +-
tika-dl/pom.xml | 34 +-
tika-eval/pom.xml | 59 +-
.../java/org/apache/tika/eval/ExtractComparer.java | 2 +-
.../java/org/apache/tika/eval/FileProfiler.java | 25 +-
.../src/main/resources/lucene-char-mapping.txt | 3 +-
.../resources/tika-eval-file-profiler-config.xml | 10 +-
tika-example/pom.xml | 8 +-
tika-fuzzing/pom.xml | 2 +-
tika-java7/pom.xml | 2 +-
tika-langdetect/pom.xml | 8 +-
tika-nlp/pom.xml | 6 +-
tika-parent/pom.xml | 1108 ++++++++++----------
tika-parsers/pom.xml | 87 +-
.../apache/tika/parser/asm/XHTMLClassVisitor.java | 17 +-
.../java/org/apache/tika/parser/chm/ChmParser.java | 5 +-
.../org/apache/tika/parser/crypto/Pkcs7Parser.java | 4 +-
.../org/apache/tika/parser/crypto/TSDParser.java | 11 +-
.../tika/parser/microsoft/JackcessExtractor.java | 7 +
.../tika/parser/microsoft/JackcessParser.java | 2 +-
.../tika/parser/microsoft/SummaryExtractor.java | 1 -
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 12 +-
.../microsoft/ooxml/OOXMLExtractorFactory.java | 18 +-
.../microsoft/ooxml/OOXMLTikaBodyPartHandler.java | 356 +++----
.../ooxml/OOXMLWordAndPowerPointTextHandler.java | 50 +-
.../ooxml/XSSFExcelExtractorDecorator.java | 11 +-
.../microsoft/ooxml/xps/XPSExtractorDecorator.java | 5 +
.../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 9 +
.../microsoft/xml/AbstractXML2003Parser.java | 5 +-
.../java/org/apache/tika/parser/mp3/Mp3Parser.java | 13 +-
.../java/org/apache/tika/parser/mp4/MP4Parser.java | 26 +-
.../apache/tika/parser/ocr/TesseractOCRParser.java | 99 +-
...ndler.java => OpenDocumentManifestHandler.java} | 54 +-
.../apache/tika/parser/odf/OpenDocumentParser.java | 110 +-
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 77 +-
.../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 2 +
.../java/org/apache/tika/parser/pdf/PDFParser.java | 3 +-
.../apache/tika/parser/pdf/PDFPreflightParser.java | 5 +
.../org/apache/tika/parser/pkg/PackageParser.java | 59 +-
.../parser/pkg/StreamingZipContainerDetector.java | 171 +--
.../tika/parser/pkg/ZipContainerDetector.java | 18 +-
.../org/apache/tika/parser/utils/ZipSalvager.java | 96 +-
.../tika/config/TikaEncodingDetectorTest.java | 6 +-
.../tika/detect/TestContainerAwareDetector.java | 2 +-
.../tika/parser/RecursiveParserWrapperTest.java | 15 +-
.../parser/microsoft/ooxml/OOXMLParserTest.java | 2 -
.../parser/microsoft/ooxml/xps/XPSParserTest.java | 45 +
.../org/apache/tika/parser/mp3/Mp3ParserTest.java | 6 +-
.../org/apache/tika/parser/mp4/MP4ParserTest.java | 13 +-
.../org/apache/tika/parser/odf/ODFParserTest.java | 68 ++
.../org/apache/tika/parser/pdf/PDFParserTest.java | 24 +
.../apache/tika/parser/pkg/PackageParserTest.java | 13 +-
.../tika/parser/pkg/ZipContainerDetectorTest.java | 2 +-
tika-parsers/src/test/resources/log4j.properties | 2 +-
.../pdf/tika-skip-annotations-config.xml} | 10 +-
.../src/test/resources/test-documents/gbk.zip | Bin 0 -> 432 bytes
.../resources/test-documents/testODTEncrypted.odt | Bin 0 -> 12714 bytes
.../testPDF_deeplyEmbeddedAttachments.pdf | Bin 0 -> 122221 bytes
.../test-documents/testXPSWithDataDescriptor.xps | Bin 0 -> 44523 bytes
.../test-documents/testXPSWithDataDescriptor2.xps | Bin 0 -> 51175 bytes
tika-serialization/pom.xml | 2 +-
tika-server/pom.xml | 52 +-
.../tika/server/ProduceTypeResourceComparator.java | 145 +++
.../java/org/apache/tika/server/TikaServerCli.java | 95 +-
.../org/apache/tika/server/TikaServerWatchDog.java | 74 +-
.../org/apache/tika/server/mbean/MBeanHelper.java | 60 ++
.../tika/server/mbean/ServerStatusExporter.java | 83 ++
.../server/mbean/ServerStatusExporterMBean.java | 49 +-
.../apache/tika/server/metrics/Log4JMetrics.java | 212 ++++
.../apache/tika/server/metrics/MetricsHelper.java | 220 ++++
.../tika/server/metrics/MetricsResource.java | 53 +
.../tika/server/metrics/ServerStatusMetrics.java | 61 ++
.../server/resource/RecursiveMetadataResource.java | 8 +-
.../apache/tika/server/resource/TikaResource.java | 206 +++-
.../tika/server/resource/TikaServerStatus.java | 21 +-
.../java/org/apache/tika/server/CXFTestBase.java | 20 +-
.../apache/tika/server/MetricsResourceTest.java | 118 +++
.../tika/server/RecursiveMetadataResourceTest.java | 143 ++-
.../org/apache/tika/server/StackTraceTest.java | 17 +-
.../server/TikaResourceMetadataFilterTest.java | 83 ++
.../tika/server/TikaResourceNoStackTest.java | 98 ++
.../org/apache/tika/server/TikaResourceTest.java | 322 +++++-
.../tika/server/TikaServerIntegrationTest.java | 55 +
.../resources/configs/metadata-filter-include.xml | 14 +-
.../mock/{fake_oom.xml => hello_world.xml} | 4 +-
.../mock/{fake_oom.xml => hello_world_long.xml} | 8 +-
tika-translate/pom.xml | 45 +-
.../tika/language/translate/RTGTranslator.java | 143 +++
.../org.apache.tika.language.translate.Translator | 1 +
...eTranslatorTest.java => RTGTranslatorTest.java} | 47 +-
tika-xmp/pom.xml | 2 +-
120 files changed, 4708 insertions(+), 1730 deletions(-)
create mode 100644 tika-core/src/main/assembly/test-jar-with-dependencies.xml
copy tika-core/src/main/java/org/apache/tika/exception/{TikaException.java => RuntimeSAXException.java} (77%)
create mode 100644 tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java
create mode 100644 tika-core/src/main/java/org/apache/tika/parser/AbstractExternalProcessParser.java
copy tika-core/src/{main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java => test/java/org/apache/tika/parser/mock/MockParserTest.java} (58%)
copy tika-parsers/src/test/resources/test-documents/mock/fake_oom.xml => tika-core/src/test/resources/test-documents/mock_fakeload.xml (73%)
copy tika-batch/src/test/resources/test-input/max_restarts/test1_oom.xml => tika-core/src/test/resources/test-documents/mock_times.xml (86%)
copy tika-parsers/src/main/java/org/apache/tika/parser/odf/{OpenDocumentMacroHandler.java => OpenDocumentManifestHandler.java} (50%)
copy tika-parsers/src/test/resources/org/apache/tika/{config/TIKA-1558-excludesub.xml => parser/pdf/tika-skip-annotations-config.xml} (78%)
create mode 100644 tika-parsers/src/test/resources/test-documents/gbk.zip
create mode 100644 tika-parsers/src/test/resources/test-documents/testODTEncrypted.odt
create mode 100644 tika-parsers/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf
create mode 100644 tika-parsers/src/test/resources/test-documents/testXPSWithDataDescriptor.xps
create mode 100644 tika-parsers/src/test/resources/test-documents/testXPSWithDataDescriptor2.xps
create mode 100644 tika-server/src/main/java/org/apache/tika/server/ProduceTypeResourceComparator.java
create mode 100644 tika-server/src/main/java/org/apache/tika/server/mbean/MBeanHelper.java
create mode 100644 tika-server/src/main/java/org/apache/tika/server/mbean/ServerStatusExporter.java
copy tika-core/src/main/java/org/apache/tika/metadata/Geographic.java => tika-server/src/main/java/org/apache/tika/server/mbean/ServerStatusExporterMBean.java (50%)
create mode 100644 tika-server/src/main/java/org/apache/tika/server/metrics/Log4JMetrics.java
create mode 100644 tika-server/src/main/java/org/apache/tika/server/metrics/MetricsHelper.java
create mode 100644 tika-server/src/main/java/org/apache/tika/server/metrics/MetricsResource.java
create mode 100644 tika-server/src/main/java/org/apache/tika/server/metrics/ServerStatusMetrics.java
create mode 100644 tika-server/src/test/java/org/apache/tika/server/MetricsResourceTest.java
create mode 100644 tika-server/src/test/java/org/apache/tika/server/TikaResourceMetadataFilterTest.java
create mode 100644 tika-server/src/test/java/org/apache/tika/server/TikaResourceNoStackTest.java
copy tika-core/src/test/resources/org/apache/tika/config/TIKA-1986-bad-parameters.xml => tika-server/src/test/resources/configs/metadata-filter-include.xml (68%)
copy tika-server/src/test/resources/mock/{fake_oom.xml => hello_world.xml} (83%)
copy tika-server/src/test/resources/mock/{fake_oom.xml => hello_world_long.xml} (58%)
create mode 100644 tika-translate/src/main/java/org/apache/tika/language/translate/RTGTranslator.java
copy tika-translate/src/test/java/org/apache/tika/language/translate/{GoogleTranslatorTest.java => RTGTranslatorTest.java} (62%)
[tika] 03/03: TIKA-3164 -- further tweaks
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-3164-1.x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit bc9cca8761d20bd9a9f2f54ebcaf89ba093e2c82
Author: tallison <ta...@apache.org>
AuthorDate: Tue Jun 29 10:09:41 2021 -0400
TIKA-3164 -- further tweaks
---
tika-bundle/pom.xml | 23 ++++++++++++++++++-
tika-eval/pom.xml | 26 +++++++++++++++++++++-
tika-parsers/pom.xml | 20 +++++++++++++++++
.../tika/parser/microsoft/SummaryExtractor.java | 1 -
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 1 -
.../microsoft/ooxml/OOXMLExtractorFactory.java | 9 +++-----
.../parser/microsoft/ooxml/OOXMLParserTest.java | 2 --
tika-parsers/src/test/resources/log4j.properties | 2 +-
.../apache/tika/server/resource/TikaResource.java | 2 +-
9 files changed, 72 insertions(+), 14 deletions(-)
diff --git a/tika-bundle/pom.xml b/tika-bundle/pom.xml
index c90cc85..aecfb25 100644
--- a/tika-bundle/pom.xml
+++ b/tika-bundle/pom.xml
@@ -203,7 +203,8 @@
bcpkix-jdk15on|
poi|poi-scratchpad|
poi-ooxml|
- poi-ooxml-schemas|
+ poi-ooxml-lite|
+ log4j-api|
commons-math3|
curvesapi|
xmlbeans|
@@ -266,6 +267,7 @@
!org.apache.spark.ml.*,
!org.apache.spark.mllib.*,
!org.apache.spark.sql.*,
+ !com.github.javaparser.*,
org.apache.tika.mime,
org.apache.tika.fork,
android.util;resolution:=optional,
@@ -287,11 +289,14 @@
com.parso;resolution:=optional,
com.sleepycat.je;resolution:=optional,
com.sun.javadoc;resolution:=optional,
+ com.sun.org.apache.xml.internal.resolver;resolution:=optional,
+ com.sun.org.apache.xml.internal.resolver.tools;resolution:=optional,
com.sun.xml.bind.marshaller;resolution:=optional,
com.sun.xml.internal.bind.marshaller;resolution:=optional,
com.sun.msv.datatype;resolution:=optional,
com.sun.msv.datatype.xsd;resolution:=optional,
com.sun.tools.javadoc;resolution:=optional,
+ de.rototor.pdfbox.graphics2d;resolution:=optional,
edu.mit.ll.mitie;resolution:=optional,
edu.stanford.nlp.*;resolution:=optional,
edu.wisc.ssec.mcidas;resolution:=optional,
@@ -324,15 +329,22 @@
net.didion.jwnl;resolution:=optional,
net.sf.saxon;resolution:=optional,
net.sf.saxon.dom;resolution:=optional,
+ net.sf.saxon.lib;resolution:=optional,
+ net.sf.saxon.ma.map;resolution:=optional,
net.sf.saxon.om;resolution:=optional,
net.sf.saxon.query;resolution:=optional,
net.sf.saxon.sxpath;resolution:=optional,
+ net.sf.saxon.trans;resolution:=optional,
+ net.sf.saxon.tree.wrapper;resolution:=optional,
+ net.sf.saxon.type;resolution:=optional,
net.sf.saxon.value;resolution:=optional,
org.apache.batik.anim.dom;resolution:=optional,
org.apache.batik.bridge;resolution:=optional,
+ org.apache.batik.dom;resolution:=optional,
org.apache.batik.ext.awt;resolution:=optional,
org.apache.batik.ext.awt.image.renderable;resolution:=optional,
org.apache.batik.gvt;resolution:=optional,
+ org.apache.batik.svggen;resolution:=optional,
org.apache.batik.util;resolution:=optional,
org.apache.cxf.jaxrs.client;resolution:=optional,
org.apache.cxf.jaxrs.ext.multipart;resolution:=optional,
@@ -349,6 +361,14 @@
org.apache.commons.vfs2.util;resolution:=optional,
org.apache.crimson.jaxp;resolution:=optional,
org.apache.jcp.xml.dsig.internal.dom;resolution:=optional,
+ org.apache.logging.log4j;resolution:=optional,
+ org.apache.logging.log4j.message;resolution:=optional,
+ org.apache.logging.log4j.util;resolution:=optional,
+ org.apache.logging.log4j.util.internal;resolution:=optional,
+ org.apache.maven.model;resolution:=optional,
+ org.apache.maven.plugin;resolution:=optional,
+ org.apache.maven.plugin.logging;resolution:=optional,
+ org.apache.maven.project;resolution:=optional,
org.apache.pdfbox.debugger;resolution:=optional,
org.apache.pdfbox.preflight.*;resolution:=optional,
org.apache.sis;resolution:=optional,
@@ -438,6 +458,7 @@
org.slf4j.helpers;resolution:=optional,
org.sqlite;resolution:=optional,
org.w3c.dom;resolution:=optional,
+ org.w3c.dom.traversal;resolution:=optional,
org.relaxng.datatype;resolution:=optional,
org.xml.sax;resolution:=optional,
org.xml.sax.ext;resolution:=optional,
diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml
index d86c365..08359bf 100644
--- a/tika-eval/pom.xml
+++ b/tika-eval/pom.xml
@@ -124,11 +124,35 @@
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
</exclusion>
+ <exclusion>
+ <groupId>org.apache.xmlgraphics</groupId>
+ <artifactId>batik-all</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.xmlgraphics</groupId>
+ <artifactId>batik-bridge</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.xmlgraphics</groupId>
+ <artifactId>batik-svggen</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.xmlgraphics</groupId>
+ <artifactId>batik-codec</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.pdfbox</groupId>
+ <artifactId>pdfbox</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>jakarta.xml.bind</groupId>
+ <artifactId>jakarta.xml.bind-api</artifactId>
+ </exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
- <artifactId>poi-ooxml-schemas</artifactId>
+ <artifactId>poi-ooxml-lite</artifactId>
<version>${poi.version}</version>
</dependency>
<dependency>
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index cb0e473..693515e 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -275,6 +275,26 @@
<groupId>org.apache.xmlgraphics</groupId>
<artifactId>batik-all</artifactId>
</exclusion>
+ <exclusion>
+ <groupId>org.apache.xmlgraphics</groupId>
+ <artifactId>batik-bridge</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.xmlgraphics</groupId>
+ <artifactId>batik-svggen</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.xmlgraphics</groupId>
+ <artifactId>batik-codec</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.pdfbox</groupId>
+ <artifactId>pdfbox</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>jakarta.xml.bind</groupId>
+ <artifactId>jakarta.xml.bind-api</artifactId>
+ </exclusion>
</exclusions>
</dependency>
<dependency>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
index ba98c0e..30c472d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
@@ -24,7 +24,6 @@ import java.util.Set;
import org.apache.poi.hpsf.CustomProperties;
import org.apache.poi.hpsf.DocumentSummaryInformation;
-import org.apache.poi.hpsf.MarkUnsupportedException;
import org.apache.poi.hpsf.NoPropertySetStreamException;
import org.apache.poi.hpsf.PropertySet;
import org.apache.poi.hpsf.SummaryInformation;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index b404da7..ba43c31 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -380,7 +380,6 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
throws SAXException, IOException {
Metadata metadata = new Metadata();
metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);
-
// Get the name
String name = part.getPartName().getName();
metadata.set(
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index ddc607a..fd265f5 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -65,8 +65,6 @@ import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import static org.apache.poi.ooxml.extractor.POIXMLExtractorFactory.setThreadPrefersEventExtractors;
-
/**
* Figures out the correct {@link OOXMLExtractor} for the supplied document and
* returns it.
@@ -78,7 +76,7 @@ public class OOXMLExtractorFactory {
private static POIXMLExtractorFactory EXTRACTOR_FACTORY = new POIXMLExtractorFactory();
static {
- setThreadPrefersEventExtractors(true);
+ POIXMLExtractorFactory.setAllThreadsPreferEventExtractors(true);
}
public static void parse(
@@ -176,7 +174,6 @@ public class OOXMLExtractorFactory {
if (poiExtractor == null) {
poiExtractor = EXTRACTOR_FACTORY.create(pkg);
}
-
POIXMLDocument document = poiExtractor.getDocument();
if (poiExtractor instanceof XSSFBEventBasedExcelExtractor) {
extractor = new XSSFBExcelExtractorDecorator(context, poiExtractor, locale);
@@ -212,7 +209,6 @@ public class OOXMLExtractorFactory {
// Get the bulk of the metadata first, so that it's accessible during
// parsing if desired by the client (see TIKA-1109)
extractor.getMetadataExtractor().extract(metadata);
-
// Extract the text, along with any in-document metadata
extractor.getXHTML(baseHandler, metadata, context);
} catch (IllegalArgumentException e) {
@@ -291,7 +287,8 @@ public class OOXMLExtractorFactory {
//TODO make this static...or find what happened to SUPPORTED_TYPES
XSLFRelation[] xslfRelations = new XSLFRelation[] {
XSLFRelation.MAIN, XSLFRelation.MACRO, XSLFRelation.MACRO_TEMPLATE,
- XSLFRelation.PRESENTATIONML_TEMPLATE
+ XSLFRelation.PRESENTATIONML,
+ XSLFRelation.PRESENTATIONML_TEMPLATE, XSLFRelation.PRESENTATION_MACRO
};
for (int i = 0; i < xslfRelations.length; i++) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index bdbc9e4..3e007b9 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -31,7 +31,6 @@ import java.io.File;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.StringWriter;
-import java.nio.file.Path;
import java.text.DecimalFormatSymbols;
import java.util.Arrays;
import java.util.HashMap;
@@ -43,7 +42,6 @@ import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import org.apache.ctakes.typesystem.type.syntax.O;
import org.apache.poi.util.LocaleUtil;
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
diff --git a/tika-parsers/src/test/resources/log4j.properties b/tika-parsers/src/test/resources/log4j.properties
index f2c0b92..d557c48 100644
--- a/tika-parsers/src/test/resources/log4j.properties
+++ b/tika-parsers/src/test/resources/log4j.properties
@@ -21,4 +21,4 @@ log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
# Pattern to output the caller's file name and line number.
-log4j.appender.stdout.layout.ConversionPattern=%5p [%t] (%F:%L) - %m%n
+log4j.appender.stdout.layout.ConversionPattern=%5p [%t] (%F:%L) ----- %m%n
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
index b326f7f..c77694d 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
@@ -22,7 +22,6 @@ import org.apache.commons.lang3.StringUtils;
import org.apache.cxf.attachment.ContentDisposition;
import org.apache.cxf.jaxrs.ext.multipart.Attachment;
import org.apache.cxf.jaxrs.impl.MetadataMap;
-import org.apache.poi.ooxml.extractor.ExtractorFactory;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
@@ -55,6 +54,7 @@ import org.apache.tika.server.ServerStatus;
import org.apache.tika.server.TikaServerParseException;
import org.apache.tika.utils.ExceptionUtils;
+import org.apache.poi.extractor.ExtractorFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
[tika] 02/03: Merge remote-tracking branch 'origin/branch_1x' into
TIKA-3164-1.x
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-3164-1.x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 702cf3e7101f500a6f176e03921573671504acd3
Merge: ab86545 57f5912
Author: tallison <ta...@apache.org>
AuthorDate: Tue Jun 29 09:26:41 2021 -0400
Merge remote-tracking branch 'origin/branch_1x' into TIKA-3164-1.x
CHANGES.txt | 8 ++
tika-bundle/pom.xml | 17 +++-
.../tika/language/detect/LanguageDetector.java | 21 ++++-
.../java/org/apache/tika/mime/MimeTypesReader.java | 18 +++-
.../tika/parser/AbstractExternalProcessParser.java | 57 +++++++++++++
.../org/apache/tika/parser/mock/MockParser.java | 13 ++-
.../apache/tika/parser/mock/MockParserTest.java | 11 +++
.../test/resources/test-documents/mock_times.xml | 26 ++++++
tika-dl/pom.xml | 24 ++++++
tika-example/pom.xml | 6 +-
tika-langdetect/pom.xml | 10 ---
tika-parent/pom.xml | 41 ++++-----
tika-parsers/pom.xml | 56 +++---------
.../apache/tika/parser/ocr/TesseractOCRParser.java | 99 +++++++++++++---------
.../java/org/apache/tika/parser/pdf/PDFParser.java | 1 +
.../apache/tika/parser/pdf/PDFPreflightParser.java | 5 ++
.../org/apache/tika/parser/mp4/MP4ParserTest.java | 13 ++-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 13 +++
.../parser/pdf/tika-skip-annotations-config.xml | 28 ++++++
tika-server/pom.xml | 66 +--------------
.../java/org/apache/tika/server/TikaServerCli.java | 60 +++++++++----
.../org/apache/tika/server/TikaServerWatchDog.java | 74 ++++++++++++----
.../tika/server/TikaServerIntegrationTest.java | 55 ++++++++++++
tika-translate/pom.xml | 47 +---------
24 files changed, 500 insertions(+), 269 deletions(-)
diff --cc tika-parent/pom.xml
index 71a5928,1569910..6d3fadc
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@@ -334,11 -334,11 +334,11 @@@
<maven.shade.version>3.2.4</maven.shade.version>
<rat.version>0.13</rat.version>
<!-- NOTE: sync tukaani version with commons-compress in tika-parsers -->
- <poi.version>4.1.2</poi.version>
+ <poi.version>5.0.1-SNAPSHOT</poi.version>
<commons.compress.version>1.20</commons.compress.version>
- <commons.io.version>2.8.0</commons.io.version>
+ <commons.io.version>2.10.0</commons.io.version>
<commons.lang3.version>3.12.0</commons.lang3.version>
- <gson.version>2.8.6</gson.version>
+ <gson.version>2.8.7</gson.version>
<guava.version>30.1.1-jre</guava.version>
<osgi.core.version>6.0.0</osgi.core.version>
[tika] 01/03: Merge remote-tracking branch 'origin/branch_1x' into
TIKA-3164-1.x
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-3164-1.x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit ab86545afec5cef3a05e546e4e667ea843a738ae
Merge: 7aa2732 ba72c86
Author: tallison <ta...@apache.org>
AuthorDate: Wed May 5 10:32:48 2021 -0400
Merge remote-tracking branch 'origin/branch_1x' into TIKA-3164-1.x
# Conflicts:
# tika-parent/pom.xml
# tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
CHANGES.txt | 34 +-
NOTICE.txt | 24 +
pom.xml | 2 +-
tika-app/pom.xml | 2 +-
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 2 +-
.../src/main/java/org/apache/tika/gui/TikaGUI.java | 3 -
tika-batch/pom.xml | 2 +-
.../batch/fs/RecursiveParserWrapperFSConsumer.java | 2 +-
tika-bundle/pom.xml | 6 +-
tika-core/pom.xml | 377 ++++---
.../main/assembly/test-jar-with-dependencies.xml | 38 +
.../apache/tika/exception/RuntimeSAXException.java | 29 +
.../tika/exception/WriteLimitReachedException.java | 88 ++
.../main/java/org/apache/tika/metadata/XMPDM.java | 1 +
.../org/apache/tika/parser/CompositeParser.java | 4 +
.../apache/tika/parser/RecursiveParserWrapper.java | 120 ++-
.../sax/AbstractRecursiveParserWrapperHandler.java | 11 +-
.../tika/sax/BasicContentHandlerFactory.java | 3 +
.../tika/sax/RecursiveParserWrapperHandler.java | 9 +-
.../org/apache/tika/sax/SecureContentHandler.java | 2 +-
.../org/apache/tika/mime/tika-mimetypes.xml | 7 +
.../src/test/java/org/apache/tika/TikaTest.java | 15 +-
.../tika/detect/FileCommandDetectorTest.java | 40 +-
.../org/apache/tika/parser/mock/MockParser.java | 163 ++-
.../apache/tika/parser/mock/MockParserTest.java | 30 +
.../resources/test-documents/mock_fakeload.xml | 29 +
tika-dl/pom.xml | 10 +-
tika-eval/pom.xml | 33 +-
.../java/org/apache/tika/eval/ExtractComparer.java | 2 +-
.../java/org/apache/tika/eval/FileProfiler.java | 25 +-
.../src/main/resources/lucene-char-mapping.txt | 3 +-
.../resources/tika-eval-file-profiler-config.xml | 10 +-
tika-example/pom.xml | 4 +-
tika-fuzzing/pom.xml | 2 +-
tika-java7/pom.xml | 2 +-
tika-langdetect/pom.xml | 6 +-
tika-nlp/pom.xml | 6 +-
tika-parent/pom.xml | 1103 ++++++++++----------
tika-parsers/pom.xml | 37 +-
.../apache/tika/parser/asm/XHTMLClassVisitor.java | 17 +-
.../java/org/apache/tika/parser/chm/ChmParser.java | 5 +-
.../org/apache/tika/parser/crypto/Pkcs7Parser.java | 4 +-
.../org/apache/tika/parser/crypto/TSDParser.java | 11 +-
.../tika/parser/microsoft/JackcessExtractor.java | 7 +
.../tika/parser/microsoft/JackcessParser.java | 2 +-
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 11 +-
.../microsoft/ooxml/OOXMLExtractorFactory.java | 9 +-
.../microsoft/ooxml/OOXMLTikaBodyPartHandler.java | 356 +++----
.../ooxml/OOXMLWordAndPowerPointTextHandler.java | 50 +-
.../ooxml/XSSFExcelExtractorDecorator.java | 11 +-
.../microsoft/ooxml/xps/XPSExtractorDecorator.java | 5 +
.../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 9 +
.../microsoft/xml/AbstractXML2003Parser.java | 5 +-
.../java/org/apache/tika/parser/mp3/Mp3Parser.java | 13 +-
.../java/org/apache/tika/parser/mp4/MP4Parser.java | 26 +-
.../parser/odf/OpenDocumentManifestHandler.java | 54 +
.../apache/tika/parser/odf/OpenDocumentParser.java | 110 +-
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 77 +-
.../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 2 +
.../java/org/apache/tika/parser/pdf/PDFParser.java | 2 +-
.../org/apache/tika/parser/pkg/PackageParser.java | 59 +-
.../parser/pkg/StreamingZipContainerDetector.java | 171 +--
.../tika/parser/pkg/ZipContainerDetector.java | 18 +-
.../org/apache/tika/parser/utils/ZipSalvager.java | 96 +-
.../tika/config/TikaEncodingDetectorTest.java | 6 +-
.../tika/detect/TestContainerAwareDetector.java | 2 +-
.../tika/parser/RecursiveParserWrapperTest.java | 15 +-
.../parser/microsoft/ooxml/xps/XPSParserTest.java | 45 +
.../org/apache/tika/parser/mp3/Mp3ParserTest.java | 6 +-
.../org/apache/tika/parser/odf/ODFParserTest.java | 68 ++
.../org/apache/tika/parser/pdf/PDFParserTest.java | 11 +
.../apache/tika/parser/pkg/PackageParserTest.java | 13 +-
.../tika/parser/pkg/ZipContainerDetectorTest.java | 2 +-
.../src/test/resources/test-documents/gbk.zip | Bin 0 -> 432 bytes
.../resources/test-documents/testODTEncrypted.odt | Bin 0 -> 12714 bytes
.../testPDF_deeplyEmbeddedAttachments.pdf | Bin 0 -> 122221 bytes
.../test-documents/testXPSWithDataDescriptor.xps | Bin 0 -> 44523 bytes
.../test-documents/testXPSWithDataDescriptor2.xps | Bin 0 -> 51175 bytes
tika-serialization/pom.xml | 2 +-
tika-server/pom.xml | 88 +-
.../tika/server/ProduceTypeResourceComparator.java | 145 +++
.../java/org/apache/tika/server/TikaServerCli.java | 35 +-
.../org/apache/tika/server/mbean/MBeanHelper.java | 60 ++
.../tika/server/mbean/ServerStatusExporter.java | 83 ++
.../server/mbean/ServerStatusExporterMBean.java | 61 ++
.../apache/tika/server/metrics/Log4JMetrics.java | 212 ++++
.../apache/tika/server/metrics/MetricsHelper.java | 220 ++++
.../tika/server/metrics/MetricsResource.java | 53 +
.../tika/server/metrics/ServerStatusMetrics.java | 61 ++
.../server/resource/RecursiveMetadataResource.java | 8 +-
.../apache/tika/server/resource/TikaResource.java | 204 +++-
.../tika/server/resource/TikaServerStatus.java | 21 +-
.../java/org/apache/tika/server/CXFTestBase.java | 20 +-
.../apache/tika/server/MetricsResourceTest.java | 118 +++
.../tika/server/RecursiveMetadataResourceTest.java | 143 ++-
.../org/apache/tika/server/StackTraceTest.java | 17 +-
.../server/TikaResourceMetadataFilterTest.java | 83 ++
.../tika/server/TikaResourceNoStackTest.java | 98 ++
.../org/apache/tika/server/TikaResourceTest.java | 322 +++++-
.../resources/configs/metadata-filter-include.xml | 30 +
.../src/test/resources/mock/hello_world.xml | 26 +
.../src/test/resources/mock/hello_world_long.xml | 30 +
tika-translate/pom.xml | 8 +-
.../tika/language/translate/RTGTranslator.java | 143 +++
.../org.apache.tika.language.translate.Translator | 1 +
.../tika/language/translate/RTGTranslatorTest.java | 64 ++
tika-xmp/pom.xml | 2 +-
107 files changed, 4519 insertions(+), 1423 deletions(-)
diff --cc tika-parent/pom.xml
index baccf43,7d3cb93..71a5928
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@@ -20,337 -20,338 +20,338 @@@
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
- <modelVersion>4.0.0</modelVersion>
+ <modelVersion>4.0.0</modelVersion>
- <parent>
- <groupId>org.apache</groupId>
- <artifactId>apache</artifactId>
- <version>17</version>
- <relativePath />
- </parent>
+ <parent>
+ <groupId>org.apache</groupId>
+ <artifactId>apache</artifactId>
+ <version>17</version>
+ <relativePath />
+ </parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parent</artifactId>
- <version>1.26-SNAPSHOT</version>
- <packaging>pom</packaging>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parent</artifactId>
+ <version>1.27-SNAPSHOT</version>
+ <packaging>pom</packaging>
- <name>Apache Tika parent</name>
- <description>
- Apache Tika is a toolkit for detecting and extracting metadata and
- structured text content from various documents using existing parser
- libraries.
- </description>
- <inceptionYear>2007</inceptionYear>
+ <name>Apache Tika parent</name>
+ <description>
+ Apache Tika is a toolkit for detecting and extracting metadata and
+ structured text content from various documents using existing parser
+ libraries.
+ </description>
+ <inceptionYear>2007</inceptionYear>
- <url>http://tika.apache.org/</url>
+ <url>http://tika.apache.org/</url>
- <issueManagement>
- <system>JIRA</system>
- <url>https://issues.apache.org/jira/browse/TIKA</url>
- </issueManagement>
+ <issueManagement>
+ <system>JIRA</system>
+ <url>https://issues.apache.org/jira/browse/TIKA</url>
+ </issueManagement>
- <mailingLists>
- <mailingList>
- <name>Development mailing list</name>
- <subscribe>dev-subscribe@tika.apache.org</subscribe>
- <unsubscribe>dev-unsubscribe@tika.apache.org</unsubscribe>
- <post>dev@tika.apache.org</post>
- <archive>https://lists.apache.org/list.html?dev@tika.apache.org</archive>
- <otherArchives>
- <otherArchive>http://mail-archives.apache.org/mod_mbox/tika-dev/</otherArchive>
- <otherArchive>http://mail-archives.apache.org/mod_mbox/lucene-tika-dev</otherArchive>
- <otherArchive>http://www.mail-archive.com/dev@tika.apache.org</otherArchive>
- <otherArchive>http://www.mail-archive.com/tika-dev@lucene.apache.org/</otherArchive>
- <otherArchive>http://www.mail-archive.com/tika-dev@incubator.apache.org/</otherArchive>
- <otherArchive>http://www.nabble.com/Apache-Tika---Development-f20913.html</otherArchive>
- <otherArchive>http://news.gmane.org/gmane.comp.apache.tika.devel</otherArchive>
- <otherArchive>http://tika.markmail.org/</otherArchive>
- </otherArchives>
- </mailingList>
- <mailingList>
- <name>Commit mailing list</name>
- <subscribe>commits-subscribe@tika.apache.org</subscribe>
- <unsubscribe>commits-unsubscribe@tika.apache.org</unsubscribe>
- <post>commits@tika.apache.org</post>
- <archive>https://lists.apache.org/list.html?commits@tika.apache.org</archive>
- <otherArchives>
- <otherArchive>http://mail-archives.apache.org/mod_mbox/tika-commits/</otherArchive>
- <otherArchive>http://mail-archives.apache.org/mod_mbox/lucene-tika-commits/</otherArchive>
- <otherArchive>http://www.mail-archive.com/tika-commits@lucene.apache.org/</otherArchive>
- <otherArchive>http://www.mail-archive.com/tika-commits@incubator.apache.org/</otherArchive>
- </otherArchives>
- </mailingList>
- <mailingList>
- <name>User mailing list</name>
- <subscribe>user-subscribe@tika.apache.org</subscribe>
- <unsubscribe>user-unsubscribe@tika.apache.org</unsubscribe>
- <post>user@tika.apache.org</post>
- <archive>https://lists.apache.org/list.html?user@tika.apache.org</archive>
- <otherArchives>
- <otherArchive>http://mail-archives.apache.org/mod_mbox/tika-user/</otherArchive>
- <otherArchive>http://mail-archives.apache.org/mod_mbox/lucene-tika-user/</otherArchive>
- <otherArchive>http://www.mail-archive.com/tika-user@lucene.apache.org/</otherArchive>
- </otherArchives>
- </mailingList>
- </mailingLists>
+ <mailingLists>
+ <mailingList>
+ <name>Development mailing list</name>
+ <subscribe>dev-subscribe@tika.apache.org</subscribe>
+ <unsubscribe>dev-unsubscribe@tika.apache.org</unsubscribe>
+ <post>dev@tika.apache.org</post>
+ <archive>https://lists.apache.org/list.html?dev@tika.apache.org</archive>
+ <otherArchives>
+ <otherArchive>http://mail-archives.apache.org/mod_mbox/tika-dev/</otherArchive>
+ <otherArchive>http://mail-archives.apache.org/mod_mbox/lucene-tika-dev</otherArchive>
+ <otherArchive>http://www.mail-archive.com/dev@tika.apache.org</otherArchive>
+ <otherArchive>http://www.mail-archive.com/tika-dev@lucene.apache.org/</otherArchive>
+ <otherArchive>http://www.mail-archive.com/tika-dev@incubator.apache.org/</otherArchive>
+ <otherArchive>http://www.nabble.com/Apache-Tika---Development-f20913.html</otherArchive>
+ <otherArchive>http://news.gmane.org/gmane.comp.apache.tika.devel</otherArchive>
+ <otherArchive>http://tika.markmail.org/</otherArchive>
+ </otherArchives>
+ </mailingList>
+ <mailingList>
+ <name>Commit mailing list</name>
+ <subscribe>commits-subscribe@tika.apache.org</subscribe>
+ <unsubscribe>commits-unsubscribe@tika.apache.org</unsubscribe>
+ <post>commits@tika.apache.org</post>
+ <archive>https://lists.apache.org/list.html?commits@tika.apache.org</archive>
+ <otherArchives>
+ <otherArchive>http://mail-archives.apache.org/mod_mbox/tika-commits/</otherArchive>
+ <otherArchive>http://mail-archives.apache.org/mod_mbox/lucene-tika-commits/</otherArchive>
+ <otherArchive>http://www.mail-archive.com/tika-commits@lucene.apache.org/</otherArchive>
+ <otherArchive>http://www.mail-archive.com/tika-commits@incubator.apache.org/</otherArchive>
+ </otherArchives>
+ </mailingList>
+ <mailingList>
+ <name>User mailing list</name>
+ <subscribe>user-subscribe@tika.apache.org</subscribe>
+ <unsubscribe>user-unsubscribe@tika.apache.org</unsubscribe>
+ <post>user@tika.apache.org</post>
+ <archive>https://lists.apache.org/list.html?user@tika.apache.org</archive>
+ <otherArchives>
+ <otherArchive>http://mail-archives.apache.org/mod_mbox/tika-user/</otherArchive>
+ <otherArchive>http://mail-archives.apache.org/mod_mbox/lucene-tika-user/</otherArchive>
+ <otherArchive>http://www.mail-archive.com/tika-user@lucene.apache.org/</otherArchive>
+ </otherArchives>
+ </mailingList>
+ </mailingLists>
- <developers>
- <developer>
- <name>Rida Benjelloun</name>
- <id>ridabenjelloun</id>
- <email>ridabenjelloun@apache.org</email>
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- <developer>
- <name>Keith Bennett</name>
- <id>kbennett</id>
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- <developer>
- <name>Mark Harwood</name>
- <id>mharwood</id>
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- <developer>
- <name>Ken Krugler</name>
- <id>kkrugler</id>
- <email>kkrugler@apache.org</email>
- <url>http://ken-blog.krugler.org</url>
- <organization>Scale Unlimited</organization>
- <organizationUrl>http://www.scaleunlimited.com</organizationUrl>
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- <developer>
- <name>Chris A. Mattmann</name>
- <id>mattmann</id>
- <email>mattmann@apache.org</email>
- <url>http://people.apache.org/~mattmann/</url>
- <organization>NASA Jet Propulsion Laboratory</organization>
- <organizationUrl>http://www.jpl.nasa.gov</organizationUrl>
- <timezone>-8</timezone>
- <properties />
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- <developer>
- <name>Michael McCandless</name>
- <id>mikemccand</id>
- <email>mikemccand@apache.org</email>
- <organization>IBM</organization>
- <properties />
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- <developer>
- <name>Dave Meikle</name>
- <id>dmeikle</id>
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- <developer>
- <name>Sami Siren</name>
- <id>siren</id>
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- <developer>
- <name>Jukka Zitting</name>
- <id>jukka</id>
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- <developer>
- <name>Nick Burch</name>
- <id>nick</id>
- <organization>Alfresco</organization>
- <organizationUrl>http://alfresco.com</organizationUrl>
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- <developer>
- <name>Maxim Valyanskiy</name>
- <id>maxcom</id>
- <organization>Jet Infosystems</organization>
- <roles>
- <role>committer</role>
- </roles>
- <timezone>+3</timezone>
- </developer>
- <developer>
- <name>Oleg Tikhonov</name>
- <id>oleg</id>
- <roles>
- <role>committer</role>
- </roles>
- <timezone>+2</timezone>
- </developer>
- <developer>
- <name>Ray Gauss II</name>
- <id>rgauss</id>
- <organization>Alfresco</organization>
- <organizationUrl>http://alfresco.com</organizationUrl>
- <timezone>-5</timezone>
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- <developer>
- <name>Tyler Palsulich</name>
- <id>tpalsulich</id>
- <timezone>-8</timezone>
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- <developer>
- <name>Tim Allison</name>
- <id>tallison</id>
- <timezone>-5</timezone>
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- <developer>
- <name>Konstantin Gribov</name>
- <id>grossws</id>
- <timezone>+3</timezone>
- <roles>
- <role>committer</role>
- </roles>
- </developer>
- </developers>
- <contributors>
- <contributor>
- <name>Doug Cutting</name>
- <roles>
- <role>mentor</role>
- </roles>
- </contributor>
- <contributor>
- <name>Bertrand Delacretaz</name>
- <roles>
- <role>mentor</role>
- </roles>
- </contributor>
- <contributor>
- <name>Niall Pemberton</name>
- <roles>
- <role>emeritus</role>
- </roles>
- </contributor>
- </contributors>
+ <developers>
+ <developer>
+ <name>Rida Benjelloun</name>
+ <id>ridabenjelloun</id>
+ <email>ridabenjelloun@apache.org</email>
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ <developer>
+ <name>Keith Bennett</name>
+ <id>kbennett</id>
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ <developer>
+ <name>Mark Harwood</name>
+ <id>mharwood</id>
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ <developer>
+ <name>Ken Krugler</name>
+ <id>kkrugler</id>
+ <email>kkrugler@apache.org</email>
+ <url>http://ken-blog.krugler.org</url>
+ <organization>Scale Unlimited</organization>
+ <organizationUrl>http://www.scaleunlimited.com</organizationUrl>
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ <developer>
+ <name>Chris A. Mattmann</name>
+ <id>mattmann</id>
+ <email>mattmann@apache.org</email>
+ <url>http://people.apache.org/~mattmann/</url>
+ <organization>NASA Jet Propulsion Laboratory</organization>
+ <organizationUrl>http://www.jpl.nasa.gov</organizationUrl>
+ <timezone>-8</timezone>
+ <properties />
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ <developer>
+ <name>Michael McCandless</name>
+ <id>mikemccand</id>
+ <email>mikemccand@apache.org</email>
+ <organization>IBM</organization>
+ <properties />
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ <developer>
+ <name>Dave Meikle</name>
+ <id>dmeikle</id>
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ <developer>
+ <name>Sami Siren</name>
+ <id>siren</id>
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ <developer>
+ <name>Jukka Zitting</name>
+ <id>jukka</id>
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ <developer>
+ <name>Nick Burch</name>
+ <id>nick</id>
+ <organization>Alfresco</organization>
+ <organizationUrl>http://alfresco.com</organizationUrl>
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ <developer>
+ <name>Maxim Valyanskiy</name>
+ <id>maxcom</id>
+ <organization>Jet Infosystems</organization>
+ <roles>
+ <role>committer</role>
+ </roles>
+ <timezone>+3</timezone>
+ </developer>
+ <developer>
+ <name>Oleg Tikhonov</name>
+ <id>oleg</id>
+ <roles>
+ <role>committer</role>
+ </roles>
+ <timezone>+2</timezone>
+ </developer>
+ <developer>
+ <name>Ray Gauss II</name>
+ <id>rgauss</id>
+ <organization>Alfresco</organization>
+ <organizationUrl>http://alfresco.com</organizationUrl>
+ <timezone>-5</timezone>
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ <developer>
+ <name>Tyler Palsulich</name>
+ <id>tpalsulich</id>
+ <timezone>-8</timezone>
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ <developer>
+ <name>Tim Allison</name>
+ <id>tallison</id>
+ <timezone>-5</timezone>
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ <developer>
+ <name>Konstantin Gribov</name>
+ <id>grossws</id>
+ <timezone>+3</timezone>
+ <roles>
+ <role>committer</role>
+ </roles>
+ </developer>
+ </developers>
+ <contributors>
+ <contributor>
+ <name>Doug Cutting</name>
+ <roles>
+ <role>mentor</role>
+ </roles>
+ </contributor>
+ <contributor>
+ <name>Bertrand Delacretaz</name>
+ <roles>
+ <role>mentor</role>
+ </roles>
+ </contributor>
+ <contributor>
+ <name>Niall Pemberton</name>
+ <roles>
+ <role>emeritus</role>
+ </roles>
+ </contributor>
+ </contributors>
- <dependencyManagement>
- <dependencies>
- <dependency>
- <groupId>biz.aQute</groupId>
- <artifactId>bndlib</artifactId>
- <version>1.50.0</version>
- </dependency>
- <dependency>
- <groupId>org.apache.felix</groupId>
- <artifactId>org.apache.felix.scr.annotations</artifactId>
- <version>1.12.0</version>
- </dependency>
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <version>4.13.1</version>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-api</artifactId>
- <version>${slf4j.version}</version>
- </dependency>
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-log4j12</artifactId>
- <version>${slf4j.version}</version>
- </dependency>
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-simple</artifactId>
- <version>${slf4j.version}</version>
- </dependency>
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>jcl-over-slf4j</artifactId>
- <version>${slf4j.version}</version>
- </dependency>
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>jul-to-slf4j</artifactId>
- <version>${slf4j.version}</version>
- </dependency>
+ <dependencyManagement>
+ <dependencies>
+ <dependency>
+ <groupId>biz.aQute</groupId>
+ <artifactId>bndlib</artifactId>
+ <version>1.50.0</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>org.apache.felix.scr.annotations</artifactId>
+ <version>1.12.0</version>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.13.2</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ <version>${slf4j.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ <version>${slf4j.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-simple</artifactId>
+ <version>${slf4j.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>jcl-over-slf4j</artifactId>
+ <version>${slf4j.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>jul-to-slf4j</artifactId>
+ <version>${slf4j.version}</version>
+ </dependency>
- <dependency>
- <groupId>javax.annotation</groupId>
- <artifactId>javax.annotation-api</artifactId>
- <version>1.3.2</version>
- </dependency>
- <dependency>
- <groupId>javax.xml.soap</groupId>
- <artifactId>javax.xml.soap-api</artifactId>
- <version>1.4.0</version>
- </dependency>
- <dependency>
- <groupId>org.jvnet.staxex</groupId>
- <artifactId>stax-ex</artifactId>
- <version>1.8.3</version>
- </dependency>
- </dependencies>
- </dependencyManagement>
+ <dependency>
+ <groupId>javax.annotation</groupId>
+ <artifactId>javax.annotation-api</artifactId>
+ <version>1.3.2</version>
+ </dependency>
+ <dependency>
+ <groupId>javax.xml.soap</groupId>
+ <artifactId>javax.xml.soap-api</artifactId>
+ <version>1.4.0</version>
+ </dependency>
+ <dependency>
+ <groupId>org.jvnet.staxex</groupId>
+ <artifactId>stax-ex</artifactId>
+ <version>2.0.0</version>
+ </dependency>
+ </dependencies>
+ </dependencyManagement>
- <properties>
- <maven.compiler.source>1.8</maven.compiler.source>
- <maven.compiler.target>1.8</maven.compiler.target>
- <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
- <project.reporting.outputEncoding>${project.build.sourceEncoding}</project.reporting.outputEncoding>
- <!-- plugin versions -->
- <forbiddenapis.version>3.1</forbiddenapis.version>
- <groovy.maven.version>2.1.1</groovy.maven.version>
- <maven.antrun.version>1.8</maven.antrun.version>
- <maven.assembly.version>3.3.0</maven.assembly.version>
- <maven.bundle.version>5.1.1</maven.bundle.version>
- <maven.failsafe.version>2.22.2</maven.failsafe.version>
- <maven.javadoc.version>3.1.1</maven.javadoc.version>
- <maven.scr.version>1.26.4</maven.scr.version>
- <maven.surefire.version>3.0.0-M4</maven.surefire.version>
- <maven.shade.version>3.2.4</maven.shade.version>
- <rat.version>0.13</rat.version>
- <!-- NOTE: sync tukaani version with commons-compress in tika-parsers -->
- <poi.version>5.0.0</poi.version>
- <commons.compress.version>1.20</commons.compress.version>
- <commons.io.version>2.8.0</commons.io.version>
- <commons.lang3.version>3.11</commons.lang3.version>
- <gson.version>2.8.6</gson.version>
- <guava.version>30.1-jre</guava.version>
- <osgi.core.version>6.0.0</osgi.core.version>
+ <properties>
+ <maven.compiler.source>1.8</maven.compiler.source>
+ <maven.compiler.target>1.8</maven.compiler.target>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ <project.reporting.outputEncoding>${project.build.sourceEncoding}</project.reporting.outputEncoding>
+ <!-- plugin versions -->
+ <forbiddenapis.version>3.1</forbiddenapis.version>
+ <groovy.maven.version>2.1.1</groovy.maven.version>
+ <maven.antrun.version>1.8</maven.antrun.version>
+ <maven.assembly.version>3.3.0</maven.assembly.version>
+ <maven.bundle.version>5.1.1</maven.bundle.version>
+ <maven.failsafe.version>2.22.2</maven.failsafe.version>
+ <maven.javadoc.version>3.1.1</maven.javadoc.version>
+ <maven.scr.version>1.26.4</maven.scr.version>
+ <maven.surefire.version>3.0.0-M4</maven.surefire.version>
+ <maven.shade.version>3.2.4</maven.shade.version>
+ <rat.version>0.13</rat.version>
+ <!-- NOTE: sync tukaani version with commons-compress in tika-parsers -->
- <poi.version>4.1.2</poi.version>
++ <poi.version>5.0.1-SNAPSHOT</poi.version>
+ <commons.compress.version>1.20</commons.compress.version>
+ <commons.io.version>2.8.0</commons.io.version>
+ <commons.lang3.version>3.12.0</commons.lang3.version>
+ <gson.version>2.8.6</gson.version>
+ <guava.version>30.1.1-jre</guava.version>
+ <osgi.core.version>6.0.0</osgi.core.version>
- <cxf.version>3.4.2</cxf.version>
+ <cxf.version>3.4.3</cxf.version>
<slf4j.version>1.7.30</slf4j.version>
- <jackson.version>2.12.1</jackson.version>
+ <log4j.version>1.2.17</log4j.version>
+ <jackson.version>2.12.2</jackson.version>
<!-- when this is next upgraded, see if we can get rid of
javax.activation dependency in tika-server -->
- <jaxb.version>2.3.3</jaxb.version>
+ <jaxb.version>3.0.0</jaxb.version>
<cli.version>1.4</cli.version>
- <lucene.version>8.7.0</lucene.version>
- <mockito.version>3.7.7</mockito.version>
+ <lucene.version>8.8.1</lucene.version>
+ <mockito.version>3.8.0</mockito.version>
<opennlp.version>1.9.3</opennlp.version>
</properties>
diff --cc tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index e2dc17e,51b36c2..ddc607a
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@@ -22,8 -22,10 +22,9 @@@ import java.io.IOException
import java.io.InputStream;
import java.util.Locale;
+ import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
-import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.poi.ooxml.POIXMLDocument;
-import org.apache.poi.ooxml.extractor.ExtractorFactory;
+import org.apache.poi.ooxml.extractor.POIXMLExtractorFactory;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.InvalidOperationException;
@@@ -113,10 -112,10 +116,10 @@@ public class OOXMLExtractorFactory
true, false)) {
try {
pkg = OPCPackage.open(rereadableInputStream);
- } catch (EOFException e) {
- } catch (EOFException|UnsupportedZipFeatureException e) {
++ } catch (EOFException| UnsupportedZipFeatureException e) {
rereadableInputStream.rewind();
tmpRepairedCopy = File.createTempFile("tika-ooxml-repair-", "");
- ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy);
+ ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy, false);
//if there isn't enough left to be opened as a package
//throw an exception -- we may want to fall back to streaming
//parsing