You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/07/13 15:58:31 UTC
[tika] branch main updated: TIKA-3475 -- downgrade mime4j and adjust exception handling change in commons-compress
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new d71d4ab TIKA-3475 -- downgrade mime4j and adjust exception handling change in commons-compress
d71d4ab is described below
commit d71d4abfdc911b4f5227b0afe22c30d814bb15ec
Author: tallison <ta...@apache.org>
AuthorDate: Tue Jul 13 11:58:10 2021 -0400
TIKA-3475 -- downgrade mime4j and adjust exception handling change in commons-compress
---
tika-parent/pom.xml | 3 ++-
.../parser/microsoft/ooxml/OOXMLExtractorFactory.java | 16 ++++++++++++----
.../java/org/apache/tika/parser/epub/EpubParser.java | 3 +--
3 files changed, 15 insertions(+), 7 deletions(-)
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 40c4cec..cec512b 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -331,7 +331,8 @@
<lucene.version>8.9.0</lucene.version>
<metadata.extractor.version>2.16.0</metadata.extractor.version>
<microsoft.translator.version>0.6.2</microsoft.translator.version>
- <mime4j.version>0.8.5</mime4j.version>
+ <!-- mime4j 0.8.5 is built with Java 11 and does not work with Java 8 -->
+ <mime4j.version>0.8.4</mime4j.version>
<mockito.version>3.11.2</mockito.version>
<netcdf-java.version>4.5.5</netcdf-java.version>
<opencsv.version>2.3</opencsv.version>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 03a1b1a..c1fa9f0 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -108,7 +108,11 @@ public class OOXMLExtractorFactory {
MAX_BUFFER_LENGTH, false)) {
try {
pkg = OPCPackage.open(new CloseShieldInputStream(rereadableInputStream));
- } catch (EOFException e) {
+ } catch (UnsupportedZipFeatureException e) {
+ if (e.getFeature() !=
+ UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) {
+ throw e;
+ }
rereadableInputStream.rewind();
tmpRepairedCopy = File.createTempFile("tika-ooxml-repair-", "");
ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy, false);
@@ -116,9 +120,13 @@ public class OOXMLExtractorFactory {
//throw an exception -- we may want to fall back to streaming
//parsing
pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ);
- } catch (UnsupportedZipFeatureException e) {
- if (e.getFeature() !=
- UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) {
+ } catch (IOException e) {
+ if (e instanceof EOFException) {
+ //keep going
+ } else if (e instanceof IOException && e.getMessage() != null &&
+ e.getMessage().contains("Truncated")) {
+ //keep going
+ } else {
throw e;
}
rereadableInputStream.rewind();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
index ce8228b..dc96a4c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
@@ -33,7 +33,6 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
-import java.util.zip.ZipException;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
@@ -182,7 +181,7 @@ public class EpubParser extends AbstractParser {
ZipFile zipFile = null;
try {
zipFile = new ZipFile(tis.getPath().toFile());
- } catch (ZipException e) {
+ } catch (IOException e) {
ParserUtils.recordParserFailure(this, e, metadata);
trySalvage(tis.getPath(), bodyHandler, xhtml, metadata, context);
return;