You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/07/13 15:58:31 UTC

[tika] branch main updated: TIKA-3475 -- downgrade mime4j and adjust exception handling change in commons-compress

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new d71d4ab  TIKA-3475 -- downgrade mime4j and adjust exception handling change in commons-compress
d71d4ab is described below

commit d71d4abfdc911b4f5227b0afe22c30d814bb15ec
Author: tallison <ta...@apache.org>
AuthorDate: Tue Jul 13 11:58:10 2021 -0400

    TIKA-3475 -- downgrade mime4j and adjust exception handling change in commons-compress
---
 tika-parent/pom.xml                                      |  3 ++-
 .../parser/microsoft/ooxml/OOXMLExtractorFactory.java    | 16 ++++++++++++----
 .../java/org/apache/tika/parser/epub/EpubParser.java     |  3 +--
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 40c4cec..cec512b 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -331,7 +331,8 @@
     <lucene.version>8.9.0</lucene.version>
     <metadata.extractor.version>2.16.0</metadata.extractor.version>
     <microsoft.translator.version>0.6.2</microsoft.translator.version>
-    <mime4j.version>0.8.5</mime4j.version>
+    <!-- 0.8.5 is built with Java 11 and does not work with Java 8 -->
+    <mime4j.version>0.8.4</mime4j.version>
     <mockito.version>3.11.2</mockito.version>
     <netcdf-java.version>4.5.5</netcdf-java.version>
     <opencsv.version>2.3</opencsv.version>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 03a1b1a..c1fa9f0 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -108,7 +108,11 @@ public class OOXMLExtractorFactory {
                         MAX_BUFFER_LENGTH, false)) {
                     try {
                         pkg = OPCPackage.open(new CloseShieldInputStream(rereadableInputStream));
-                    } catch (EOFException e) {
+                    } catch (UnsupportedZipFeatureException e) {
+                        if (e.getFeature() !=
+                                UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) {
+                            throw e;
+                        }
                         rereadableInputStream.rewind();
                         tmpRepairedCopy = File.createTempFile("tika-ooxml-repair-", "");
                         ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy, false);
@@ -116,9 +120,13 @@ public class OOXMLExtractorFactory {
                         //throw an exception -- we may want to fall back to streaming
                         //parsing
                         pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ);
-                    } catch (UnsupportedZipFeatureException e) {
-                        if (e.getFeature() !=
-                                UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) {
+                    } catch (IOException e) {
+                        if (e instanceof EOFException) {
+                            //keep going
+                        } else if (e instanceof IOException && e.getMessage() != null &&
+                                e.getMessage().contains("Truncated")) {
+                            //keep going
+                        } else {
                             throw e;
                         }
                         rereadableInputStream.rewind();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
index ce8228b..dc96a4c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
@@ -33,7 +33,6 @@ import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
-import java.util.zip.ZipException;
 
 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
 import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
@@ -182,7 +181,7 @@ public class EpubParser extends AbstractParser {
         ZipFile zipFile = null;
         try {
             zipFile = new ZipFile(tis.getPath().toFile());
-        } catch (ZipException e) {
+        } catch (IOException e) {
             ParserUtils.recordParserFailure(this, e, metadata);
             trySalvage(tis.getPath(), bodyHandler, xhtml, metadata, context);
             return;