You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/02/11 14:58:12 UTC

[tika] branch branch_1x updated: TIKA-3678 -- clean up poi 5.x's complaints about closing opcpackage instead of reverting, and turn off most of POI's logging in tika-parsers tests

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new 03b572b  TIKA-3678 -- clean up poi 5.x's complaints about closing opcpackage instead of reverting, and turn off most of POI's logging in tika-parsers tests
03b572b is described below

commit 03b572b8919579d0a5df3403bfe60800d470dd6c
Author: tallison <ta...@apache.org>
AuthorDate: Fri Feb 11 09:57:55 2022 -0500

    TIKA-3678 -- clean up poi 5.x's complaints about closing opcpackage instead of reverting, and turn off most of POI's logging in tika-parsers tests
---
 .../microsoft/ooxml/OOXMLExtractorFactory.java     | 12 ++----
 .../parser/microsoft/ooxml/OPCPackageWrapper.java  | 46 ++++++++++++++++++++++
 .../tika/parser/pkg/ZipContainerDetector.java      | 12 ++++--
 tika-parsers/src/test/resources/log4j2.xml         |  8 ++++
 4 files changed, 67 insertions(+), 11 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 34aeae1..88c0f59 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -105,8 +105,8 @@ public class OOXMLExtractorFactory {
 
             // Locate or Open the OPCPackage for the file
             TikaInputStream tis = TikaInputStream.cast(stream);
-            if (tis != null && tis.getOpenContainer() instanceof OPCPackage) {
-                pkg = (OPCPackage) tis.getOpenContainer();
+            if (tis != null && tis.getOpenContainer() instanceof OPCPackageWrapper) {
+                pkg = ((OPCPackageWrapper)tis.getOpenContainer()).getOPCPackage();
             } else if (tis != null && tis.hasFile()) {
                 try {
                     pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ);
@@ -115,7 +115,7 @@ public class OOXMLExtractorFactory {
                     ZipSalvager.salvageCopy(tis.getFile(), tmpRepairedCopy);
                     pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ);
                 }
-                tis.setOpenContainer(pkg);
+                tis.setOpenContainer(new OPCPackageWrapper(pkg));
             } else {
                 //OPCPackage slurps rris into memory so we can close rris
                 //without apparent problems
@@ -263,11 +263,7 @@ public class OOXMLExtractorFactory {
         } finally {
             if (tmpRepairedCopy != null) {
                 if (pkg != null) {
-                    try {
-                        pkg.close();
-                    } catch (IOException e) {
-                        LOG.warn("problem closing pkg file");
-                    }
+                    pkg.revert();
                 }
                 boolean deleted = tmpRepairedCopy.delete();
                 if (! deleted) {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OPCPackageWrapper.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OPCPackageWrapper.java
new file mode 100644
index 0000000..2cfd24f
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OPCPackageWrapper.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.Closeable;
+import java.io.IOException;
+
+import org.apache.poi.openxml4j.opc.OPCPackage;
+
+/**
+ * {@link Closeable} holder for an {@link OPCPackage} whose {@link #close()}
+ * calls {@code revert()} on the wrapped package instead of {@code close()}.
+ * We added this during the upgrade of POI to 5.x to avoid a warning.
+ * TIKA-3663
+ */
+public class OPCPackageWrapper implements Closeable {
+    // The wrapped package; stored as given, with no null check.
+    private final OPCPackage opcPackage;
+
+    public OPCPackageWrapper(OPCPackage opcPackage) {
+        this.opcPackage = opcPackage;
+    }
+    // Reverts the wrapped package rather than closing it.
+    @Override
+    public void close() throws IOException {
+        opcPackage.revert();
+    }
+    // Returns the same OPCPackage instance passed to the constructor.
+    public OPCPackage getOPCPackage() {
+        return opcPackage;
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index 0688230..ac81d06 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -54,6 +54,7 @@ import org.apache.tika.parser.iwork.IWorkPackageParser;
 import org.apache.tika.parser.iwork.IWorkPackageParser.IWORKDocumentType;
 import org.apache.tika.parser.iwork.iwana.IWork13PackageParser;
 import org.apache.tika.parser.iwork.iwana.IWork18PackageParser;
+import org.apache.tika.parser.microsoft.ooxml.OPCPackageWrapper;
 
 /**
  * A detector that works on Zip documents and other archive and compression
@@ -281,12 +282,17 @@ public class ZipContainerDetector implements Detector {
         //as of 4.x, POI throws an exception for non-POI OPC file types
         //unless we change POI, we can't rely on POI for non-POI files
         ZipEntrySource zipEntrySource = new ZipFileZipEntrySource(zipFile);
-
+        MediaType type = null;
+        //in POI 5.2.0, if there's an exception during detection,
+        //close() is called on the pkg instead of revert(), which leads
+        //to needless logging. Do a preliminary test for OPC now.
+        if (zipEntrySource.getEntry("[Content_Types].xml") == null) {
+            return type;
+        }
         // Use POI to open and investigate it for us
         //Unfortunately, POI can throw a RuntimeException...so we
         //have to catch that.
         OPCPackage pkg = null;
-        MediaType type = null;
         try {
             pkg = OPCPackage.open(zipEntrySource);
             type = detectOfficeOpenXML(pkg);
@@ -301,7 +307,7 @@ public class ZipContainerDetector implements Detector {
             return null;
         }
         //only set the open container if we made it here
-        stream.setOpenContainer(pkg);
+        stream.setOpenContainer(new OPCPackageWrapper(pkg));
         return type;
     }
 
diff --git a/tika-parsers/src/test/resources/log4j2.xml b/tika-parsers/src/test/resources/log4j2.xml
index c88e66e..e6442dd 100644
--- a/tika-parsers/src/test/resources/log4j2.xml
+++ b/tika-parsers/src/test/resources/log4j2.xml
@@ -28,5 +28,13 @@
     <Root level="info">
       <AppenderRef ref="Console"/>
     </Root>
+    <!-- effectively turn off the logging for POI 5.x, but keep XMLHelper
+     at INFO so that its warnings are still logged -->
+    <Logger name="org.apache.poi.util.XMLHelper" level="INFO" additivity="false">
+      <AppenderRef ref="Console"/>
+    </Logger>
+    <Logger name="org.apache.poi" level="ERROR" additivity="false">
+      <AppenderRef ref="Console"/>
+    </Logger>
   </Loggers>
 </Configuration>
\ No newline at end of file