You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/02/11 14:58:12 UTC
[tika] branch branch_1x updated: TIKA-3678 -- clean up poi 5.x's complaints about closing opcpackage instead of reverting, and turn off most of POI's logging in tika-parsers tests
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new 03b572b TIKA-3678 -- clean up poi 5.x's complaints about closing opcpackage instead of reverting, and turn off most of POI's logging in tika-parsers tests
03b572b is described below
commit 03b572b8919579d0a5df3403bfe60800d470dd6c
Author: tallison <ta...@apache.org>
AuthorDate: Fri Feb 11 09:57:55 2022 -0500
TIKA-3678 -- clean up poi 5.x's complaints about closing opcpackage instead of reverting, and turn off most of POI's logging in tika-parsers tests
---
.../microsoft/ooxml/OOXMLExtractorFactory.java | 12 ++----
.../parser/microsoft/ooxml/OPCPackageWrapper.java | 46 ++++++++++++++++++++++
.../tika/parser/pkg/ZipContainerDetector.java | 12 ++++--
tika-parsers/src/test/resources/log4j2.xml | 8 ++++
4 files changed, 67 insertions(+), 11 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 34aeae1..88c0f59 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -105,8 +105,8 @@ public class OOXMLExtractorFactory {
// Locate or Open the OPCPackage for the file
TikaInputStream tis = TikaInputStream.cast(stream);
- if (tis != null && tis.getOpenContainer() instanceof OPCPackage) {
- pkg = (OPCPackage) tis.getOpenContainer();
+ if (tis != null && tis.getOpenContainer() instanceof OPCPackageWrapper) {
+ pkg = ((OPCPackageWrapper)tis.getOpenContainer()).getOPCPackage();
} else if (tis != null && tis.hasFile()) {
try {
pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ);
@@ -115,7 +115,7 @@ public class OOXMLExtractorFactory {
ZipSalvager.salvageCopy(tis.getFile(), tmpRepairedCopy);
pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ);
}
- tis.setOpenContainer(pkg);
+ tis.setOpenContainer(new OPCPackageWrapper(pkg));
} else {
//OPCPackage slurps rris into memory so we can close rris
//without apparent problems
@@ -263,11 +263,7 @@ public class OOXMLExtractorFactory {
} finally {
if (tmpRepairedCopy != null) {
if (pkg != null) {
- try {
- pkg.close();
- } catch (IOException e) {
- LOG.warn("problem closing pkg file");
- }
+ pkg.revert();
}
boolean deleted = tmpRepairedCopy.delete();
if (! deleted) {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OPCPackageWrapper.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OPCPackageWrapper.java
new file mode 100644
index 0000000..2cfd24f
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OPCPackageWrapper.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.Closeable;
+import java.io.IOException;
+
+import org.apache.poi.openxml4j.opc.OPCPackage;
+
+/**
+ * This is a wrapper around OPCPackage that calls revert() instead of close().
+ * We added this during the upgrade of POI to 5.x to avoid a warning.
+ *
+ * TIKA-3663
+ */
+public class OPCPackageWrapper implements Closeable {
+
+ private final OPCPackage opcPackage;
+
+ public OPCPackageWrapper(OPCPackage opcPackage) {
+ this.opcPackage = opcPackage;
+ }
+
+ @Override
+ public void close() throws IOException {
+ opcPackage.revert();
+ }
+
+ public OPCPackage getOPCPackage() {
+ return opcPackage;
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index 0688230..ac81d06 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -54,6 +54,7 @@ import org.apache.tika.parser.iwork.IWorkPackageParser;
import org.apache.tika.parser.iwork.IWorkPackageParser.IWORKDocumentType;
import org.apache.tika.parser.iwork.iwana.IWork13PackageParser;
import org.apache.tika.parser.iwork.iwana.IWork18PackageParser;
+import org.apache.tika.parser.microsoft.ooxml.OPCPackageWrapper;
/**
* A detector that works on Zip documents and other archive and compression
@@ -281,12 +282,17 @@ public class ZipContainerDetector implements Detector {
//as of 4.x, POI throws an exception for non-POI OPC file types
//unless we change POI, we can't rely on POI for non-POI files
ZipEntrySource zipEntrySource = new ZipFileZipEntrySource(zipFile);
-
+ MediaType type = null;
+ //in POI 5.2.0, if there's an exception during detection,
+ //close() is called on the pkg instead of revert(), which leads
+ //to needless logging. Do a preliminary test for OPC now.
+ if (zipEntrySource.getEntry("[Content_Types].xml") == null) {
+ return type;
+ }
// Use POI to open and investigate it for us
//Unfortunately, POI can throw a RuntimeException...so we
//have to catch that.
OPCPackage pkg = null;
- MediaType type = null;
try {
pkg = OPCPackage.open(zipEntrySource);
type = detectOfficeOpenXML(pkg);
@@ -301,7 +307,7 @@ public class ZipContainerDetector implements Detector {
return null;
}
//only set the open container if we made it here
- stream.setOpenContainer(pkg);
+ stream.setOpenContainer(new OPCPackageWrapper(pkg));
return type;
}
diff --git a/tika-parsers/src/test/resources/log4j2.xml b/tika-parsers/src/test/resources/log4j2.xml
index c88e66e..e6442dd 100644
--- a/tika-parsers/src/test/resources/log4j2.xml
+++ b/tika-parsers/src/test/resources/log4j2.xml
@@ -28,5 +28,13 @@
<Root level="info">
<AppenderRef ref="Console"/>
</Root>
+ <!-- effectively turn off the logging for POI 5.x, but keep XMLHelper at INFO
+ so that its warnings are still reported -->
+ <Logger name="org.apache.poi.util.XMLHelper" level="INFO" additivity="false">
+ <AppenderRef ref="Console"/>
+ </Logger>
+ <Logger name="org.apache.poi" level="ERROR" additivity="false">
+ <AppenderRef ref="Console"/>
+ </Logger>
</Loggers>
</Configuration>
\ No newline at end of file