You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/03/14 01:54:42 UTC
[tika] 03/03: TIKA-3316 -- improve XPS parser to include open XPS
and allow for streaming zips with data descriptors
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
commit cba0372821022833a9c976bd47bd67193f73f635
Author: tallison <ta...@apache.org>
AuthorDate: Sat Mar 13 20:54:16 2021 -0500
TIKA-3316 -- improve XPS parser to include open XPS and allow for streaming zips with data descriptors
---
.../detect/microsoft/ooxml/OPCPackageDetector.java | 8 +-
.../microsoft/ooxml/OOXMLExtractorFactory.java | 14 ++-
.../microsoft/ooxml/xps/XPSExtractorDecorator.java | 5 ++
.../parser/microsoft/ooxml/xps/XPSParserTest.java | 45 +++++++++-
.../test-documents/testXPSWithDataDescriptor.xps | Bin 0 -> 44523 bytes
.../test-documents/testXPSWithDataDescriptor2.xps | Bin 0 -> 51175 bytes
.../detect/zip/DefaultZipContainerDetector.java | 38 +++++++--
.../org/apache/tika/zip/utils/ZipSalvager.java | 95 +++++++++++++--------
8 files changed, 159 insertions(+), 46 deletions(-)
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java
index 03dbda5..2dd9cf4 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java
@@ -97,7 +97,6 @@ public class OPCPackageDetector implements ZipContainerDetector {
static final MediaType XPS =
MediaType.application("vnd.ms-xpsdocument");
-
static final Set<String> OOXML_HINTS = fillSet(
"word/document.xml",
"_rels/.rels",
@@ -156,6 +155,9 @@ public class OPCPackageDetector implements ZipContainerDetector {
private static final String XPS_DOCUMENT =
"http://schemas.microsoft.com/xps/2005/06/fixedrepresentation";
+ private static final String OPEN_XPS_DOCUMENT =
+ "http://schemas.openxps.org/oxps/v1.0/fixedrepresentation";
+
private static final String STAR_OFFICE_6_WRITER = "application/vnd.sun.xml.writer";
@@ -210,6 +212,10 @@ public class OPCPackageDetector implements ZipContainerDetector {
if (core.size() == 1) {
return MediaType.application("vnd.ms-xpsdocument");
}
+ core = pkg.getRelationshipsByType(OPEN_XPS_DOCUMENT);
+ if (core.size() == 1) {
+ return MediaType.application("vnd.ms-xpsdocument");
+ }
}
if (core.size() == 0) {
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 8aff0e8..c135e8c 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -22,6 +22,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
+import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
import org.apache.poi.ooxml.POIXMLDocument;
import org.apache.poi.ooxml.extractor.ExtractorFactory;
import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
@@ -111,7 +112,18 @@ public class OOXMLExtractorFactory {
} catch (EOFException e) {
rereadableInputStream.rewind();
tmpRepairedCopy = File.createTempFile("tika-ooxml-repair-", "");
- ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy);
+ ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy, false);
+ //if there isn't enough left to be opened as a package
+ //throw an exception -- we may want to fall back to streaming
+ //parsing
+ pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ);
+ } catch (UnsupportedZipFeatureException e) {
+ if (e.getFeature() != UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) {
+ throw e;
+ }
+ rereadableInputStream.rewind();
+ tmpRepairedCopy = File.createTempFile("tika-ooxml-repair-", "");
+ ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy, false);
//if there isn't enough left to be opened as a package
//throw an exception -- we may want to fall back to streaming
//parsing
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java
index 2643a3a..5cf7573 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java
@@ -51,6 +51,8 @@ import java.util.Map;
public class XPSExtractorDecorator extends AbstractOOXMLExtractor {
private static String XPS_DOCUMENT = "http://schemas.microsoft.com/xps/2005/06/fixedrepresentation";
+ private static final String OPEN_XPS_DOCUMENT =
+ "http://schemas.openxps.org/oxps/v1.0/fixedrepresentation";
private final ParseContext context;
private final ZipPackage pkg;
@@ -76,6 +78,9 @@ public class XPSExtractorDecorator extends AbstractOOXMLExtractor {
protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
PackageRelationshipCollection prc = pkg.getRelationshipsByType(XPS_DOCUMENT);
+ if (prc.size() == 0) {
+ prc = pkg.getRelationshipsByType(OPEN_XPS_DOCUMENT);
+ }
for (int i = 0; i < prc.size(); i++) {
PackageRelationship pr = prc.getRelationship(i);
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java
index 6015f6a..9c3ae65 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java
@@ -19,9 +19,14 @@ package org.apache.tika.parser.microsoft.ooxml.xps;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
import org.junit.Test;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
import java.util.List;
import static org.junit.Assert.assertEquals;
@@ -94,4 +99,42 @@ public class XPSParserTest extends TikaTest {
}
+ @Test
+ public void testXPSWithDataDescriptor() throws Exception {
+ Path path = Paths.get(
+ XPSParserTest.class.getResource("/test-documents/testXPSWithDataDescriptor.xps").toURI());
+ //test both path and stream based
+ List<Metadata> metadataList = getRecursiveMetadata(path, true);
+ assertEquals(2, metadataList.size());
+ assertContains("This is my XPS document test",
+ metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
+
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ Files.copy(path, bos);
+ metadataList = getRecursiveMetadata(new ByteArrayInputStream(bos.toByteArray()), true);
+ assertEquals(2, metadataList.size());
+ assertContains("This is my XPS document test",
+ metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
+
+ assertEquals(TikaCoreProperties.EmbeddedResourceType.THUMBNAIL.toString(),
+ metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+ }
+
+ @Test
+ public void testOpenXPSWithDataDescriptor() throws Exception {
+ Path path = Paths.get(
+ XPSParserTest.class.getResource("/test-documents/testXPSWithDataDescriptor2.xps").toURI());
+ List<Metadata> metadataList = getRecursiveMetadata(path, true);
+ assertEquals(2, metadataList.size());
+ assertContains("How was I supposed to know",
+ metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
+
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ Files.copy(path, bos);
+ metadataList = getRecursiveMetadata(new ByteArrayInputStream(bos.toByteArray()), true);
+ assertEquals(2, metadataList.size());
+ assertContains("How was I supposed to know",
+ metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
+ }
+
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testXPSWithDataDescriptor.xps b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testXPSWithDataDescriptor.xps
new file mode 100644
index 0000000..1569377
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testXPSWithDataDescriptor.xps differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testXPSWithDataDescriptor2.xps b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testXPSWithDataDescriptor2.xps
new file mode 100644
index 0000000..efc4a0e
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testXPSWithDataDescriptor2.xps differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
index 8a3d1f3..d3ed8a6 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
@@ -18,6 +18,7 @@ package org.apache.tika.detect.zip;
import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
+import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipFile;
@@ -28,12 +29,14 @@ import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.config.Field;
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.detect.Detector;
+import org.apache.tika.io.BoundedInputStream;
import org.apache.tika.io.LookaheadInputStream;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import java.io.ByteArrayInputStream;
+import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
@@ -83,6 +86,7 @@ public class DefaultZipContainerDetector implements Detector {
*
* @param markLimit mark limit for streaming detection
*/
+ @Field
public void setMarkLimit(int markLimit) {
this.markLimit = markLimit;
}
@@ -118,10 +122,7 @@ public class DefaultZipContainerDetector implements Detector {
return detectZipFormatOnFile(tis);
}
}
-
- try (LookaheadInputStream lookahead = new LookaheadInputStream(input, markLimit)) {
- return detectStreaming(lookahead, metadata);
- }
+ return detectStreaming(input, metadata);
} else if (!type.equals(MediaType.OCTET_STREAM)) {
return type;
} else {
@@ -207,10 +208,21 @@ public class DefaultZipContainerDetector implements Detector {
}
MediaType detectStreaming(InputStream input, Metadata metadata) throws IOException {
+ BoundedInputStream boundedInputStream = new BoundedInputStream(markLimit, input);
+ boundedInputStream.mark(markLimit);
+ try {
+ return detectStreaming(boundedInputStream, metadata, false);
+ } finally {
+ boundedInputStream.reset();
+ }
+ }
+
+ MediaType detectStreaming (InputStream input,
+ Metadata metadata, boolean allowStoredEntries) throws IOException {
StreamingDetectContext detectContext = new StreamingDetectContext();
- try (
- ZipArchiveInputStream zis =
- new ZipArchiveInputStream(new CloseShieldInputStream(input))) {
+ try (ZipArchiveInputStream zis =
+ new ZipArchiveInputStream(new CloseShieldInputStream(input),
+ "UTF8", false, allowStoredEntries)) {
ZipArchiveEntry zae = zis.getNextZipEntry();
while (zae != null) {
MediaType mt = detect(zae, zis, detectContext);
@@ -219,10 +231,18 @@ public class DefaultZipContainerDetector implements Detector {
}
zae = zis.getNextZipEntry();
}
+ } catch (UnsupportedZipFeatureException zfe) {
+ if (allowStoredEntries == false &&
+ zfe.getFeature() == UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) {
+ input.reset();
+ return detectStreaming(input, metadata, true);
+ }
} catch (SecurityException e) {
throw e;
- } catch (Exception e) {
- //swallow
+ } catch (EOFException e) {
+ //truncated zip -- swallow
+ } catch (IOException e) {
+ //another option for a truncated zip
}
return finalDetect(detectContext);
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/zip/utils/ZipSalvager.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/zip/utils/ZipSalvager.java
index 00c6b5b..1000d5f 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/zip/utils/ZipSalvager.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/zip/utils/ZipSalvager.java
@@ -23,10 +23,12 @@ import java.io.InputStream;
import java.nio.file.Files;
import java.util.zip.ZipException;
+import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
import org.apache.commons.io.IOUtils;
+import org.apache.tika.utils.RereadableInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -44,51 +46,76 @@ public class ZipSalvager {
* @param brokenZip
* @param salvagedZip
*/
- public static void salvageCopy(InputStream brokenZip, File salvagedZip) {
- try (ZipArchiveOutputStream outputStream = new ZipArchiveOutputStream(salvagedZip)) {
- ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream(brokenZip);
- ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
- while (zae != null) {
+ public static void salvageCopy(InputStream brokenZip, File salvagedZip, boolean allowStoredEntries) throws IOException {
+ if (!(brokenZip instanceof RereadableInputStream)) {
+ brokenZip = new RereadableInputStream(brokenZip, 50000,
+ true, false);
+ }
+ try {
+ try (ZipArchiveOutputStream outputStream = new ZipArchiveOutputStream(salvagedZip);
+ ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream(brokenZip,
+ "UTF8", false, allowStoredEntries)) {
+ ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
try {
- if (!zae.isDirectory() && zipArchiveInputStream.canReadEntryData(zae)) {
- //create a new ZAE and copy over only the name so that
- //if there is bad info (e.g. CRC) in brokenZip's zae, that
- //won't be propagated or cause an exception
- outputStream.putArchiveEntry(new ZipArchiveEntry(zae.getName()));
- //this will copy an incomplete stream...so there
- //could be truncation of the xml/contents, but the zip file
- //should be intact.
- boolean successfullyCopied = false;
- try {
- IOUtils.copy(zipArchiveInputStream, outputStream);
- successfullyCopied = true;
- } catch (IOException e) {
- //this can hit a "truncated ZipFile" IOException
- }
- outputStream.flush();
- outputStream.closeArchiveEntry();
- if (!successfullyCopied) {
- break;
- }
+ processZAE(zae, zipArchiveInputStream, outputStream);
+ } catch (UnsupportedZipFeatureException uzfe) {
+ if (uzfe.getFeature() ==
+ UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) {
+ //percolate up to allow for retry
+ throw uzfe;
}
- zae = zipArchiveInputStream.getNextZipEntry();
+ //else swallow
} catch (ZipException | EOFException e) {
- break;
+ //swallow
}
-
+ outputStream.flush();
+ outputStream.finish();
+ } catch (UnsupportedZipFeatureException e) {
+ //percolate up to allow for retry
+ throw e;
+ } catch (IOException e) {
+ LOG.warn("problem fixing zip", e);
}
- outputStream.flush();
- outputStream.finish();
-
+ } catch (UnsupportedZipFeatureException e) {
+ //now retry
+ if (allowStoredEntries == false) {
+ ((RereadableInputStream) brokenZip).rewind();
+ salvageCopy(brokenZip, salvagedZip, true);
+ }
+ }
+ }
- } catch (IOException e) {
- LOG.warn("problem fixing zip", e);
+ private static void processZAE(ZipArchiveEntry zae, ZipArchiveInputStream zipArchiveInputStream,
+ ZipArchiveOutputStream outputStream) throws IOException {
+ while (zae != null) {
+ if (!zae.isDirectory() && zipArchiveInputStream.canReadEntryData(zae)) {
+ //create a new ZAE and copy over only the name so that
+ //if there is bad info (e.g. CRC) in brokenZip's zae, that
+ //won't be propagated or cause an exception
+ outputStream.putArchiveEntry(new ZipArchiveEntry(zae.getName()));
+ //this will copy an incomplete stream...so there
+ //could be truncation of the xml/contents, but the zip file
+ //should be intact.
+ boolean successfullyCopied = false;
+ try {
+ IOUtils.copy(zipArchiveInputStream, outputStream);
+ successfullyCopied = true;
+ } catch (IOException e) {
+ //this can hit a "truncated ZipFile" IOException
+ }
+ outputStream.flush();
+ outputStream.closeArchiveEntry();
+ if (!successfullyCopied) {
+ break;
+ }
+ }
+ zae = zipArchiveInputStream.getNextZipEntry();
}
}
public static void salvageCopy(File brokenZip, File salvagedZip) throws IOException {
try (InputStream is = Files.newInputStream(brokenZip.toPath())) {
- salvageCopy(is, salvagedZip);
+ salvageCopy(is, salvagedZip, false);
}
}
}