You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/03/14 01:54:42 UTC

[tika] 03/03: TIKA-3316 -- improve XPS parser to include open XPS and allow for streaming zips with data descriptors

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit cba0372821022833a9c976bd47bd67193f73f635
Author: tallison <ta...@apache.org>
AuthorDate: Sat Mar 13 20:54:16 2021 -0500

    TIKA-3316 -- improve XPS parser to include open XPS and allow for streaming zips with data descriptors
---
 .../detect/microsoft/ooxml/OPCPackageDetector.java |   8 +-
 .../microsoft/ooxml/OOXMLExtractorFactory.java     |  14 ++-
 .../microsoft/ooxml/xps/XPSExtractorDecorator.java |   5 ++
 .../parser/microsoft/ooxml/xps/XPSParserTest.java  |  45 +++++++++-
 .../test-documents/testXPSWithDataDescriptor.xps   | Bin 0 -> 44523 bytes
 .../test-documents/testXPSWithDataDescriptor2.xps  | Bin 0 -> 51175 bytes
 .../detect/zip/DefaultZipContainerDetector.java    |  38 +++++++--
 .../org/apache/tika/zip/utils/ZipSalvager.java     |  95 +++++++++++++--------
 8 files changed, 159 insertions(+), 46 deletions(-)

diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java
index 03dbda5..2dd9cf4 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/ooxml/OPCPackageDetector.java
@@ -97,7 +97,6 @@ public class OPCPackageDetector implements ZipContainerDetector {
     static final MediaType XPS =
             MediaType.application("vnd.ms-xpsdocument");
 
-
     static final Set<String> OOXML_HINTS = fillSet(
             "word/document.xml",
             "_rels/.rels",
@@ -156,6 +155,9 @@ public class OPCPackageDetector implements ZipContainerDetector {
     private static final String XPS_DOCUMENT =
             "http://schemas.microsoft.com/xps/2005/06/fixedrepresentation";
 
+    private static final String OPEN_XPS_DOCUMENT =
+            "http://schemas.openxps.org/oxps/v1.0/fixedrepresentation";
+
     private static final String STAR_OFFICE_6_WRITER = "application/vnd.sun.xml.writer";
 
 
@@ -210,6 +212,10 @@ public class OPCPackageDetector implements ZipContainerDetector {
             if (core.size() == 1) {
                 return MediaType.application("vnd.ms-xpsdocument");
             }
+            core = pkg.getRelationshipsByType(OPEN_XPS_DOCUMENT);
+            if (core.size() == 1) {
+                return MediaType.application("vnd.ms-xpsdocument");
+            }
         }
 
         if (core.size() == 0) {
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 8aff0e8..c135e8c 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -22,6 +22,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.util.Locale;
 
+import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
 import org.apache.poi.ooxml.POIXMLDocument;
 import org.apache.poi.ooxml.extractor.ExtractorFactory;
 import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
@@ -111,7 +112,18 @@ public class OOXMLExtractorFactory {
                     } catch (EOFException e) {
                         rereadableInputStream.rewind();
                         tmpRepairedCopy = File.createTempFile("tika-ooxml-repair-", "");
-                        ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy);
+                        ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy, false);
+                        //if there isn't enough left to be opened as a package
+                        //throw an exception -- we may want to fall back to streaming
+                        //parsing
+                        pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ);
+                    } catch (UnsupportedZipFeatureException e) {
+                        if (e.getFeature() != UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) {
+                            throw e;
+                        }
+                        rereadableInputStream.rewind();
+                        tmpRepairedCopy = File.createTempFile("tika-ooxml-repair-", "");
+                        ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy, false);
                         //if there isn't enough left to be opened as a package
                         //throw an exception -- we may want to fall back to streaming
                         //parsing
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java
index 2643a3a..5cf7573 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java
@@ -51,6 +51,8 @@ import java.util.Map;
 public class XPSExtractorDecorator extends AbstractOOXMLExtractor {
 
     private static String XPS_DOCUMENT = "http://schemas.microsoft.com/xps/2005/06/fixedrepresentation";
+    private static final String OPEN_XPS_DOCUMENT =
+            "http://schemas.openxps.org/oxps/v1.0/fixedrepresentation";
 
     private final ParseContext context;
     private final ZipPackage pkg;
@@ -76,6 +78,9 @@ public class XPSExtractorDecorator extends AbstractOOXMLExtractor {
     protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
 
         PackageRelationshipCollection prc = pkg.getRelationshipsByType(XPS_DOCUMENT);
+        if (prc.size() == 0) {
+            prc = pkg.getRelationshipsByType(OPEN_XPS_DOCUMENT);
+        }
         for (int i = 0; i < prc.size(); i++) {
             PackageRelationship pr = prc.getRelationship(i);
 
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java
index 6015f6a..9c3ae65 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java
@@ -19,9 +19,14 @@ package org.apache.tika.parser.microsoft.ooxml.xps;
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
 import org.junit.Test;
 
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.List;
 
 import static org.junit.Assert.assertEquals;
@@ -94,4 +99,42 @@ public class XPSParserTest extends TikaTest {
 
     }
 
+    @Test
+    public void testXPSWithDataDescriptor() throws Exception {
+        Path path = Paths.get(
+                XPSParserTest.class.getResource("/test-documents/testXPSWithDataDescriptor.xps").toURI());
+        //test both path-based and stream-based parsing
+        List<Metadata> metadataList = getRecursiveMetadata(path, true);
+        assertEquals(2, metadataList.size());
+        assertContains("This is my XPS document test",
+                metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
+
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        Files.copy(path, bos);
+        metadataList = getRecursiveMetadata(new ByteArrayInputStream(bos.toByteArray()), true);
+        assertEquals(2, metadataList.size());
+        assertContains("This is my XPS document test",
+                metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
+
+        assertEquals(TikaCoreProperties.EmbeddedResourceType.THUMBNAIL.toString(),
+                metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+    }
+
+    @Test
+    public void testOpenXPSWithDataDescriptor() throws Exception {
+        Path path = Paths.get(
+                XPSParserTest.class.getResource("/test-documents/testXPSWithDataDescriptor2.xps").toURI());
+        List<Metadata> metadataList = getRecursiveMetadata(path, true);
+        assertEquals(2, metadataList.size());
+        assertContains("How was I supposed to know",
+                metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
+
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        Files.copy(path, bos);
+        metadataList = getRecursiveMetadata(new ByteArrayInputStream(bos.toByteArray()), true);
+        assertEquals(2, metadataList.size());
+        assertContains("How was I supposed to know",
+                metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
+    }
+
 }
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testXPSWithDataDescriptor.xps b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testXPSWithDataDescriptor.xps
new file mode 100644
index 0000000..1569377
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testXPSWithDataDescriptor.xps differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testXPSWithDataDescriptor2.xps b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testXPSWithDataDescriptor2.xps
new file mode 100644
index 0000000..efc4a0e
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testXPSWithDataDescriptor2.xps differ
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
index 8a3d1f3..d3ed8a6 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java
@@ -18,6 +18,7 @@ package org.apache.tika.detect.zip;
 
 import org.apache.commons.compress.archivers.ArchiveException;
 import org.apache.commons.compress.archivers.ArchiveStreamFactory;
+import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
 import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
 import org.apache.commons.compress.archivers.zip.ZipFile;
@@ -28,12 +29,14 @@ import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.tika.config.Field;
 import org.apache.tika.config.ServiceLoader;
 import org.apache.tika.detect.Detector;
+import org.apache.tika.io.BoundedInputStream;
 import org.apache.tika.io.LookaheadInputStream;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 
 import java.io.ByteArrayInputStream;
+import java.io.EOFException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.List;
@@ -83,6 +86,7 @@ public class DefaultZipContainerDetector implements Detector {
      *
      * @param markLimit mark limit for streaming detection
      */
+    @Field
     public void setMarkLimit(int markLimit) {
         this.markLimit = markLimit;
     }
@@ -118,10 +122,7 @@ public class DefaultZipContainerDetector implements Detector {
                     return detectZipFormatOnFile(tis);
                 }
             }
-
-            try (LookaheadInputStream lookahead = new LookaheadInputStream(input, markLimit)) {
-                return detectStreaming(lookahead, metadata);
-            }
+            return detectStreaming(input, metadata);
         } else if (!type.equals(MediaType.OCTET_STREAM)) {
             return type;
         } else {
@@ -207,10 +208,21 @@ public class DefaultZipContainerDetector implements Detector {
     }
 
     MediaType detectStreaming(InputStream input, Metadata metadata) throws IOException {
+        BoundedInputStream boundedInputStream = new BoundedInputStream(markLimit, input);
+        boundedInputStream.mark(markLimit);
+        try {
+            return detectStreaming(boundedInputStream, metadata, false);
+        } finally {
+            boundedInputStream.reset();
+        }
+    }
+
+    MediaType detectStreaming (InputStream input,
+                               Metadata metadata, boolean allowStoredEntries) throws IOException {
         StreamingDetectContext detectContext = new StreamingDetectContext();
-        try (
-                ZipArchiveInputStream zis =
-                        new ZipArchiveInputStream(new CloseShieldInputStream(input))) {
+        try (ZipArchiveInputStream zis =
+                     new ZipArchiveInputStream(new CloseShieldInputStream(input),
+                             "UTF8", false, allowStoredEntries)) {
             ZipArchiveEntry zae = zis.getNextZipEntry();
             while (zae != null) {
                 MediaType mt = detect(zae, zis, detectContext);
@@ -219,10 +231,18 @@ public class DefaultZipContainerDetector implements Detector {
                 }
                 zae = zis.getNextZipEntry();
             }
+        } catch (UnsupportedZipFeatureException zfe) {
+            if (allowStoredEntries == false &&
+                    zfe.getFeature() == UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) {
+                input.reset();
+                return detectStreaming(input, metadata, true);
+            }
         } catch (SecurityException e) {
             throw e;
-        } catch (Exception e) {
-            //swallow
+        } catch (EOFException e) {
+            //truncated zip -- swallow
+        } catch (IOException e) {
+            //a truncated zip can also surface as a plain IOException -- swallow
         }
 
         return finalDetect(detectContext);
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/zip/utils/ZipSalvager.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/zip/utils/ZipSalvager.java
index 00c6b5b..1000d5f 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/zip/utils/ZipSalvager.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/zip/utils/ZipSalvager.java
@@ -23,10 +23,12 @@ import java.io.InputStream;
 import java.nio.file.Files;
 import java.util.zip.ZipException;
 
+import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
 import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
 import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
 import org.apache.commons.io.IOUtils;
+import org.apache.tika.utils.RereadableInputStream;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -44,51 +46,76 @@ public class ZipSalvager {
      * @param brokenZip
      * @param salvagedZip
      */
-    public static void salvageCopy(InputStream brokenZip, File salvagedZip) {
-        try (ZipArchiveOutputStream outputStream = new ZipArchiveOutputStream(salvagedZip)) {
-            ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream(brokenZip);
-            ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
-            while (zae != null) {
+    public static void salvageCopy(InputStream brokenZip, File salvagedZip, boolean allowStoredEntries) throws IOException {
+        if (!(brokenZip instanceof RereadableInputStream)) {
+            brokenZip = new RereadableInputStream(brokenZip, 50000,
+                    true, false);
+        }
+        try {
+            try (ZipArchiveOutputStream outputStream = new ZipArchiveOutputStream(salvagedZip);
+                 ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream(brokenZip,
+                         "UTF8", false, allowStoredEntries)) {
+                ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
                 try {
-                    if (!zae.isDirectory() && zipArchiveInputStream.canReadEntryData(zae)) {
-                        //create a new ZAE and copy over only the name so that
-                        //if there is bad info (e.g. CRC) in brokenZip's zae, that
-                        //won't be propagated or cause an exception
-                        outputStream.putArchiveEntry(new ZipArchiveEntry(zae.getName()));
-                        //this will copy an incomplete stream...so there
-                        //could be truncation of the xml/contents, but the zip file
-                        //should be intact.
-                        boolean successfullyCopied = false;
-                        try {
-                            IOUtils.copy(zipArchiveInputStream, outputStream);
-                            successfullyCopied = true;
-                        } catch (IOException e) {
-                            //this can hit a "truncated ZipFile" IOException
-                        }
-                        outputStream.flush();
-                        outputStream.closeArchiveEntry();
-                        if (!successfullyCopied) {
-                            break;
-                        }
+                    processZAE(zae, zipArchiveInputStream, outputStream);
+                } catch (UnsupportedZipFeatureException uzfe) {
+                    if (uzfe.getFeature() ==
+                            UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR) {
+                        //percolate up to allow for retry
+                        throw uzfe;
                     }
-                    zae = zipArchiveInputStream.getNextZipEntry();
+                    //else swallow
                 } catch (ZipException | EOFException e) {
-                    break;
+                    //swallow
                 }
-
+                outputStream.flush();
+                outputStream.finish();
+            } catch (UnsupportedZipFeatureException e) {
+                //percolate up to allow for retry
+                throw e;
+            } catch (IOException e) {
+                LOG.warn("problem fixing zip", e);
             }
-            outputStream.flush();
-            outputStream.finish();
-
+        } catch (UnsupportedZipFeatureException e) {
+            //retry once, this time with allowStoredEntries=true
+            if (allowStoredEntries == false) {
+                ((RereadableInputStream) brokenZip).rewind();
+                salvageCopy(brokenZip, salvagedZip, true);
+            }
+        }
+    }
 
-        } catch (IOException e) {
-            LOG.warn("problem fixing zip", e);
+    private static void processZAE(ZipArchiveEntry zae, ZipArchiveInputStream zipArchiveInputStream,
+                                   ZipArchiveOutputStream outputStream) throws IOException {
+        while (zae != null) {
+            if (!zae.isDirectory() && zipArchiveInputStream.canReadEntryData(zae)) {
+                //create a new ZAE and copy over only the name so that
+                //if there is bad info (e.g. CRC) in brokenZip's zae, that
+                //won't be propagated or cause an exception
+                outputStream.putArchiveEntry(new ZipArchiveEntry(zae.getName()));
+                //this will copy an incomplete stream...so there
+                //could be truncation of the xml/contents, but the zip file
+                //should be intact.
+                boolean successfullyCopied = false;
+                try {
+                    IOUtils.copy(zipArchiveInputStream, outputStream);
+                    successfullyCopied = true;
+                } catch (IOException e) {
+                    //this can hit a "truncated ZipFile" IOException
+                }
+                outputStream.flush();
+                outputStream.closeArchiveEntry();
+                if (!successfullyCopied) {
+                    break;
+                }
+            }
+            zae = zipArchiveInputStream.getNextZipEntry();
         }
     }
 
     public static void salvageCopy(File brokenZip, File salvagedZip) throws IOException {
         try (InputStream is = Files.newInputStream(brokenZip.toPath())) {
-            salvageCopy(is, salvagedZip);
+            salvageCopy(is, salvagedZip, false);
         }
     }
 }