You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/01/04 18:19:51 UTC

(tika) branch branch_2x updated (a1bbad94d -> bd9719e21)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git


    from a1bbad94d TIKA-4162: update slf4j
     new 9e5bdc411 TIKA-4174 correct mime type for sip pcap (#1519)
     new bd9719e21 TIKA-4176 -- throw EncryptedDocumentException for DRM protected epubs (#1530)

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../org/apache/tika/mime/tika-mimetypes.xml        |  5 +-
 .../apache/tika/parser/epub/EncryptionParser.java  | 88 ++++++++++++++++++++++
 .../org/apache/tika/parser/epub/EpubParser.java    | 54 ++++++++++++-
 3 files changed, 143 insertions(+), 4 deletions(-)
 create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EncryptionParser.java


(tika) 01/02: TIKA-4174 correct mime type for sip pcap (#1519)

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 9e5bdc411782483294fb4d50ac7f40bb3269421a
Author: NissimShiman <57...@users.noreply.github.com>
AuthorDate: Tue Jan 2 10:41:28 2024 -0500

    TIKA-4174 correct mime type for sip pcap (#1519)
    
    (cherry picked from commit 0daea616af12c5f07fc0070418d5b7b180031440)
---
 tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 53808c752..b76adebd1 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -2133,6 +2133,7 @@
     <!-- robots.txt file -->
     <!-- draft: https://datatracker.ietf.org/doc/html/draft-koster-rep -->
     <!-- should have a higher priority than rfc822 - TIKA-3489 -->
+    <!-- and a lower priority than pcap - TIKA-4174 -->
     <magic priority="55">
       <match minShouldMatch="2">
         <match value="user-agent:" type="stringignorecase" offset="0"/>
@@ -2927,7 +2928,7 @@
 
   <mime-type type="application/vnd.tcpdump.pcap">
     <_comment>TCPDump pcap packet capture</_comment>
-    <magic priority="50">
+    <magic priority="60">
       <match value="0xa1b2c3d4" type="big32" offset="0" />
       <match value="0xd4c3b2a1" type="big32" offset="0" />
     </magic>
@@ -2938,7 +2939,7 @@
   <mime-type type="application/vnd.tcpdump.pcapng">
     <_comment>TCPDump next gen pcap packet capture</_comment>
     <tika:link>https://www.ietf.org/staging/draft-tuexen-opsawg-pcapng-02.html</tika:link>
-    <magic priority="50">
+    <magic priority="60">
       <match value="0x0A0D0D0A" type="string" offset="0">
         <!-- Could only find examples of the second. I have not tested the first -->
         <match value="0xa1b2c3d4" type="big32" offset="8" />


(tika) 02/02: TIKA-4176 -- throw EncryptedDocumentException for DRM protected epubs (#1530)

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit bd9719e210bb10814f95997d5a626130f107509f
Author: Tim Allison <ta...@apache.org>
AuthorDate: Tue Jan 2 16:37:23 2024 -0500

    TIKA-4176 -- throw EncryptedDocumentException for DRM protected epubs (#1530)
    
    (cherry picked from commit d2530b8f9ea62e4cb24e5ac226732c21431fd5c4)
---
 .../apache/tika/parser/epub/EncryptionParser.java  | 88 ++++++++++++++++++++++
 .../org/apache/tika/parser/epub/EpubParser.java    | 54 ++++++++++++-
 2 files changed, 140 insertions(+), 2 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EncryptionParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EncryptionParser.java
new file mode 100644
index 000000000..26aae7574
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EncryptionParser.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.epub;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.utils.XMLReaderUtils;
+
+public class EncryptionParser implements Parser {
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return Collections.EMPTY_SET;
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+                      ParseContext context) throws IOException, SAXException, TikaException {
+
+        try {
+            XMLReaderUtils.parseSAX(stream, new EncryptionHandler(), context);
+        } catch (SAXException e) {
+            if (e.getCause() instanceof EncryptedDocumentException) {
+                throw (EncryptedDocumentException)e.getCause();
+            }
+        }
+    }
+
+    private class EncryptionHandler extends DefaultHandler {
+        Set<String> encryptedItems = new HashSet<>();
+        @Override
+        public void startElement(String uri, String localName, String qName, Attributes attributes) {
+            if ("CipherReference".equals(localName)) {
+                String encryptedUri = XMLReaderUtils.getAttrValue("URI", attributes);
+                encryptedItems.add(encryptedUri);
+            }
+        }
+
+        @Override
+        public void endDocument() throws SAXException {
+            if (encryptedItems.size() > 0) {
+                StringBuilder sb = new StringBuilder();
+                sb.append("EPUB contains encrypted items: ");
+                int added = 0;
+                for (String u : encryptedItems) {
+                    if (sb.length() > 500) {
+                        sb.append(" and others...");
+                        break;
+                    }
+                    if (added++ > 0) {
+                        sb.append(", ");
+                    }
+                    sb.append(u);
+                }
+                throw new SAXException(new EncryptedDocumentException(sb.toString()));
+            }
+        }
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
index 75e5eaf69..1bdd95750 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
@@ -46,7 +46,9 @@ import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
 import org.apache.tika.config.Field;
+import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.FilenameUtils;
@@ -78,6 +80,8 @@ public class EpubParser extends AbstractParser {
     private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
             new HashSet<>(Arrays.asList(MediaType.application("epub+zip"),
                     MediaType.application("x-ibooks+zip"))));
+
+    private static final String META_INF_ENCRYPTION = "META-INF/encryption.xml";
     @Field
     boolean streaming = false;
     private Parser meta = new DcXMLParser();
@@ -101,6 +105,11 @@ public class EpubParser extends AbstractParser {
         this.content = content;
     }
 
+    @Field
+    public void setStreaming(boolean streaming) {
+        this.streaming = streaming;
+    }
+
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return SUPPORTED_TYPES;
     }
@@ -136,22 +145,37 @@ public class EpubParser extends AbstractParser {
     private void streamingParse(InputStream stream, ContentHandler bodyHandler, Metadata metadata,
                                 ParseContext context)
             throws IOException, TikaException, SAXException {
-        ZipArchiveInputStream zip = new ZipArchiveInputStream(stream);
+        ZipArchiveInputStream zip = new ZipArchiveInputStream(stream, "UTF-8", false, true, false);
 
         ZipArchiveEntry entry = zip.getNextZipEntry();
+        SAXException sax = null;
         while (entry != null) {
             if (entry.getName().equals("mimetype")) {
                 updateMimeType(zip, metadata);
+            } else if (entry.getName().equals(META_INF_ENCRYPTION)) {
+                checkForDRM(zip);
             } else if (entry.getName().equals("metadata.xml")) {
                 meta.parse(zip, new DefaultHandler(), metadata, context);
             } else if (entry.getName().endsWith(".opf")) {
                 opf.parse(zip, new DefaultHandler(), metadata, context);
             } else if (entry.getName().endsWith(".htm") || entry.getName().endsWith(".html") ||
                     entry.getName().endsWith(".xhtml") || entry.getName().endsWith(".xml")) {
-                content.parse(zip, bodyHandler, metadata, context);
+                try {
+                    content.parse(zip, bodyHandler, metadata, context);
+                } catch (SAXException e) {
+                    if (WriteLimitReachedException.isWriteLimitReached(e)) {
+                        throw e;
+                    }
+                    if (sax == null) {
+                        sax = e;
+                    }
+                }
             }
             entry = zip.getNextZipEntry();
         }
+        if (sax != null) {
+            throw sax;
+        }
     }
 
     private void updateMimeType(InputStream is, Metadata metadata) throws IOException {
@@ -224,6 +248,7 @@ public class EpubParser extends AbstractParser {
                                          XHTMLContentHandler xhtml, Metadata metadata,
                                          ParseContext context, boolean isStrict)
             throws IOException, TikaException, SAXException {
+
         String rootOPF = getRoot(zipFile, context);
         if (rootOPF == null) {
             return false;
@@ -266,6 +291,7 @@ public class EpubParser extends AbstractParser {
         }
 
         extractMetadata(zipFile, metadata, context);
+        checkForDRM(zipFile);
         Set<String> processed = new HashSet<>();
         for (String id : contentOrderScraper.contentItems) {
             HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
@@ -309,6 +335,30 @@ public class EpubParser extends AbstractParser {
         return true;
     }
 
+    private void checkForDRM(ZipFile zipFile) throws IOException, EncryptedDocumentException {
+        ZipArchiveEntry zae = zipFile.getEntry(META_INF_ENCRYPTION);
+        if (zae == null) {
+            return;
+        }
+        try (InputStream is = zipFile.getInputStream(zae)) {
+            new EncryptionParser().parse(is, new DefaultHandler(), new Metadata(), new ParseContext());
+        } catch (EncryptedDocumentException e) {
+            throw e;
+        } catch (TikaException | SAXException e) {
+            //swallow ?!
+        }
+    }
+
+    private void checkForDRM(InputStream is) throws IOException, EncryptedDocumentException {
+        try {
+            new EncryptionParser().parse(is, new DefaultHandler(), new Metadata(), new ParseContext());
+        } catch (EncryptedDocumentException e) {
+            throw e;
+        } catch (TikaException | SAXException e) {
+            //swallow ?!
+        }
+    }
+
     private boolean shouldHandleEmbedded(String media) {
         if (media == null) {
             return true;