You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ti...@apache.org on 2024/02/20 14:24:54 UTC

(tika) branch branch_2x updated (f22eb81d2 -> 49a13b365)

This is an automated email from the ASF dual-hosted git repository.

tilman pushed a change to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git


    from f22eb81d2 TIKA-4162: update aws, spring, protobuf-java
     new 8c0b807b8 TIKA-4199: increase mark buffer size because of commons-compress 1.26.0; replace deprecated method calls
     new de1b2bb15 TIKA-4199: download element to disk because of problems with commons-compress 1.26.0; complete delegate class
     new 49a13b365 TIKA-4199: adjust test

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../tika/parser/iwork/IWorkPackageParser.java      | 14 ++++---
 .../org/apache/tika/parser/pkg/PackageParser.java  | 44 ++++++++++++++++++++++
 .../parser/microsoft/ooxml/TruncatedOOXMLTest.java |  4 +-
 3 files changed, 54 insertions(+), 8 deletions(-)


(tika) 01/03: TIKA-4199: increase mark buffer size because of commons-compress 1.26.0; replace deprecated method calls

Posted by ti...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tilman pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 8c0b807b89dc5e642f16363dc4595125ded3ca3b
Author: Tilman Hausherr <ti...@apache.org>
AuthorDate: Tue Feb 20 15:23:17 2024 +0100

    TIKA-4199: increase mark buffer size because of commons-compress 1.26.0; replace deprecated method calls
---
 .../org/apache/tika/parser/iwork/IWorkPackageParser.java   | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
index 4c120f30e..637348db5 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
@@ -76,23 +76,25 @@ public class IWorkPackageParser extends AbstractParser {
                     IWORKDocumentType.KEYNOTE.getType(), IWORKDocumentType.NUMBERS.getType(),
                     IWORKDocumentType.PAGES.getType())));
 
+    @Override
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return supportedTypes;
     }
 
+    @Override
     public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
                       ParseContext context) throws IOException, SAXException, TikaException {
         ZipArchiveInputStream zip = new ZipArchiveInputStream(stream);
-        ZipArchiveEntry entry = zip.getNextZipEntry();
+        ZipArchiveEntry entry = zip.getNextEntry();
 
         while (entry != null) {
             if (!IWORK_CONTENT_ENTRIES.contains(entry.getName())) {
-                entry = zip.getNextZipEntry();
+                entry = zip.getNextEntry();
                 continue;
             }
 
-            InputStream entryStream = new BufferedInputStream(zip, 4096);
-            entryStream.mark(4096);
+            InputStream entryStream = new BufferedInputStream(zip, 9216);
+            entryStream.mark(9216);
             IWORKDocumentType type = IWORKDocumentType.detectType(entryStream);
             entryStream.reset();
 
@@ -121,13 +123,13 @@ public class IWorkPackageParser extends AbstractParser {
                 metadata.add(Metadata.CONTENT_TYPE, type.getType().toString());
                 xhtml.startDocument();
                 if (contentHandler != null) {
-                    XMLReaderUtils.parseSAX(new CloseShieldInputStream(entryStream),
+                    XMLReaderUtils.parseSAX(CloseShieldInputStream.wrap(entryStream),
                             contentHandler, context);
                 }
                 xhtml.endDocument();
             }
 
-            entry = zip.getNextZipEntry();
+            entry = zip.getNextEntry();
         }
         // Don't close the zip InputStream (TIKA-1117).
     }


(tika) 02/03: TIKA-4199: download element to disk because of problems with commons-compress 1.26.0; complete delegate class

Posted by ti...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tilman pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit de1b2bb1524ad319d0f2238c18ca150be9a1df01
Author: Tilman Hausherr <ti...@apache.org>
AuthorDate: Tue Feb 20 15:24:02 2024 +0100

    TIKA-4199: download element to disk because of problems with commons-compress 1.26.0; complete delegate class
---
 .../org/apache/tika/parser/pkg/PackageParser.java  | 44 ++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index 3817f4cfb..623ed8875 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -29,6 +29,7 @@ import static org.apache.tika.detect.zip.PackageConstants.ZIP;
 import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.OutputStream;
 import java.nio.charset.Charset;
 import java.util.Collections;
 import java.util.Date;
@@ -448,6 +449,7 @@ public class PackageParser extends AbstractEncodingDetectorParser {
                 TemporaryResources tmp = new TemporaryResources();
                 try {
                     TikaInputStream tis = TikaInputStream.get(archive, tmp, entrydata);
+                    tis.getPath(); // fixes troubles with commons-compress 1.26.0
                     extractor.parseEmbedded(tis, xhtml, entrydata, true);
                 } finally {
                     tmp.dispose();
@@ -517,6 +519,48 @@ public class PackageParser extends AbstractEncodingDetectorParser {
         public void close() throws IOException {
             file.close();
         }
+
+        @Override
+        public byte[] readAllBytes() throws IOException {
+            return in.readAllBytes();
+        }
+
+        @Override
+        public byte[] readNBytes(int len) throws IOException  {
+            return in.readNBytes(len);
+        }
+
+        @Override
+        public int readNBytes(byte[] b, int off, int len) throws IOException {
+            return in.readNBytes(b, off, len);
+        }
+
+        @Override
+        public long skip(long n) throws IOException {
+            return in.skip(n);
+        }
+
+        @Override
+        public int available() throws IOException {
+            return in.available();
+        }
+
+        @Override
+        public synchronized void mark(int readlimit) {
+            in.mark(readlimit);
+        }
+
+        @Override
+        public boolean markSupported() {
+            return in.markSupported();
+        }
+
+        @Override
+        public long transferTo(OutputStream out) throws IOException {
+            return in.transferTo(out);
+        }
+        
+        
     }
 
     /**


(tika) 03/03: TIKA-4199: adjust test

Posted by ti...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tilman pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 49a13b365849940f8bc92c3e2079ae5ed2710040
Author: Tilman Hausherr <ti...@apache.org>
AuthorDate: Tue Feb 20 15:24:29 2024 +0100

    TIKA-4199: adjust test
---
 .../org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
index 60662b8ea..c718bb8ff 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
@@ -38,7 +38,7 @@ public class TruncatedOOXMLTest extends TikaTest {
         //this tests that there's a backoff to the pkg parser
         List<Metadata> metadataList =
                 getRecursiveMetadata(truncate("testWORD_various.docx", 13138), true);
-        assertEquals(19, metadataList.size());
+        assertEquals(18, metadataList.size());
         Metadata m = metadataList.get(0);
         assertEquals("application/x-tika-ooxml", m.get(Metadata.CONTENT_TYPE));
     }
@@ -48,7 +48,7 @@ public class TruncatedOOXMLTest extends TikaTest {
         //this is really truncated
         List<Metadata> metadataList =
                 getRecursiveMetadata(truncate("testWORD_various.docx", 774), true);
-        assertEquals(4, metadataList.size());
+        assertEquals(3, metadataList.size());
         Metadata m = metadataList.get(0);
         assertEquals("application/x-tika-ooxml", m.get(Metadata.CONTENT_TYPE));
     }