You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/08/24 18:12:54 UTC

[tika] branch TIKA-4016 created (now 29724dbd3)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4016
in repository https://gitbox.apache.org/repos/asf/tika.git


      at 29724dbd3 TIKA-4016 -- add length in InputStreamDigester

This branch includes the following new commits:

     new 29724dbd3 TIKA-4016 -- add length in InputStreamDigester

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[tika] 01/01: TIKA-4016 -- add length in InputStreamDigester

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4016
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 29724dbd363dd57f66539f1a096e35d6b1d68e4f
Author: tallison <ta...@apache.org>
AuthorDate: Thu Aug 24 14:12:37 2023 -0400

    TIKA-4016 -- add length in InputStreamDigester
---
 CHANGES.txt                                        |  4 +++
 .../tika/parser/digest/InputStreamDigester.java    | 32 ++++++++++++++++------
 .../apache/tika/parser/DigestingParserTest.java    | 17 ++++++++++++
 3 files changed, 45 insertions(+), 8 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 674110b0f..c2f5b298f 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,7 @@
+Release 2.9.1 - ??
+
+   * The InputStreamDigester now calculates stream length (TIKA-4016).
+
 Release 2.9.0 - 8/23/2023
 
    * With user configuration, the PDFParser can now throw an EncryptedDocumentException
diff --git a/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java b/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java
index 22f89d82a..c3e4fde2c 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java
@@ -33,6 +33,7 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.DigestingParser;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.utils.StringUtils;
 
 public class InputStreamDigester implements DigestingParser.Digester {
 
@@ -72,17 +73,25 @@ public class InputStreamDigester implements DigestingParser.Digester {
     /**
      * Copied from commons-codec
      */
-    private static MessageDigest updateDigest(MessageDigest digest, InputStream data)
+    private static MessageDigest updateDigest(MessageDigest digest, InputStream data, Metadata metadata)
             throws IOException {
         byte[] buffer = new byte[1024];
-
+        long total = 0;
         for (int read = data.read(buffer, 0, 1024); read > -1; read = data.read(buffer, 0, 1024)) {
             digest.update(buffer, 0, read);
+            total += read;
         }
-
+        setContentLength(total, metadata);
         return digest;
     }
 
+    private static void setContentLength(long length, Metadata metadata) {
+        if (StringUtils.isBlank(metadata.get(Metadata.CONTENT_LENGTH))) {
+            //only add it if it hasn't been populated already
+            metadata.set(Metadata.CONTENT_LENGTH, Long.toString(length));
+        }
+    }
+
     private MessageDigest newMessageDigest() {
         try {
             Provider provider = getProvider();
@@ -128,7 +137,7 @@ public class InputStreamDigester implements DigestingParser.Digester {
             //and its size is greater than its mark limit,
             //just digest the underlying file.
             if (sz > markLimit) {
-                digestFile(tis.getFile(), metadata);
+                digestFile(tis.getFile(), sz, metadata);
                 return;
             }
         }
@@ -148,12 +157,12 @@ public class InputStreamDigester implements DigestingParser.Digester {
         //if the stream wasn't finished -- if the stream was longer than the mark limit --
         //spool to File and digest that.
         if (tis != null) {
-            digestFile(tis.getFile(), metadata);
+            digestFile(tis.getFile(), -1, metadata);
         } else {
             TemporaryResources tmp = new TemporaryResources();
             try {
                 TikaInputStream tmpTikaInputStream = TikaInputStream.get(is, tmp, metadata);
-                digestFile(tmpTikaInputStream.getFile(), metadata);
+                digestFile(tmpTikaInputStream.getFile(), -1, metadata);
             } finally {
                 try {
                     tmp.dispose();
@@ -169,7 +178,14 @@ public class InputStreamDigester implements DigestingParser.Digester {
                 TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + algorithmKeyName;
     }
 
-    private void digestFile(File f, Metadata m) throws IOException {
+    private void digestFile(File f, long sz, Metadata m) throws IOException {
+        //only add it if it hasn't been populated already
+        if (StringUtils.isBlank(m.get(Metadata.CONTENT_LENGTH))) {
+            if (sz < 0) {
+                sz = f.length();
+            }
+            setContentLength(sz, m);
+        }
         try (InputStream is = new FileInputStream(f)) {
             digestStream(is, m);
         }
@@ -185,7 +201,7 @@ public class InputStreamDigester implements DigestingParser.Digester {
         byte[] digestBytes;
         MessageDigest messageDigest = newMessageDigest();
 
-        updateDigest(messageDigest, is);
+        updateDigest(messageDigest, is, metadata);
         digestBytes = messageDigest.digest();
 
         if (is instanceof BoundedInputStream) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java
index 508d6f4d7..7ed78e055 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java
@@ -30,6 +30,7 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.Random;
 
@@ -37,6 +38,7 @@ import org.apache.commons.codec.digest.DigestUtils;
 import org.junit.jupiter.api.Test;
 
 import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
@@ -115,6 +117,21 @@ public class DigestingParserTest extends TikaTest {
         assertNull(m.get(P + CommonsDigester.DigestAlgorithm.MD2.toString()));
     }
 
+    @Test
+    public void testLengthsCalculated() throws Exception {
+        //This tests that TIKA-4016 added lengths
+        //before TIKA-4016, lengths were missing from 0, 1 and 11
+        TikaConfig config = null;
+        try (InputStream is = getResourceAsStream("/configs/tika-config-digests.xml")) {
+            config = new TikaConfig(is);
+        }
+        Parser p = new AutoDetectParser(config);
+        List<Metadata> metadataList = getRecursiveMetadata("test_recursive_embedded.docx", p);
+        for (Metadata m : metadataList) {
+            assertNotNull(m.get(Metadata.CONTENT_LENGTH));
+        }
+    }
+
     @Test
     public void testReset() throws Exception {
         String expectedMD5 = "59f626e09a8c16ab6dbc2800c685f772";