Posted to commits@tika.apache.org by ta...@apache.org on 2023/08/24 18:12:55 UTC

[tika] 01/01: TIKA-4016 -- add length in InputStreamDigester

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4016
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 29724dbd363dd57f66539f1a096e35d6b1d68e4f
Author: tallison <ta...@apache.org>
AuthorDate: Thu Aug 24 14:12:37 2023 -0400

    TIKA-4016 -- add length in InputStreamDigester
---
 CHANGES.txt                                        |  4 +++
 .../tika/parser/digest/InputStreamDigester.java    | 32 ++++++++++++++++------
 .../apache/tika/parser/DigestingParserTest.java    | 17 ++++++++++++
 3 files changed, 45 insertions(+), 8 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 674110b0f..c2f5b298f 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,7 @@
+Release 2.9.1 - ??
+
+   * The InputStreamDigester now calculates stream length (TIKA-4016).
+
 Release 2.9.0 - 8/23/2023
 
    * With user configuration, the PDFParser can now throw an EncryptedDocumentException
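
For context on the user-visible effect: after this change, Metadata.CONTENT_LENGTH
is populated as a side effect of digesting whenever the caller has not already set
it. A minimal, hypothetical sketch of driving the digester directly -- the mark
limit, algorithm and Base64 encoder below are illustrative assumptions, not part of
this commit:

    import java.io.InputStream;
    import java.nio.file.Paths;
    import java.util.Base64;

    import org.apache.tika.io.TikaInputStream;
    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.DigestingParser;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.digest.InputStreamDigester;

    public class DigestLengthSketch {
        public static void main(String[] args) throws Exception {
            //assumed settings: 1MB mark limit, SHA-256, Base64 encoding of the digest
            DigestingParser.Digester digester = new InputStreamDigester(
                    1_000_000, "SHA-256",
                    bytes -> Base64.getEncoder().encodeToString(bytes));

            Metadata metadata = new Metadata();
            try (InputStream is = TikaInputStream.get(Paths.get(args[0]))) {
                digester.digest(is, metadata, new ParseContext());
            }
            //with this patch, the stream length is recorded alongside the digest
            System.out.println(metadata.get(Metadata.CONTENT_LENGTH));
        }
    }
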
diff --git a/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java b/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java
index 22f89d82a..c3e4fde2c 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java
@@ -33,6 +33,7 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.DigestingParser;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.utils.StringUtils;
 
 public class InputStreamDigester implements DigestingParser.Digester {
 
@@ -72,17 +73,25 @@ public class InputStreamDigester implements DigestingParser.Digester {
     /**
      * Copied from commons-codec
      */
-    private static MessageDigest updateDigest(MessageDigest digest, InputStream data)
+    private static MessageDigest updateDigest(MessageDigest digest, InputStream data, Metadata metadata)
             throws IOException {
         byte[] buffer = new byte[1024];
-
+        long total = 0;
         for (int read = data.read(buffer, 0, 1024); read > -1; read = data.read(buffer, 0, 1024)) {
             digest.update(buffer, 0, read);
+            total += read;
         }
-
+        setContentLength(total, metadata);
         return digest;
     }
 
+    private static void setContentLength(long length, Metadata metadata) {
+        if (StringUtils.isBlank(metadata.get(Metadata.CONTENT_LENGTH))) {
+            //only add it if it hasn't been populated already
+            metadata.set(Metadata.CONTENT_LENGTH, Long.toString(length));
+        }
+    }
+
     private MessageDigest newMessageDigest() {
         try {
             Provider provider = getProvider();
@@ -128,7 +137,7 @@ public class InputStreamDigester implements DigestingParser.Digester {
             //and its size is greater than its mark limit,
             //just digest the underlying file.
             if (sz > markLimit) {
-                digestFile(tis.getFile(), metadata);
+                digestFile(tis.getFile(), sz, metadata);
                 return;
             }
         }
@@ -148,12 +157,12 @@ public class InputStreamDigester implements DigestingParser.Digester {
         //if the stream wasn't finished -- if the stream was longer than the mark limit --
         //spool to File and digest that.
         if (tis != null) {
-            digestFile(tis.getFile(), metadata);
+            digestFile(tis.getFile(), -1, metadata);
         } else {
             TemporaryResources tmp = new TemporaryResources();
             try {
                 TikaInputStream tmpTikaInputStream = TikaInputStream.get(is, tmp, metadata);
-                digestFile(tmpTikaInputStream.getFile(), metadata);
+                digestFile(tmpTikaInputStream.getFile(), -1, metadata);
             } finally {
                 try {
                     tmp.dispose();
@@ -169,7 +178,14 @@ public class InputStreamDigester implements DigestingParser.Digester {
                 TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + algorithmKeyName;
     }
 
-    private void digestFile(File f, Metadata m) throws IOException {
+    private void digestFile(File f, long sz, Metadata m) throws IOException {
+        //only add it if it hasn't been populated already
+        if (StringUtils.isBlank(m.get(Metadata.CONTENT_LENGTH))) {
+            if (sz < 0) {
+                sz = f.length();
+            }
+            setContentLength(sz, m);
+        }
         try (InputStream is = new FileInputStream(f)) {
             digestStream(is, m);
         }
@@ -185,7 +201,7 @@ public class InputStreamDigester implements DigestingParser.Digester {
         byte[] digestBytes;
         MessageDigest messageDigest = newMessageDigest();
 
-        updateDigest(messageDigest, is);
+        updateDigest(messageDigest, is, metadata);
         digestBytes = messageDigest.digest();
 
         if (is instanceof BoundedInputStream) {
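
One behavior worth noting from the hunks above: the length is only written when
Content-Length is still blank, and digestFile() falls back to File.length() when it
is handed the -1 sentinel, so a length supplied upstream (for example by a fetcher)
wins. A hypothetical sketch of that caller-wins behavior, reusing the assumed
digester setup from the earlier sketch:

    Metadata metadata = new Metadata();
    //a Content-Length set before digesting is left untouched
    metadata.set(Metadata.CONTENT_LENGTH, "12345");

    try (InputStream is = TikaInputStream.get(Paths.get(args[0]))) {
        digester.digest(is, metadata, new ParseContext());
    }
    //still "12345": setContentLength() only writes when the field is blank
    System.out.println(metadata.get(Metadata.CONTENT_LENGTH));
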
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java
index 508d6f4d7..7ed78e055 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java
@@ -30,6 +30,7 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.Random;
 
@@ -37,6 +38,7 @@ import org.apache.commons.codec.digest.DigestUtils;
 import org.junit.jupiter.api.Test;
 
 import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
@@ -115,6 +117,21 @@ public class DigestingParserTest extends TikaTest {
         assertNull(m.get(P + CommonsDigester.DigestAlgorithm.MD2.toString()));
     }
 
+    @Test
+    public void testLengthsCalculated() throws Exception {
+        //This tests that TIKA-4016 added lengths
+        //before TIKA-4016, lengths were missing from 0, 1 and 11
+        TikaConfig config = null;
+        try (InputStream is = getResourceAsStream("/configs/tika-config-digests.xml")) {
+            config = new TikaConfig(is);
+        }
+        Parser p = new AutoDetectParser(config);
+        List<Metadata> metadataList = getRecursiveMetadata("test_recursive_embedded.docx", p);
+        for (Metadata m : metadataList) {
+            assertNotNull(m.get(Metadata.CONTENT_LENGTH));
+        }
+    }
+
     @Test
     public void testReset() throws Exception {
         String expectedMD5 = "59f626e09a8c16ab6dbc2800c685f772";
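
For readers unfamiliar with the test harness: getRecursiveMetadata() above is a
TikaTest helper. It corresponds roughly to the stand-alone sketch below; the wiring
is an approximation, and the digester itself comes from the tika-config-digests.xml
loaded in the test, whose contents are not shown here.

    import java.io.InputStream;
    import java.nio.file.Paths;
    import java.util.List;

    import org.apache.tika.config.TikaConfig;
    import org.apache.tika.io.TikaInputStream;
    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.AutoDetectParser;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.Parser;
    import org.apache.tika.parser.RecursiveParserWrapper;
    import org.apache.tika.sax.BasicContentHandlerFactory;
    import org.apache.tika.sax.RecursiveParserWrapperHandler;

    public class RecursiveLengthSketch {
        public static void main(String[] args) throws Exception {
            //args[0]: path to a tika config that enables a digester
            //args[1]: path to a container document (e.g. a docx with embedded files)
            TikaConfig config;
            try (InputStream is = TikaInputStream.get(Paths.get(args[0]))) {
                config = new TikaConfig(is);
            }
            Parser parser = new AutoDetectParser(config);
            RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
            RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
                    new BasicContentHandlerFactory(
                            BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
            try (InputStream is = TikaInputStream.get(Paths.get(args[1]))) {
                wrapper.parse(is, handler, new Metadata(), new ParseContext());
            }
            List<Metadata> metadataList = handler.getMetadataList();
            for (Metadata m : metadataList) {
                //with TIKA-4016, every entry should now carry a Content-Length
                System.out.println(m.get(Metadata.CONTENT_LENGTH));
            }
        }
    }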