You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/08/24 18:12:55 UTC
[tika] 01/01: TIKA-4016 -- add length in InputStreamDigester
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-4016
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 29724dbd363dd57f66539f1a096e35d6b1d68e4f
Author: tallison <ta...@apache.org>
AuthorDate: Thu Aug 24 14:12:37 2023 -0400
TIKA-4016 -- add length in InputStreamDigester
---
CHANGES.txt | 4 +++
.../tika/parser/digest/InputStreamDigester.java | 32 ++++++++++++++++------
.../apache/tika/parser/DigestingParserTest.java | 17 ++++++++++++
3 files changed, 45 insertions(+), 8 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 674110b0f..c2f5b298f 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,7 @@
+Release 2.9.1 - ??
+
+ * The InputStreamDigester now calculates the stream length and adds it to the metadata when no content length has been set already (TIKA-4016).
+
Release 2.9.0 - 8/23/2023
* With user configuration, the PDFParser can now throw an EncryptedDocumentException
diff --git a/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java b/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java
index 22f89d82a..c3e4fde2c 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java
@@ -33,6 +33,7 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.DigestingParser;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.utils.StringUtils;
public class InputStreamDigester implements DigestingParser.Digester {
@@ -72,17 +73,25 @@ public class InputStreamDigester implements DigestingParser.Digester {
/**
* Copied from commons-codec
*/
- private static MessageDigest updateDigest(MessageDigest digest, InputStream data)
+ private static MessageDigest updateDigest(MessageDigest digest, InputStream data, Metadata metadata)
throws IOException {
byte[] buffer = new byte[1024];
-
+ long total = 0;
for (int read = data.read(buffer, 0, 1024); read > -1; read = data.read(buffer, 0, 1024)) {
digest.update(buffer, 0, read);
+ total += read;
}
-
+ setContentLength(total, metadata);
return digest;
}
+ private static void setContentLength(long length, Metadata metadata) {
+ if (StringUtils.isBlank(metadata.get(Metadata.CONTENT_LENGTH))) {
+ //only add it if it hasn't been populated already
+ metadata.set(Metadata.CONTENT_LENGTH, Long.toString(length));
+ }
+ }
+
private MessageDigest newMessageDigest() {
try {
Provider provider = getProvider();
@@ -128,7 +137,7 @@ public class InputStreamDigester implements DigestingParser.Digester {
//and its size is greater than its mark limit,
//just digest the underlying file.
if (sz > markLimit) {
- digestFile(tis.getFile(), metadata);
+ digestFile(tis.getFile(), sz, metadata);
return;
}
}
@@ -148,12 +157,12 @@ public class InputStreamDigester implements DigestingParser.Digester {
//if the stream wasn't finished -- if the stream was longer than the mark limit --
//spool to File and digest that.
if (tis != null) {
- digestFile(tis.getFile(), metadata);
+ digestFile(tis.getFile(), -1, metadata);
} else {
TemporaryResources tmp = new TemporaryResources();
try {
TikaInputStream tmpTikaInputStream = TikaInputStream.get(is, tmp, metadata);
- digestFile(tmpTikaInputStream.getFile(), metadata);
+ digestFile(tmpTikaInputStream.getFile(), -1, metadata);
} finally {
try {
tmp.dispose();
@@ -169,7 +178,14 @@ public class InputStreamDigester implements DigestingParser.Digester {
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + algorithmKeyName;
}
- private void digestFile(File f, Metadata m) throws IOException {
+ private void digestFile(File f, long sz, Metadata m) throws IOException {
+ //only add it if it hasn't been populated already
+ if (StringUtils.isBlank(m.get(Metadata.CONTENT_LENGTH))) {
+ if (sz < 0) {
+ sz = f.length();
+ }
+ setContentLength(sz, m);
+ }
try (InputStream is = new FileInputStream(f)) {
digestStream(is, m);
}
@@ -185,7 +201,7 @@ public class InputStreamDigester implements DigestingParser.Digester {
byte[] digestBytes;
MessageDigest messageDigest = newMessageDigest();
- updateDigest(messageDigest, is);
+ updateDigest(messageDigest, is, metadata);
digestBytes = messageDigest.digest();
if (is instanceof BoundedInputStream) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java
index 508d6f4d7..7ed78e055 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java
@@ -30,6 +30,7 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
import java.util.Random;
@@ -37,6 +38,7 @@ import org.apache.commons.codec.digest.DigestUtils;
import org.junit.jupiter.api.Test;
import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -115,6 +117,21 @@ public class DigestingParserTest extends TikaTest {
assertNull(m.get(P + CommonsDigester.DigestAlgorithm.MD2.toString()));
}
+ @Test
+ public void testLengthsCalculated() throws Exception {
+ //This tests that TIKA-4016 added lengths
+ //before TIKA-4016, lengths were missing from entries 0, 1 and 11 of the metadata list
+ TikaConfig config = null;
+ try (InputStream is = getResourceAsStream("/configs/tika-config-digests.xml")) {
+ config = new TikaConfig(is);
+ }
+ Parser p = new AutoDetectParser(config);
+ List<Metadata> metadataList = getRecursiveMetadata("test_recursive_embedded.docx", p);
+ for (Metadata m : metadataList) {
+ assertNotNull(m.get(Metadata.CONTENT_LENGTH));
+ }
+ }
+
@Test
public void testReset() throws Exception {
String expectedMD5 = "59f626e09a8c16ab6dbc2800c685f772";