You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ti...@apache.org on 2022/07/18 15:54:13 UTC
[tika] branch main updated: TIKA-3819: add md5 in debug mode
This is an automated email from the ASF dual-hosted git repository.
tilman pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 48608b096 TIKA-3819: add md5 in debug mode
48608b096 is described below
commit 48608b0961ceb499ba93ca277b8a296535dbeb57
Author: Tilman Hausherr <ti...@apache.org>
AuthorDate: Mon Jul 18 17:54:04 2022 +0200
TIKA-3819: add md5 in debug mode
---
.../java/org/apache/tika/parser/pdf/PDFParser.java | 36 ++++++++++++++++++++--
1 file changed, 34 insertions(+), 2 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 2839ed27d..4f2f8a6f0 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -16,10 +16,15 @@
*/
package org.apache.tika.parser.pdf;
+import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.nio.file.Files;
import java.nio.file.Path;
+import java.security.DigestInputStream;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Collections;
@@ -160,8 +165,9 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
} else {
tstream = TikaInputStream.cast(stream);
}
- if (LOG.isDebugEnabled()) {
- LOG.debug("File: " + tstream.getFile() + ", length: " + tstream.getLength());
+ if (LOG.isDebugEnabled() && tstream != null) {
+ LOG.debug("File: " + tstream.getPath() + ", length: " + tstream.getLength() +
+ ", md5: " + calcMD5(tstream.getPath()));
}
password = getPassword(metadata, context);
MemoryUsageSetting memoryUsageSetting = MemoryUsageSetting.setupMainMemoryOnly();
@@ -769,6 +775,32 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
defaultConfig.setImageStrategy(imageStrategy);
}
+ /**
+  * Computes the MD5 checksum of a file, for diagnostic logging only.
+  *
+  * @param path the file whose content is hashed
+  * @return the digest as lowercase hex, two characters per digest byte, or the literal
+  *         {@code "No MD5"} if no MD5 implementation is available
+  * @throws IOException if the file cannot be opened or read
+  */
+ private String calcMD5(Path path) throws IOException {
+     MessageDigest md;
+     try {
+         md = MessageDigest.getInstance("MD5");
+     } catch (NoSuchAlgorithmException ex) {
+         // Best effort: report the missing algorithm in the returned text
+         // instead of failing the caller.
+         return "No MD5";
+     }
+
+     try (InputStream is = new BufferedInputStream(Files.newInputStream(path));
+          DigestInputStream dis = new DigestInputStream(is, md)) {
+         // Drain the stream in chunks: every bulk read feeds the bytes it returned
+         // into md, instead of triggering one digest update per byte.
+         byte[] buffer = new byte[8192];
+         while (dis.read(buffer, 0, buffer.length) != -1) {
+             // nothing to do here: reading is what updates the digest
+         }
+     }
+
+     byte[] digest = md.digest();
+     StringBuilder hexString = new StringBuilder(2 * digest.length);
+     for (byte by : digest) {
+         int ih = 0xFF & by;
+         if (ih < 16) {
+             // Integer.toHexString omits the leading zero; keep two digits per byte
+             hexString.append('0');
+         }
+         hexString.append(Integer.toHexString(ih));
+     }
+     return hexString.toString();
+ }
+
/**
* Copied from AcroformDefaultFixup minus generation of appearances and handling of orphan
* widgets, which we don't need.