You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ti...@apache.org on 2022/07/18 15:54:13 UTC

[tika] branch main updated: TIKA-3819: add md5 in debug mode

This is an automated email from the ASF dual-hosted git repository.

tilman pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 48608b096 TIKA-3819: add md5 in debug mode
48608b096 is described below

commit 48608b0961ceb499ba93ca277b8a296535dbeb57
Author: Tilman Hausherr <ti...@apache.org>
AuthorDate: Mon Jul 18 17:54:04 2022 +0200

    TIKA-3819: add md5 in debug mode
---
 .../java/org/apache/tika/parser/pdf/PDFParser.java | 36 ++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 2839ed27d..4f2f8a6f0 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -16,10 +16,15 @@
  */
 package org.apache.tika.parser.pdf;
 
+import java.io.BufferedInputStream;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.file.Files;
 import java.nio.file.Path;
+import java.security.DigestInputStream;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
 import java.util.Arrays;
 import java.util.Calendar;
 import java.util.Collections;
@@ -160,8 +165,9 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
             } else {
                 tstream = TikaInputStream.cast(stream);
             }
-            if (LOG.isDebugEnabled()) {
-                LOG.debug("File: " + tstream.getFile() + ", length: " + tstream.getLength());
+            if (LOG.isDebugEnabled() && tstream != null) {
+                LOG.debug("File: " + tstream.getPath() + ", length: " + tstream.getLength() + 
+                        ", md5: " + calcMD5(tstream.getPath()));
             }
             password = getPassword(metadata, context);
             MemoryUsageSetting memoryUsageSetting = MemoryUsageSetting.setupMainMemoryOnly();
@@ -769,6 +775,32 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
         defaultConfig.setImageStrategy(imageStrategy);
     }
 
+    /** Returns the file's MD5 as lowercase hex, or "No MD5" if the JRE lacks MD5. */
+    private String calcMD5(Path path) throws IOException {
+        MessageDigest md;
+        try {
+            md = MessageDigest.getInstance("MD5");
+        } catch (NoSuchAlgorithmException ex) {
+            // Return a placeholder rather than fail when no MD5 provider is available.
+            return "No MD5";
+        }
+
+        try (InputStream is = new BufferedInputStream(Files.newInputStream(path));
+                DigestInputStream dis = new DigestInputStream(is, md)) {
+            // Bulk reads feed the digest whole chunks; read() would update it byte by byte.
+            byte[] chunk = new byte[8192];
+            while (dis.read(chunk) >= 0) {
+                // The bytes are discarded: reading through the stream updates the digest.
+            }
+        }
+        StringBuilder hexString = new StringBuilder();
+        for (byte by : md.digest()) {
+            // 0x100 | value always gives three hex digits; dropping the first leaves two.
+            hexString.append(Integer.toHexString(0x100 | (0xFF & by)).substring(1));
+        }
+        return hexString.toString();
+    }
+
     /**
      * Copied from AcroformDefaultFixup minus generation of appearances and handling of orphan
      * widgets, which we don't need.