You are viewing a plain text version of this content. The canonical (HTML) link was removed during plain-text conversion; see the original mailing-list archive for it.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/10/19 16:03:20 UTC

tika git commit: TIKA-2123: digester fails with multiple digests on large files

Repository: tika
Updated Branches:
  refs/heads/2.x 30e03de89 -> 7e66e4979


TIKA-2123: digester fails with multiple digests on large files


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/7e66e497
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/7e66e497
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/7e66e497

Branch: refs/heads/2.x
Commit: 7e66e49797b7bfcfe7928d442e1d04b924bf2b6c
Parents: 30e03de
Author: tballison <ta...@mitre.org>
Authored: Wed Oct 19 12:03:12 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Oct 19 12:03:12 2016 -0400

----------------------------------------------------------------------
 .../apache/tika/parser/DigestingParserTest.java | 176 ++++++++++++++++---
 .../tika/parser/digesting/CommonsDigester.java  |  78 +++++---
 2 files changed, 197 insertions(+), 57 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/7e66e497/tika-app/src/test/java/org/apache/tika/parser/DigestingParserTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/parser/DigestingParserTest.java b/tika-app/src/test/java/org/apache/tika/parser/DigestingParserTest.java
index 66323d3..5988b86 100644
--- a/tika-app/src/test/java/org/apache/tika/parser/DigestingParserTest.java
+++ b/tika-app/src/test/java/org/apache/tika/parser/DigestingParserTest.java
@@ -17,15 +17,24 @@
 package org.apache.tika.parser;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.IOException;
 import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
 import java.util.HashMap;
 import java.util.Map;
+import java.util.Random;
 
+import org.apache.commons.codec.digest.DigestUtils;
 import org.apache.tika.TikaTest;
-import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
@@ -41,13 +50,17 @@ public class DigestingParserTest extends TikaTest {
     private final static String P = TikaCoreProperties.TIKA_META_PREFIX+
             "digest"+Metadata.NAMESPACE_PREFIX_DELIMITER;
 
-    private final int UNLIMITED = 1000000;//well, not really, but longer than input file
+    private final static int UNLIMITED = 1000000;//well, not really, but longer than input file
+
+    private final static long SEED = new Random().nextLong();
+
+    private final Random random = new Random(SEED);
     private final Parser p = new AutoDetectParser();
 
     @Test
     public void testBasic() throws Exception {
         Map<CommonsDigester.DigestAlgorithm, String> expected =
-                new HashMap<CommonsDigester.DigestAlgorithm, String>();
+                new HashMap<>();
 
         expected.put(CommonsDigester.DigestAlgorithm.MD2,"d768c8e27b0b52c6eaabfaa7122d1d4f");
         expected.put(CommonsDigester.DigestAlgorithm.MD5,"59f626e09a8c16ab6dbc2800c685f772");
@@ -90,33 +103,8 @@ public class DigestingParserTest extends TikaTest {
     }
 
     @Test
-    public void testLimitedRead() throws Exception {
-        CommonsDigester.DigestAlgorithm algo = CommonsDigester.DigestAlgorithm.MD5;
-        int limit = 100;
-        byte[] bytes = new byte[limit];
-        InputStream is = getResourceAsStream("/test-documents/test_recursive_embedded.docx");
-        is.read(bytes, 0, limit);
-        is.close();
-        Metadata m = new Metadata();
-        try {
-            XMLResult xml = getXML(TikaInputStream.get(bytes),
-                    new DigestingParser(p, new CommonsDigester(100, algo)), m);
-        } catch (TikaException e) {
-            //thrown because this is just a file fragment
-            assertContains("Unexpected RuntimeException from org.apache.tika.parser.microsoft.ooxml.OOXMLParser",
-                    e.getMessage());
-        }
-        String expectedMD5 = m.get(P+"MD5");
-
-        m = new Metadata();
-        XMLResult xml = getXML("test_recursive_embedded.docx",
-                new DigestingParser(p, new CommonsDigester(100, algo)), m);
-        assertEquals(expectedMD5, m.get(P+"MD5"));
-    }
-
-    @Test
     public void testReset() throws Exception {
-        String expectedMD5 = "1643c2cef21e36720c54f4f6cb3349d0";
+        String expectedMD5 = "59f626e09a8c16ab6dbc2800c685f772";
         Metadata m = new Metadata();
         XMLResult xml = getXML("test_recursive_embedded.docx",
                 new DigestingParser(p, new CommonsDigester(100, CommonsDigester.DigestAlgorithm.MD5)), m);
@@ -136,4 +124,134 @@ public class DigestingParserTest extends TikaTest {
         assertTrue("Exception not thrown", ex);
     }
 
+    @Test
+    public void testMultipleCombinations() throws Exception {
+        Path tmp = Files.createTempFile("tika-digesting-parser-test", "");
+
+        try {
+            //try some random lengths
+            for (int i = 0; i < 10; i++) {
+                testMulti(tmp, random.nextInt(100000), random.nextInt(100000), random.nextBoolean());
+            }
+            //try specific lengths
+            testMulti(tmp, 1000, 100000, true);
+            testMulti(tmp, 1000, 100000, false);
+            testMulti(tmp, 10000, 10001, true);
+            testMulti(tmp, 10000, 10001, false);
+            testMulti(tmp, 10000, 10000, true);
+            testMulti(tmp, 10000, 10000, false);
+            testMulti(tmp, 10000, 9999, true);
+            testMulti(tmp, 10000, 9999, false);
+
+
+            testMulti(tmp, 1000, 100, true);
+            testMulti(tmp, 1000, 100, false);
+            testMulti(tmp, 1000, 10, true);
+            testMulti(tmp, 1000, 10, false);
+            testMulti(tmp, 1000, 0, true);
+            testMulti(tmp, 1000, 0, false);
+
+            testMulti(tmp, 0, 100, true);
+            testMulti(tmp, 0, 100, false);
+
+        } finally {
+            Files.delete(tmp);
+        }
+    }
+
+    private void testMulti(Path tmp, int fileLength, int markLimit,
+                           boolean useTikaInputStream) throws IOException {
+
+        OutputStream os = new BufferedOutputStream(Files.newOutputStream(tmp,
+                StandardOpenOption.CREATE));
+
+        for (int i = 0; i < fileLength; i++) {
+            os.write(random.nextInt());
+        }
+        os.flush();
+        os.close();
+
+        Metadata truth = new Metadata();
+        addTruth(tmp, CommonsDigester.DigestAlgorithm.MD5, truth);
+        addTruth(tmp, CommonsDigester.DigestAlgorithm.SHA1, truth);
+        addTruth(tmp, CommonsDigester.DigestAlgorithm.SHA512, truth);
+
+
+        checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream,
+                CommonsDigester.DigestAlgorithm.SHA512,
+                CommonsDigester.DigestAlgorithm.SHA1,
+                CommonsDigester.DigestAlgorithm.MD5);
+
+        checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream,
+                CommonsDigester.DigestAlgorithm.MD5,
+                CommonsDigester.DigestAlgorithm.SHA1);
+
+        checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream,
+                CommonsDigester.DigestAlgorithm.SHA1,
+                CommonsDigester.DigestAlgorithm.SHA512,
+                CommonsDigester.DigestAlgorithm.MD5);
+
+        checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream,
+                CommonsDigester.DigestAlgorithm.SHA1);
+
+        checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream,
+                CommonsDigester.DigestAlgorithm.MD5);
+
+    }
+
+    private void checkMulti(Metadata truth, Path tmp,
+                            int fileLength, int markLimit,
+                            boolean useTikaInputStream, CommonsDigester.DigestAlgorithm... algos) throws IOException {
+        Metadata result = new Metadata();
+        CommonsDigester digester = new CommonsDigester(markLimit, algos);
+        try (InputStream is = useTikaInputStream ? TikaInputStream.get(tmp) :
+                new BufferedInputStream(Files.newInputStream(tmp))) {
+            digester.digest(is, result, new ParseContext());
+        }
+
+        for (CommonsDigester.DigestAlgorithm algo : algos) {
+            String truthValue = truth.get(P+algo.name());
+            String resultValue = result.get(P+algo.name());
+            assertNotNull("truth", truthValue);
+            assertNotNull("result", resultValue);
+
+            assertEquals("fileLength("+fileLength+") markLimit("+
+                    markLimit+") useTikaInputStream("+useTikaInputStream+")"+
+                    "algorithm("+algo.name()+") seed("+SEED+")",
+                    truthValue, resultValue);
+        }
+
+    }
+
+    private void addTruth(Path tmp, CommonsDigester.DigestAlgorithm algo, Metadata truth) throws IOException {
+        String digest = null;
+        try (InputStream is = Files.newInputStream(tmp)) {
+            switch (algo) {
+                case MD2:
+                    digest = DigestUtils.md2Hex(is);
+                    break;
+                case MD5:
+                    digest = DigestUtils.md5Hex(is);
+                    break;
+                case SHA1:
+                    digest = DigestUtils.sha1Hex(is);
+                    break;
+                case SHA256:
+                    digest = DigestUtils.sha256Hex(is);
+                    break;
+                case SHA384:
+                    digest = DigestUtils.sha384Hex(is);
+                    break;
+                case SHA512:
+                    digest = DigestUtils.sha512Hex(is);
+                    break;
+                default:
+                    throw new IllegalArgumentException("Sorry, not aware of algorithm: " + algo.toString());
+            }
+        }
+        truth.set(P+algo.name(), digest);
+
+    }
+
+
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/7e66e497/tika-core/src/main/java/org/apache/tika/parser/digesting/CommonsDigester.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/parser/digesting/CommonsDigester.java b/tika-core/src/main/java/org/apache/tika/parser/digesting/CommonsDigester.java
index e7b2405..6d480a0 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/digesting/CommonsDigester.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/digesting/CommonsDigester.java
@@ -27,6 +27,10 @@ import java.util.List;
 import java.util.Locale;
 
 import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
@@ -76,34 +80,54 @@ public class CommonsDigester implements DigestingParser.Digester {
 
     @Override
     public void digest(InputStream is, Metadata m, ParseContext parseContext) throws IOException {
-        InputStream tis = TikaInputStream.get(is);
-        long sz = -1;
-        if (((TikaInputStream)tis).hasFile()) {
-            sz = ((TikaInputStream)tis).getLength();
-        }
-        //if the file is definitely a file,
-        //and its size is greater than its mark limit,
-        //just digest the underlying file.
-        if (sz > markLimit) {
-            digestFile(((TikaInputStream)tis).getFile(), m);
-            return;
+        //if this is already a TikaInputStream, rely on the caller to close
+        //the stream and free the tmp file.
+        TikaInputStream tis = TikaInputStream.cast(is);
+
+        TemporaryResources tmp = null;
+        if (tis == null) {
+            //if this isn't a TikaInputStream, create a new TempResources
+            //and make sure to release it!!!
+            tmp = new TemporaryResources();
+            tis = TikaInputStream.get(new CloseShieldInputStream(is), tmp);
         }
+        try {
+            long sz = -1;
+            if (tis.hasFile()) {
+                sz = tis.getLength();
+            }
+            //if the file is definitely a file,
+            //and its size is greater than its mark limit,
+            //just digest the underlying file.
+            if (sz > markLimit) {
+                digestFile(tis.getFile(), m);
+                return;
+            }
 
-        //try the usual mark/reset stuff.
-        //however, if you actually hit the bound,
-        //then stop and spool to file via TikaInputStream
-        SimpleBoundedInputStream bis = new SimpleBoundedInputStream(markLimit, tis);
-        boolean finishedStream = false;
-        for (DigestAlgorithm algorithm : algorithms) {
-            bis.mark(markLimit + 1);
-            finishedStream = digestEach(algorithm, bis, m);
-            bis.reset();
+            //try the usual mark/reset stuff.
+            //however, if you actually hit the bound,
+            //then stop and spool to file via TikaInputStream
+            SimpleBoundedInputStream bis = new SimpleBoundedInputStream(markLimit, tis);
+            boolean finishedStream = false;
+            for (DigestAlgorithm algorithm : algorithms) {
+                bis.mark(markLimit + 1);
+                finishedStream = digestEach(algorithm, bis, m);
+                bis.reset();
+                if (!finishedStream) {
+                    break;
+                }
+            }
             if (!finishedStream) {
-                break;
+                digestFile(tis.getFile(), m);
+            }
+        } finally {
+            try {
+                if (tmp != null) {
+                    tmp.dispose();
+                }
+            } catch (TikaException e) {
+                throw new IOExceptionWithCause(e);
             }
-        }
-        if (!finishedStream) {
-            digestFile(((TikaInputStream)tis).getFile(), m);
         }
     }
 
@@ -170,7 +194,7 @@ public class CommonsDigester implements DigestingParser.Digester {
     public static DigestAlgorithm[] parse(String s) {
         assert(s != null);
 
-        List<DigestAlgorithm> ret = new ArrayList<DigestAlgorithm>();
+        List<DigestAlgorithm> ret = new ArrayList<>();
         for (String algoString : s.split(",")) {
             String uc = algoString.toUpperCase(Locale.ROOT);
             if (uc.equals(DigestAlgorithm.MD2.toString())) {
@@ -209,7 +233,6 @@ public class CommonsDigester implements DigestingParser.Digester {
         private final long max;
         private final InputStream in;
         private long pos;
-        boolean hitBound = false;
 
         private SimpleBoundedInputStream(long max, InputStream in) {
             this.max = max;
@@ -219,7 +242,6 @@ public class CommonsDigester implements DigestingParser.Digester {
         @Override
         public int read() throws IOException {
             if (max >= 0 && pos >= max) {
-                hitBound = true;
                 return EOF;
             }
             final int result = in.read();
@@ -289,7 +311,7 @@ public class CommonsDigester implements DigestingParser.Digester {
         }
 
         public boolean hasHitBound() {
-            return hitBound;
+            return pos >= max;
         }
     }
 }