You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/10/19 16:03:20 UTC
tika git commit: TIKA-2123: digester fails with multiple digests on large files
Repository: tika
Updated Branches:
refs/heads/2.x 30e03de89 -> 7e66e4979
TIKA-2123: digester fails with multiple digests on large files
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/7e66e497
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/7e66e497
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/7e66e497
Branch: refs/heads/2.x
Commit: 7e66e49797b7bfcfe7928d442e1d04b924bf2b6c
Parents: 30e03de
Author: tballison <ta...@mitre.org>
Authored: Wed Oct 19 12:03:12 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Oct 19 12:03:12 2016 -0400
----------------------------------------------------------------------
.../apache/tika/parser/DigestingParserTest.java | 176 ++++++++++++++++---
.../tika/parser/digesting/CommonsDigester.java | 78 +++++---
2 files changed, 197 insertions(+), 57 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/7e66e497/tika-app/src/test/java/org/apache/tika/parser/DigestingParserTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/parser/DigestingParserTest.java b/tika-app/src/test/java/org/apache/tika/parser/DigestingParserTest.java
index 66323d3..5988b86 100644
--- a/tika-app/src/test/java/org/apache/tika/parser/DigestingParserTest.java
+++ b/tika-app/src/test/java/org/apache/tika/parser/DigestingParserTest.java
@@ -17,15 +17,24 @@
package org.apache.tika.parser;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.IOException;
import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
import java.util.HashMap;
import java.util.Map;
+import java.util.Random;
+import org.apache.commons.codec.digest.DigestUtils;
import org.apache.tika.TikaTest;
-import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -41,13 +50,17 @@ public class DigestingParserTest extends TikaTest {
private final static String P = TikaCoreProperties.TIKA_META_PREFIX+
"digest"+Metadata.NAMESPACE_PREFIX_DELIMITER;
- private final int UNLIMITED = 1000000;//well, not really, but longer than input file
+ private final static int UNLIMITED = 1000000;//well, not really, but longer than input file
+
+ private final static long SEED = new Random().nextLong();
+
+ private final Random random = new Random(SEED);
private final Parser p = new AutoDetectParser();
@Test
public void testBasic() throws Exception {
Map<CommonsDigester.DigestAlgorithm, String> expected =
- new HashMap<CommonsDigester.DigestAlgorithm, String>();
+ new HashMap<>();
expected.put(CommonsDigester.DigestAlgorithm.MD2,"d768c8e27b0b52c6eaabfaa7122d1d4f");
expected.put(CommonsDigester.DigestAlgorithm.MD5,"59f626e09a8c16ab6dbc2800c685f772");
@@ -90,33 +103,8 @@ public class DigestingParserTest extends TikaTest {
}
@Test
- public void testLimitedRead() throws Exception {
- CommonsDigester.DigestAlgorithm algo = CommonsDigester.DigestAlgorithm.MD5;
- int limit = 100;
- byte[] bytes = new byte[limit];
- InputStream is = getResourceAsStream("/test-documents/test_recursive_embedded.docx");
- is.read(bytes, 0, limit);
- is.close();
- Metadata m = new Metadata();
- try {
- XMLResult xml = getXML(TikaInputStream.get(bytes),
- new DigestingParser(p, new CommonsDigester(100, algo)), m);
- } catch (TikaException e) {
- //thrown because this is just a file fragment
- assertContains("Unexpected RuntimeException from org.apache.tika.parser.microsoft.ooxml.OOXMLParser",
- e.getMessage());
- }
- String expectedMD5 = m.get(P+"MD5");
-
- m = new Metadata();
- XMLResult xml = getXML("test_recursive_embedded.docx",
- new DigestingParser(p, new CommonsDigester(100, algo)), m);
- assertEquals(expectedMD5, m.get(P+"MD5"));
- }
-
- @Test
public void testReset() throws Exception {
- String expectedMD5 = "1643c2cef21e36720c54f4f6cb3349d0";
+ String expectedMD5 = "59f626e09a8c16ab6dbc2800c685f772";
Metadata m = new Metadata();
XMLResult xml = getXML("test_recursive_embedded.docx",
new DigestingParser(p, new CommonsDigester(100, CommonsDigester.DigestAlgorithm.MD5)), m);
@@ -136,4 +124,134 @@ public class DigestingParserTest extends TikaTest {
assertTrue("Exception not thrown", ex);
}
+ @Test
+ public void testMultipleCombinations() throws Exception {
+ Path tmp = Files.createTempFile("tika-digesting-parser-test", "");
+
+ try {
+ //try some random lengths
+ for (int i = 0; i < 10; i++) {
+ testMulti(tmp, random.nextInt(100000), random.nextInt(100000), random.nextBoolean());
+ }
+ //try specific lengths
+ testMulti(tmp, 1000, 100000, true);
+ testMulti(tmp, 1000, 100000, false);
+ testMulti(tmp, 10000, 10001, true);
+ testMulti(tmp, 10000, 10001, false);
+ testMulti(tmp, 10000, 10000, true);
+ testMulti(tmp, 10000, 10000, false);
+ testMulti(tmp, 10000, 9999, true);
+ testMulti(tmp, 10000, 9999, false);
+
+
+ testMulti(tmp, 1000, 100, true);
+ testMulti(tmp, 1000, 100, false);
+ testMulti(tmp, 1000, 10, true);
+ testMulti(tmp, 1000, 10, false);
+ testMulti(tmp, 1000, 0, true);
+ testMulti(tmp, 1000, 0, false);
+
+ testMulti(tmp, 0, 100, true);
+ testMulti(tmp, 0, 100, false);
+
+ } finally {
+ Files.delete(tmp);
+ }
+ }
+
+ private void testMulti(Path tmp, int fileLength, int markLimit,
+ boolean useTikaInputStream) throws IOException {
+
+ OutputStream os = new BufferedOutputStream(Files.newOutputStream(tmp,
+ StandardOpenOption.CREATE));
+
+ for (int i = 0; i < fileLength; i++) {
+ os.write(random.nextInt());
+ }
+ os.flush();
+ os.close();
+
+ Metadata truth = new Metadata();
+ addTruth(tmp, CommonsDigester.DigestAlgorithm.MD5, truth);
+ addTruth(tmp, CommonsDigester.DigestAlgorithm.SHA1, truth);
+ addTruth(tmp, CommonsDigester.DigestAlgorithm.SHA512, truth);
+
+
+ checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream,
+ CommonsDigester.DigestAlgorithm.SHA512,
+ CommonsDigester.DigestAlgorithm.SHA1,
+ CommonsDigester.DigestAlgorithm.MD5);
+
+ checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream,
+ CommonsDigester.DigestAlgorithm.MD5,
+ CommonsDigester.DigestAlgorithm.SHA1);
+
+ checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream,
+ CommonsDigester.DigestAlgorithm.SHA1,
+ CommonsDigester.DigestAlgorithm.SHA512,
+ CommonsDigester.DigestAlgorithm.MD5);
+
+ checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream,
+ CommonsDigester.DigestAlgorithm.SHA1);
+
+ checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream,
+ CommonsDigester.DigestAlgorithm.MD5);
+
+ }
+
+ private void checkMulti(Metadata truth, Path tmp,
+ int fileLength, int markLimit,
+ boolean useTikaInputStream, CommonsDigester.DigestAlgorithm... algos) throws IOException {
+ Metadata result = new Metadata();
+ CommonsDigester digester = new CommonsDigester(markLimit, algos);
+ try (InputStream is = useTikaInputStream ? TikaInputStream.get(tmp) :
+ new BufferedInputStream(Files.newInputStream(tmp))) {
+ digester.digest(is, result, new ParseContext());
+ }
+
+ for (CommonsDigester.DigestAlgorithm algo : algos) {
+ String truthValue = truth.get(P+algo.name());
+ String resultValue = result.get(P+algo.name());
+ assertNotNull("truth", truthValue);
+ assertNotNull("result", resultValue);
+
+ assertEquals("fileLength("+fileLength+") markLimit("+
+ markLimit+") useTikaInputStream("+useTikaInputStream+")"+
+ "algorithm("+algo.name()+") seed("+SEED+")",
+ truthValue, resultValue);
+ }
+
+ }
+
+ private void addTruth(Path tmp, CommonsDigester.DigestAlgorithm algo, Metadata truth) throws IOException {
+ String digest = null;
+ try (InputStream is = Files.newInputStream(tmp)) {
+ switch (algo) {
+ case MD2:
+ digest = DigestUtils.md2Hex(is);
+ break;
+ case MD5:
+ digest = DigestUtils.md5Hex(is);
+ break;
+ case SHA1:
+ digest = DigestUtils.sha1Hex(is);
+ break;
+ case SHA256:
+ digest = DigestUtils.sha256Hex(is);
+ break;
+ case SHA384:
+ digest = DigestUtils.sha384Hex(is);
+ break;
+ case SHA512:
+ digest = DigestUtils.sha512Hex(is);
+ break;
+ default:
+ throw new IllegalArgumentException("Sorry, not aware of algorithm: " + algo.toString());
+ }
+ }
+ truth.set(P+algo.name(), digest);
+
+ }
+
+
}
http://git-wip-us.apache.org/repos/asf/tika/blob/7e66e497/tika-core/src/main/java/org/apache/tika/parser/digesting/CommonsDigester.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/parser/digesting/CommonsDigester.java b/tika-core/src/main/java/org/apache/tika/parser/digesting/CommonsDigester.java
index e7b2405..6d480a0 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/digesting/CommonsDigester.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/digesting/CommonsDigester.java
@@ -27,6 +27,10 @@ import java.util.List;
import java.util.Locale;
import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -76,34 +80,54 @@ public class CommonsDigester implements DigestingParser.Digester {
@Override
public void digest(InputStream is, Metadata m, ParseContext parseContext) throws IOException {
- InputStream tis = TikaInputStream.get(is);
- long sz = -1;
- if (((TikaInputStream)tis).hasFile()) {
- sz = ((TikaInputStream)tis).getLength();
- }
- //if the file is definitely a file,
- //and its size is greater than its mark limit,
- //just digest the underlying file.
- if (sz > markLimit) {
- digestFile(((TikaInputStream)tis).getFile(), m);
- return;
+ //if this is already a TikaInputStream, rely on the caller to close
+ //the stream and free the tmp file.
+ TikaInputStream tis = TikaInputStream.cast(is);
+
+ TemporaryResources tmp = null;
+ if (tis == null) {
+ //if this isn't a TikaInputStream, create a new TempResources
+ //and make sure to release it!!!
+ tmp = new TemporaryResources();
+ tis = TikaInputStream.get(new CloseShieldInputStream(is), tmp);
}
+ try {
+ long sz = -1;
+ if (tis.hasFile()) {
+ sz = tis.getLength();
+ }
+ //if the file is definitely a file,
+ //and its size is greater than its mark limit,
+ //just digest the underlying file.
+ if (sz > markLimit) {
+ digestFile(tis.getFile(), m);
+ return;
+ }
- //try the usual mark/reset stuff.
- //however, if you actually hit the bound,
- //then stop and spool to file via TikaInputStream
- SimpleBoundedInputStream bis = new SimpleBoundedInputStream(markLimit, tis);
- boolean finishedStream = false;
- for (DigestAlgorithm algorithm : algorithms) {
- bis.mark(markLimit + 1);
- finishedStream = digestEach(algorithm, bis, m);
- bis.reset();
+ //try the usual mark/reset stuff.
+ //however, if you actually hit the bound,
+ //then stop and spool to file via TikaInputStream
+ SimpleBoundedInputStream bis = new SimpleBoundedInputStream(markLimit, tis);
+ boolean finishedStream = false;
+ for (DigestAlgorithm algorithm : algorithms) {
+ bis.mark(markLimit + 1);
+ finishedStream = digestEach(algorithm, bis, m);
+ bis.reset();
+ if (!finishedStream) {
+ break;
+ }
+ }
if (!finishedStream) {
- break;
+ digestFile(tis.getFile(), m);
+ }
+ } finally {
+ try {
+ if (tmp != null) {
+ tmp.dispose();
+ }
+ } catch (TikaException e) {
+ throw new IOExceptionWithCause(e);
}
- }
- if (!finishedStream) {
- digestFile(((TikaInputStream)tis).getFile(), m);
}
}
@@ -170,7 +194,7 @@ public class CommonsDigester implements DigestingParser.Digester {
public static DigestAlgorithm[] parse(String s) {
assert(s != null);
- List<DigestAlgorithm> ret = new ArrayList<DigestAlgorithm>();
+ List<DigestAlgorithm> ret = new ArrayList<>();
for (String algoString : s.split(",")) {
String uc = algoString.toUpperCase(Locale.ROOT);
if (uc.equals(DigestAlgorithm.MD2.toString())) {
@@ -209,7 +233,6 @@ public class CommonsDigester implements DigestingParser.Digester {
private final long max;
private final InputStream in;
private long pos;
- boolean hitBound = false;
private SimpleBoundedInputStream(long max, InputStream in) {
this.max = max;
@@ -219,7 +242,6 @@ public class CommonsDigester implements DigestingParser.Digester {
@Override
public int read() throws IOException {
if (max >= 0 && pos >= max) {
- hitBound = true;
return EOF;
}
final int result = in.read();
@@ -289,7 +311,7 @@ public class CommonsDigester implements DigestingParser.Digester {
}
public boolean hasHitBound() {
- return hitBound;
+ return pos >= max;
}
}
}