You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/06/05 18:16:26 UTC
[tika] branch master updated: TIKA-2386 -- enable more options for
DigestingParser
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 5410928 TIKA-2386 -- enable more options for DigestingParser
5410928 is described below
commit 5410928d740baf6cdcf6ce225f105482f3a2fc11
Author: tballison <ta...@mitre.org>
AuthorDate: Mon Jun 5 14:16:16 2017 -0400
TIKA-2386 -- enable more options for DigestingParser
---
CHANGES.txt | 5 +
.../batch/builders/AppParserFactoryBuilder.java | 15 +-
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 14 +-
.../org/apache/tika/parser/DigestingParser.java | 13 +-
.../tika/parser/digest/CompositeDigester.java | 56 ++++
.../tika/parser/digest/InputStreamDigester.java | 254 +++++++--------
.../tika/parser/utils/BouncyCastleDigester.java | 101 ++++++
.../apache/tika/parser/utils/CommonsDigester.java | 356 +++++++--------------
...t.java => BouncyCastleDigestingParserTest.java} | 186 ++++++-----
.../apache/tika/parser/DigestingParserTest.java | 54 ++--
.../java/org/apache/tika/server/TikaServerCli.java | 15 +-
.../java/org/apache/tika/server/CXFTestBase.java | 2 +-
.../org/apache/tika/server/TikaResourceTest.java | 2 +
13 files changed, 562 insertions(+), 511 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index a9ffd32..dabf119 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,8 @@
+Release 1.15.1 - ??/??/????
+
+ * Enable base32 encoding of digests and enable BouncyCastle implementations
+ of digest algorithms (TIKA-2386).
+
Release 1.15 - 05/23/2017
* Tika now has a module for Deep Learning powered by the
diff --git a/tika-app/src/main/java/org/apache/tika/batch/builders/AppParserFactoryBuilder.java b/tika-app/src/main/java/org/apache/tika/batch/builders/AppParserFactoryBuilder.java
index 998f649..ec05a46 100644
--- a/tika-app/src/main/java/org/apache/tika/batch/builders/AppParserFactoryBuilder.java
+++ b/tika-app/src/main/java/org/apache/tika/batch/builders/AppParserFactoryBuilder.java
@@ -23,6 +23,7 @@ import java.util.Map;
import org.apache.tika.batch.DigestingAutoDetectParserFactory;
import org.apache.tika.batch.ParserFactory;
import org.apache.tika.parser.DigestingParser;
+import org.apache.tika.parser.utils.BouncyCastleDigester;
import org.apache.tika.parser.utils.CommonsDigester;
import org.apache.tika.util.ClassLoaderUtil;
import org.apache.tika.util.XMLDOMUtil;
@@ -55,8 +56,6 @@ public class AppParserFactoryBuilder implements IParserFactoryBuilder {
}
private DigestingParser.Digester buildDigester(Map<String, String> localAttrs) {
- String digestString = localAttrs.get("digest");
- CommonsDigester.DigestAlgorithm[] algos = CommonsDigester.parse(digestString);
String readLimitString = localAttrs.get("digestMarkLimit");
if (readLimitString == null) {
@@ -71,6 +70,16 @@ public class AppParserFactoryBuilder implements IParserFactoryBuilder {
throw new IllegalArgumentException("Parameter \"digestMarkLimit\" must be a parseable int: "+
readLimitString);
}
- return new CommonsDigester(readLimit, algos);
+ String digestString = localAttrs.get("digest");
+ try {
+ return new CommonsDigester(readLimit, digestString);
+ } catch (IllegalArgumentException commonsException) {
+ try {
+ return new BouncyCastleDigester(readLimit, digestString);
+ } catch (IllegalArgumentException bcException) {
+ throw new IllegalArgumentException("Tried both CommonsDigester ("+commonsException.getMessage()+
+ ") and BouncyCastleDigester ("+bcException.getMessage()+")", bcException);
+ }
+ }
}
}
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 707037b..1c9f9ab 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -18,6 +18,11 @@ package org.apache.tika.cli;
import static java.nio.charset.StandardCharsets.UTF_8;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
@@ -52,12 +57,6 @@ import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;
-import javax.xml.transform.OutputKeys;
-import javax.xml.transform.TransformerConfigurationException;
-import javax.xml.transform.sax.SAXTransformerFactory;
-import javax.xml.transform.sax.TransformerHandler;
-import javax.xml.transform.stream.StreamResult;
-
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CloseShieldInputStream;
@@ -406,9 +405,8 @@ public class TikaCLI {
} else if (arg.startsWith("--config=")) {
configure(arg.substring("--config=".length()));
} else if (arg.startsWith("--digest=")) {
- CommonsDigester.DigestAlgorithm[] algos = CommonsDigester.parse(
+ digester = new CommonsDigester(MAX_MARK,
arg.substring("--digest=".length()));
- digester = new CommonsDigester(MAX_MARK,algos);
parser = new DigestingParser(parser, digester);
} else if (arg.startsWith("-e")) {
encoding = arg.substring("-e".length());
diff --git a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
index 08b028e..0e4c8c8 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
@@ -31,8 +31,8 @@ import org.xml.sax.SAXException;
public class DigestingParser extends ParserDecorator {
/**
- * Interface for optional digester, if specified during construction.
- * See org.apache.parser.utils.CommonsDigester in tika-parsers for an
+ * Interface for digester. See
+ * org.apache.parser.utils.CommonsDigester in tika-parsers for an
* implementation.
*/
public interface Digester {
@@ -53,10 +53,15 @@ public class DigestingParser extends ParserDecorator {
* @throws IOException
*/
void digest(InputStream is, Metadata m, ParseContext parseContext) throws IOException;
-
-
};
+ /**
+ * Encodes byte array from a MessageDigest to String
+ */
+ public interface Encoder {
+ String encode(byte[] bytes);
+ }
+
private final Digester digester;
/**
* Creates a decorator for the given parser.
diff --git a/tika-core/src/main/java/org/apache/tika/parser/digest/CompositeDigester.java b/tika-core/src/main/java/org/apache/tika/parser/digest/CompositeDigester.java
new file mode 100644
index 0000000..f41d98b
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/parser/digest/CompositeDigester.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.digest;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOExceptionWithCause;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.DigestingParser;
+import org.apache.tika.parser.ParseContext;
+
+
+public class CompositeDigester implements DigestingParser.Digester {
+
+ private final DigestingParser.Digester[] digesters;
+
+ public CompositeDigester(DigestingParser.Digester ... digesters) {
+ this.digesters = digesters;
+ }
+
+ @Override
+ public void digest(InputStream is, Metadata m, ParseContext parseContext) throws IOException {
+ TemporaryResources tmp = new TemporaryResources();
+ TikaInputStream tis = TikaInputStream.get(is, tmp);
+ try {
+ for (DigestingParser.Digester digester : digesters) {
+ digester.digest(tis, m, parseContext);
+ }
+ } finally {
+ try {
+ tmp.dispose();
+ } catch (TikaException e) {
+ throw new IOExceptionWithCause(e);
+ }
+ }
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java b/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java
similarity index 50%
copy from tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java
copy to tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java
index 846ab72..40a92a6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java
@@ -1,5 +1,3 @@
-package org.apache.tika.parser.utils;
-
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -17,17 +15,16 @@ package org.apache.tika.parser.utils;
* limitations under the License.
*/
+package org.apache.tika.parser.digest;
+
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Locale;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.security.Provider;
-import org.apache.commons.codec.digest.DigestUtils;
-import org.apache.commons.io.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.IOExceptionWithCause;
import org.apache.tika.io.TemporaryResources;
@@ -36,55 +33,80 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.DigestingParser;
import org.apache.tika.parser.ParseContext;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Implementation of {@link org.apache.tika.parser.DigestingParser.Digester}
- * that relies on commons.codec.digest.DigestUtils to calculate digest hashes.
- * <p>
- * This digester tries to use the regular mark/reset protocol on the InputStream.
- * However, this wraps an internal BoundedInputStream, and if the InputStream
- * is not fully read, then this will reset the stream and
- * spool the InputStream to disk (via TikaInputStream) and then digest the file.
- * <p>
- * If a TikaInputStream is passed in and it has an underlying file that is longer
- * than the {@link #markLimit}, then this digester digests the file directly.
- */
-public class CommonsDigester implements DigestingParser.Digester {
- private static final Logger LOG = LoggerFactory.getLogger(CommonsDigester.class);
+public class InputStreamDigester implements DigestingParser.Digester {
+ private final String algorithm;
+ private final String algorithmKeyName;
+ private final DigestingParser.Encoder encoder;
+ private final int markLimit;
- public enum DigestAlgorithm {
- //those currently available in commons.digest
- MD2,
- MD5,
- SHA1,
- SHA256,
- SHA384,
- SHA512;
-
- String getMetadataKey() {
- return TikaCoreProperties.TIKA_META_PREFIX +
- "digest" + Metadata.NAMESPACE_PREFIX_DELIMITER + this.toString();
- }
+ public InputStreamDigester(int markLimit, String algorithm,
+ DigestingParser.Encoder encoder) {
+ this(markLimit, algorithm, algorithm, encoder);
}
- private final List<DigestAlgorithm> algorithms = new ArrayList<DigestAlgorithm>();
- private final int markLimit;
+ /**
+ *
+ * @param markLimit limit in bytes to allow for mark/reset. If the inputstream is longer
+ * than this limit, the stream will be reset and then spooled to a temporary file.
+ * Throws IllegalArgumentException if < 0.
+ * @param algorithm name of the digest algorithm to retrieve from the Provider
+ * @param algorithmKeyName name of the algorithm to store
+ * as part of the key in the metadata
+ * when {@link #digest(InputStream, Metadata, ParseContext)} is called
+ * @param encoder encoder to convert the byte array returned from the digester to a string
+ */
+ public InputStreamDigester(int markLimit, String algorithm, String algorithmKeyName,
+ DigestingParser.Encoder encoder) {
+ this.algorithm = algorithm;
+ this.algorithmKeyName = algorithmKeyName;
+ this.encoder = encoder;
+ this.markLimit = markLimit;
- public CommonsDigester(int markLimit, DigestAlgorithm... algorithms) {
- Collections.addAll(this.algorithms, algorithms);
if (markLimit < 0) {
throw new IllegalArgumentException("markLimit must be >= 0");
}
- this.markLimit = markLimit;
}
- @Override
- public void digest(InputStream is, Metadata m, ParseContext parseContext) throws IOException {
+ private MessageDigest newMessageDigest() {
+ try {
+ Provider provider = getProvider();
+ if (provider == null) {
+ return MessageDigest.getInstance(algorithm);
+ } else {
+ return MessageDigest.getInstance(algorithm, provider);
+ }
+ } catch (NoSuchAlgorithmException e) {
+ throw new IllegalArgumentException(e);
+ }
+ }
+ /**
+ *
+ * When subclassing this, be careful to ensure that your provider is
+ * thread-safe (unlikely) or that you return a new provider with each call.
+ *
+ *
+ * @return provider to use to get the MessageDigest from the algorithm name.
+ * Default is to return null.
+ */
+ protected Provider getProvider() {
+ return null;
+ }
+
+ /**
+ *
+ * @param is InputStream to digest. Best to use a TikaInputStream because
+ * of potential need to spool to disk. InputStream must
+ * support mark/reset.
+ * @param metadata metadata in which to store the digest information
+ * @param parseContext ParseContext -- not actually used yet, but there for future expansion
+ * @throws IOException on IO problem or IllegalArgumentException if algorithm couldn't be found
+ */
+ @Override
+ public void digest(InputStream is, Metadata metadata,
+ ParseContext parseContext) throws IOException {
TikaInputStream tis = TikaInputStream.cast(is);
if (tis != null && tis.hasFile()) {
long sz = -1;
@@ -95,145 +117,97 @@ public class CommonsDigester implements DigestingParser.Digester {
//and its size is greater than its mark limit,
//just digest the underlying file.
if (sz > markLimit) {
- digestFile(tis.getFile(), m);
+ digestFile(tis.getFile(), metadata);
return;
}
}
+
//try the usual mark/reset stuff.
//however, if you actually hit the bound,
//then stop and spool to file via TikaInputStream
SimpleBoundedInputStream bis = new SimpleBoundedInputStream(markLimit, is);
boolean finishedStream = false;
- for (DigestAlgorithm algorithm : algorithms) {
- bis.mark(markLimit + 1);
- finishedStream = digestEach(algorithm, bis, m);
- bis.reset();
- if (!finishedStream) {
- break;
- }
+ bis.mark(markLimit + 1);
+ finishedStream = digestStream(bis, metadata);
+ bis.reset();
+ if (finishedStream) {
+ return;
}
//if the stream wasn't finished -- if the stream was longer than the mark limit --
//spool to File and digest that.
- if (!finishedStream) {
- if (tis != null) {
- digestFile(tis.getFile(), m);
- } else {
- TemporaryResources tmp = new TemporaryResources();
+ if (tis != null) {
+ digestFile(tis.getFile(), metadata);
+ } else {
+ TemporaryResources tmp = new TemporaryResources();
+ try {
+ TikaInputStream tmpTikaInputStream = TikaInputStream.get(is, tmp);
+ digestFile(tmpTikaInputStream.getFile(), metadata);
+ } finally {
try {
- TikaInputStream tmpTikaInputStream = TikaInputStream.get(is, tmp);
- digestFile(tmpTikaInputStream.getFile(), m);
- } finally {
- try {
- tmp.dispose();
- } catch (TikaException e) {
- throw new IOExceptionWithCause(e);
- }
+ tmp.dispose();
+ } catch (TikaException e) {
+ throw new IOExceptionWithCause(e);
}
}
}
+ }
+
+ private String getMetadataKey() {
+ return TikaCoreProperties.TIKA_META_PREFIX +
+ "digest" + Metadata.NAMESPACE_PREFIX_DELIMITER +
+ algorithmKeyName;
}
private void digestFile(File f, Metadata m) throws IOException {
- for (DigestAlgorithm algorithm : algorithms) {
- InputStream is = new FileInputStream(f);
- try {
- digestEach(algorithm, is, m);
- } finally {
- IOUtils.closeQuietly(is);
- }
+ try (InputStream is = new FileInputStream(f)) {
+ digestStream(is, m);
}
}
/**
- * @param algorithm algo to use
- * @param is input stream to read from
- * @param metadata metadata for reporting the digest
+ * @param is input stream to read from
+ * @param metadata metadata for reporting the digest
* @return whether or not this finished the input stream
* @throws IOException
*/
- private boolean digestEach(DigestAlgorithm algorithm,
- InputStream is, Metadata metadata) throws IOException {
- String digest = null;
- try {
- switch (algorithm) {
- case MD2:
- digest = DigestUtils.md2Hex(is);
- break;
- case MD5:
- digest = DigestUtils.md5Hex(is);
- break;
- case SHA1:
- digest = DigestUtils.sha1Hex(is);
- break;
- case SHA256:
- digest = DigestUtils.sha256Hex(is);
- break;
- case SHA384:
- digest = DigestUtils.sha384Hex(is);
- break;
- case SHA512:
- digest = DigestUtils.sha512Hex(is);
- break;
- default:
- throw new IllegalArgumentException("Sorry, not aware of algorithm: " + algorithm.toString());
- }
- } catch (IOException e) {
- LOG.warn("Problem digesting", e);
- //swallow, or should we throw this?
- }
+ private boolean digestStream(InputStream is, Metadata metadata) throws IOException {
+ byte[] digestBytes;
+ MessageDigest messageDigest = newMessageDigest();
+
+ updateDigest(messageDigest, is);
+ digestBytes = messageDigest.digest();
+
if (is instanceof SimpleBoundedInputStream) {
if (((SimpleBoundedInputStream) is).hasHitBound()) {
return false;
}
}
- metadata.set(algorithm.getMetadataKey(), digest);
+ metadata.set(getMetadataKey(), encoder.encode(digestBytes));
return true;
}
+
/**
- * @param s comma-delimited (no space) list of algorithms to use: md5,sha256
- * @return
+ * Copied from commons-codec
*/
- public static DigestAlgorithm[] parse(String s) {
- assert (s != null);
-
- List<DigestAlgorithm> ret = new ArrayList<>();
- for (String algoString : s.split(",")) {
- String uc = algoString.toUpperCase(Locale.ROOT);
- if (uc.equals(DigestAlgorithm.MD2.toString())) {
- ret.add(DigestAlgorithm.MD2);
- } else if (uc.equals(DigestAlgorithm.MD5.toString())) {
- ret.add(DigestAlgorithm.MD5);
- } else if (uc.equals(DigestAlgorithm.SHA1.toString())) {
- ret.add(DigestAlgorithm.SHA1);
- } else if (uc.equals(DigestAlgorithm.SHA256.toString())) {
- ret.add(DigestAlgorithm.SHA256);
- } else if (uc.equals(DigestAlgorithm.SHA384.toString())) {
- ret.add(DigestAlgorithm.SHA384);
- } else if (uc.equals(DigestAlgorithm.SHA512.toString())) {
- ret.add(DigestAlgorithm.SHA512);
- } else {
- StringBuilder sb = new StringBuilder();
- int i = 0;
- for (DigestAlgorithm algo : DigestAlgorithm.values()) {
- if (i++ > 0) {
- sb.append(", ");
- }
- sb.append(algo.toString());
- }
- throw new IllegalArgumentException("Couldn't match " + s + " with any of: " + sb.toString());
- }
+ private static MessageDigest updateDigest(MessageDigest digest, InputStream data) throws IOException {
+ byte[] buffer = new byte[1024];
+
+ for (int read = data.read(buffer, 0, 1024); read > -1; read = data.read(buffer, 0, 1024)) {
+ digest.update(buffer, 0, read);
}
- return ret.toArray(new DigestAlgorithm[ret.size()]);
+
+ return digest;
}
+
/**
* Very slight modification of Commons' BoundedInputStream
* so that we can figure out if this hit the bound or not.
*/
- private class SimpleBoundedInputStream extends InputStream {
+ private static class SimpleBoundedInputStream extends InputStream {
private final static int EOF = -1;
private final long max;
private final InputStream in;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/utils/BouncyCastleDigester.java b/tika-parsers/src/main/java/org/apache/tika/parser/utils/BouncyCastleDigester.java
new file mode 100644
index 0000000..2b6529c
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/utils/BouncyCastleDigester.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.utils;
+
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.security.Provider;
+
+import org.apache.commons.codec.binary.Base32;
+import org.apache.tika.parser.DigestingParser;
+import org.apache.tika.parser.digest.CompositeDigester;
+import org.apache.tika.parser.digest.InputStreamDigester;
+import org.bouncycastle.jce.provider.BouncyCastleProvider;
+import org.bouncycastle.util.encoders.Hex;
+
+/**
+ * Digester that relies on BouncyCastle for MessageDigest implementations.
+ *
+ */
+public class BouncyCastleDigester extends CompositeDigester {
+
+ /**
+ * Include a string representing the comma-separated algorithms to run: e.g. "md5,sha1".
+ * If you want base 32 encoding instead of hexadecimal, add ":32" to the algorithm, e.g. "md5,sha1:32"
+ * <p/>
+ * Will throw an IllegalArgumentException if an algorithm isn't supported
+ * @param markLimit
+ * @param algorithmString
+ */
+ public BouncyCastleDigester(int markLimit, String algorithmString) {
+ super(buildDigesters(markLimit, algorithmString));
+ }
+
+ private static DigestingParser.Digester[] buildDigesters(int markLimit, String digesterDef) {
+ String[] digests = digesterDef.split(",");
+ DigestingParser.Digester[] digesters = new DigestingParser.Digester[digests.length];
+ int i = 0;
+ for (String digest : digests) {
+ String[] parts = digest.split(":");
+ DigestingParser.Encoder encoder = null;
+ if (parts.length > 1) {
+ if (parts[1].equals("16")) {
+ encoder = new HexEncoder();
+ } else if (parts[1].equals("32")) {
+ encoder = new Base32Encoder();
+ } else {
+ throw new IllegalArgumentException("Value must be '16' or '32'");
+ }
+ } else {
+ encoder = new HexEncoder();
+ }
+ digesters[i++] = new BCInputStreamDigester(markLimit, parts[0], encoder);
+ }
+ return digesters;
+ }
+
+ private static class HexEncoder implements DigestingParser.Encoder {
+ @Override
+ public String encode(byte[] bytes) {
+ return Hex.toHexString(bytes);
+ }
+ }
+
+ private static class Base32Encoder implements DigestingParser.Encoder {
+ @Override
+ public String encode(byte[] bytes) {
+ return new Base32().encodeToString(bytes);
+ }
+ }
+
+ private static class BCInputStreamDigester extends InputStreamDigester {
+
+ public BCInputStreamDigester(int markLimit, String algorithm, DigestingParser.Encoder encoder) {
+ super(markLimit, algorithm, encoder);
+ try {
+ MessageDigest.getInstance(algorithm, getProvider());
+ } catch (NoSuchAlgorithmException e) {
+ throw new IllegalArgumentException(e);
+ }
+ }
+
+ @Override
+ protected Provider getProvider() {
+ return new BouncyCastleProvider();
+ }
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java b/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java
index 846ab72..a467651 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java
@@ -1,5 +1,3 @@
-package org.apache.tika.parser.utils;
-
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -16,28 +14,19 @@ package org.apache.tika.parser.utils;
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+package org.apache.tika.parser.utils;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
import java.util.ArrayList;
-import java.util.Collections;
import java.util.List;
import java.util.Locale;
-import org.apache.commons.codec.digest.DigestUtils;
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.IOExceptionWithCause;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
+import org.apache.commons.codec.binary.Base32;
+import org.apache.commons.codec.binary.Hex;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.DigestingParser;
-import org.apache.tika.parser.ParseContext;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import org.apache.tika.parser.digest.CompositeDigester;
+import org.apache.tika.parser.digest.InputStreamDigester;
/**
* Implementation of {@link org.apache.tika.parser.DigestingParser.Digester}
@@ -47,280 +36,151 @@ import org.slf4j.LoggerFactory;
* However, this wraps an internal BoundedInputStream, and if the InputStream
* is not fully read, then this will reset the stream and
* spool the InputStream to disk (via TikaInputStream) and then digest the file.
- * <p>
- * If a TikaInputStream is passed in and it has an underlying file that is longer
- * than the {@link #markLimit}, then this digester digests the file directly.
*/
-public class CommonsDigester implements DigestingParser.Digester {
-
- private static final Logger LOG = LoggerFactory.getLogger(CommonsDigester.class);
-
+public class CommonsDigester extends CompositeDigester {
public enum DigestAlgorithm {
//those currently available in commons.digest
- MD2,
- MD5,
- SHA1,
- SHA256,
- SHA384,
- SHA512;
+ MD2("MD2"),
+ MD5("MD5"),
+ SHA1("SHA-1"),
+ SHA256("SHA-256"),
+ SHA384("SHA-384"),
+ SHA512("SHA-512");
- String getMetadataKey() {
- return TikaCoreProperties.TIKA_META_PREFIX +
- "digest" + Metadata.NAMESPACE_PREFIX_DELIMITER + this.toString();
- }
- }
+ private final String javaName;
- private final List<DigestAlgorithm> algorithms = new ArrayList<DigestAlgorithm>();
- private final int markLimit;
-
- public CommonsDigester(int markLimit, DigestAlgorithm... algorithms) {
- Collections.addAll(this.algorithms, algorithms);
- if (markLimit < 0) {
- throw new IllegalArgumentException("markLimit must be >= 0");
+ DigestAlgorithm(String javaName) {
+ this.javaName = javaName;
}
- this.markLimit = markLimit;
- }
-
- @Override
- public void digest(InputStream is, Metadata m, ParseContext parseContext) throws IOException {
- TikaInputStream tis = TikaInputStream.cast(is);
- if (tis != null && tis.hasFile()) {
- long sz = -1;
- if (tis.hasFile()) {
- sz = tis.getLength();
- }
- //if the inputstream has a file,
- //and its size is greater than its mark limit,
- //just digest the underlying file.
- if (sz > markLimit) {
- digestFile(tis.getFile(), m);
- return;
- }
+ String getJavaName() {
+ return javaName;
}
-
- //try the usual mark/reset stuff.
- //however, if you actually hit the bound,
- //then stop and spool to file via TikaInputStream
- SimpleBoundedInputStream bis = new SimpleBoundedInputStream(markLimit, is);
- boolean finishedStream = false;
- for (DigestAlgorithm algorithm : algorithms) {
- bis.mark(markLimit + 1);
- finishedStream = digestEach(algorithm, bis, m);
- bis.reset();
- if (!finishedStream) {
- break;
- }
- }
- //if the stream wasn't finished -- if the stream was longer than the mark limit --
- //spool to File and digest that.
- if (!finishedStream) {
- if (tis != null) {
- digestFile(tis.getFile(), m);
- } else {
- TemporaryResources tmp = new TemporaryResources();
- try {
- TikaInputStream tmpTikaInputStream = TikaInputStream.get(is, tmp);
- digestFile(tmpTikaInputStream.getFile(), m);
- } finally {
- try {
- tmp.dispose();
- } catch (TikaException e) {
- throw new IOExceptionWithCause(e);
- }
- }
- }
+ String getMetadataKey() {
+ return TikaCoreProperties.TIKA_META_PREFIX +
+ "digest" + Metadata.NAMESPACE_PREFIX_DELIMITER + this.toString();
}
-
}
- private void digestFile(File f, Metadata m) throws IOException {
- for (DigestAlgorithm algorithm : algorithms) {
- InputStream is = new FileInputStream(f);
- try {
- digestEach(algorithm, is, m);
- } finally {
- IOUtils.closeQuietly(is);
- }
- }
+ /**
+ * Include a string representing the comma-separated algorithms to run: e.g. "md5,sha1".
+ * If you want base 32 encoding instead of hexadecimal, add ":32" to the algorithm, e.g. "md5,sha1:32"
+ * <p/>
+ * Will throw an IllegalArgumentException if an algorithm isn't supported
+ * @param markLimit
+ * @param algorithmString
+ */
+ public CommonsDigester(int markLimit, String algorithmString) {
+ super(buildDigesters(markLimit, algorithmString));
}
/**
- * @param algorithm algo to use
- * @param is input stream to read from
- * @param metadata metadata for reporting the digest
- * @return whether or not this finished the input stream
- * @throws IOException
+ *
+ * @param markLimit limit for mark/reset; after this limit is hit, the
+ * stream is reset and spooled to disk
+ * @param algorithms algorithms to run
+ * @deprecated use {@link #CommonsDigester(int, String)}
*/
- private boolean digestEach(DigestAlgorithm algorithm,
- InputStream is, Metadata metadata) throws IOException {
- String digest = null;
- try {
- switch (algorithm) {
- case MD2:
- digest = DigestUtils.md2Hex(is);
- break;
- case MD5:
- digest = DigestUtils.md5Hex(is);
- break;
- case SHA1:
- digest = DigestUtils.sha1Hex(is);
- break;
- case SHA256:
- digest = DigestUtils.sha256Hex(is);
- break;
- case SHA384:
- digest = DigestUtils.sha384Hex(is);
- break;
- case SHA512:
- digest = DigestUtils.sha512Hex(is);
- break;
- default:
- throw new IllegalArgumentException("Sorry, not aware of algorithm: " + algorithm.toString());
- }
- } catch (IOException e) {
- LOG.warn("Problem digesting", e);
- //swallow, or should we throw this?
- }
- if (is instanceof SimpleBoundedInputStream) {
- if (((SimpleBoundedInputStream) is).hasHitBound()) {
- return false;
- }
+ public CommonsDigester(int markLimit, DigestAlgorithm... algorithms) {
+ super(buildDigesters(markLimit, algorithms));
+ }
+
+ private static DigestingParser.Digester[] buildDigesters(int markLimit, DigestAlgorithm[] algorithms) {
+ DigestingParser.Digester[] digesters = new DigestingParser.Digester[algorithms.length];
+ int i = 0;
+ for (DigestAlgorithm algorithm : algorithms) {
+ digesters[i++] = new InputStreamDigester(markLimit, algorithm.getJavaName(), algorithm.name(),
+ new HexEncoder());
}
- metadata.set(algorithm.getMetadataKey(), digest);
- return true;
+ return digesters;
}
/**
- * @param s comma-delimited (no space) list of algorithms to use: md5,sha256
+ * This returns digest algorithms only. It does not understand the encoding
+ * syntax, e.g. "MD5:32" (base 32 encoding of MD5). To parse
+ * those, see {@link #CommonsDigester(int, String)}.
+ *
+ * @deprecated use the {@link #CommonsDigester(int, String)} instead
+ * @param s comma-delimited (no space) list of algorithms to use: md5,sha256.
* @return
+ *
*/
+ @Deprecated
public static DigestAlgorithm[] parse(String s) {
assert (s != null);
List<DigestAlgorithm> ret = new ArrayList<>();
for (String algoString : s.split(",")) {
- String uc = algoString.toUpperCase(Locale.ROOT);
- if (uc.equals(DigestAlgorithm.MD2.toString())) {
- ret.add(DigestAlgorithm.MD2);
- } else if (uc.equals(DigestAlgorithm.MD5.toString())) {
- ret.add(DigestAlgorithm.MD5);
- } else if (uc.equals(DigestAlgorithm.SHA1.toString())) {
- ret.add(DigestAlgorithm.SHA1);
- } else if (uc.equals(DigestAlgorithm.SHA256.toString())) {
- ret.add(DigestAlgorithm.SHA256);
- } else if (uc.equals(DigestAlgorithm.SHA384.toString())) {
- ret.add(DigestAlgorithm.SHA384);
- } else if (uc.equals(DigestAlgorithm.SHA512.toString())) {
- ret.add(DigestAlgorithm.SHA512);
- } else {
- StringBuilder sb = new StringBuilder();
- int i = 0;
- for (DigestAlgorithm algo : DigestAlgorithm.values()) {
- if (i++ > 0) {
- sb.append(", ");
- }
- sb.append(algo.toString());
- }
- throw new IllegalArgumentException("Couldn't match " + s + " with any of: " + sb.toString());
- }
+ ret.add(getDigestAlgorithm(algoString));
}
return ret.toArray(new DigestAlgorithm[ret.size()]);
}
- /**
- * Very slight modification of Commons' BoundedInputStream
- * so that we can figure out if this hit the bound or not.
- */
- private class SimpleBoundedInputStream extends InputStream {
- private final static int EOF = -1;
- private final long max;
- private final InputStream in;
- private long pos;
-
- private SimpleBoundedInputStream(long max, InputStream in) {
- this.max = max;
- this.in = in;
- }
-
- @Override
- public int read() throws IOException {
- if (max >= 0 && pos >= max) {
- return EOF;
+ private static DigestAlgorithm getDigestAlgorithm(String algoString) {
+ String uc = algoString.toUpperCase(Locale.ROOT);
+ if (uc.equals(DigestAlgorithm.MD2.toString())) {
+ return DigestAlgorithm.MD2;
+ } else if (uc.equals(DigestAlgorithm.MD5.toString())) {
+ return DigestAlgorithm.MD5;
+ } else if (uc.equals(DigestAlgorithm.SHA1.toString())) {
+ return DigestAlgorithm.SHA1;
+ } else if (uc.equals(DigestAlgorithm.SHA256.toString())) {
+ return DigestAlgorithm.SHA256;
+ } else if (uc.equals(DigestAlgorithm.SHA384.toString())) {
+ return DigestAlgorithm.SHA384;
+ } else if (uc.equals(DigestAlgorithm.SHA512.toString())) {
+ return DigestAlgorithm.SHA512;
+ } else {
+ StringBuilder sb = new StringBuilder();
+ int i = 0;
+ for (DigestAlgorithm algo : DigestAlgorithm.values()) {
+ if (i++ > 0) {
+ sb.append(", ");
+ }
+ sb.append(algo.toString());
}
- final int result = in.read();
- pos++;
- return result;
+ throw new IllegalArgumentException("Couldn't match " + algoString + " with any of: " + sb.toString());
}
+ }
- /**
- * Invokes the delegate's <code>read(byte[])</code> method.
- *
- * @param b the buffer to read the bytes into
- * @return the number of bytes read or -1 if the end of stream or
- * the limit has been reached.
- * @throws IOException if an I/O error occurs
- */
- @Override
- public int read(final byte[] b) throws IOException {
- return this.read(b, 0, b.length);
- }
-
- /**
- * Invokes the delegate's <code>read(byte[], int, int)</code> method.
- *
- * @param b the buffer to read the bytes into
- * @param off The start offset
- * @param len The number of bytes to read
- * @return the number of bytes read or -1 if the end of stream or
- * the limit has been reached.
- * @throws IOException if an I/O error occurs
- */
- @Override
- public int read(final byte[] b, final int off, final int len) throws IOException {
- if (max >= 0 && pos >= max) {
- return EOF;
- }
- final long maxRead = max >= 0 ? Math.min(len, max - pos) : len;
- final int bytesRead = in.read(b, off, (int) maxRead);
-
- if (bytesRead == EOF) {
- return EOF;
+ private static DigestingParser.Digester[] buildDigesters(int markLimit, String digesterDef) {
+ String[] digests = digesterDef.split(",");
+ DigestingParser.Digester[] digesters = new DigestingParser.Digester[digests.length];
+ int i = 0;
+ for (String digest : digests) {
+ String[] parts = digest.split(":");
+ DigestingParser.Encoder encoder = null;
+ if (parts.length > 1) {
+ if (parts[1].equals("16")) {
+ encoder = new HexEncoder();
+ } else if (parts[1].equals("32")) {
+ encoder = new Base32Encoder();
+ } else {
+ throw new IllegalArgumentException("Value must be '16' or '32'");
+ }
+ } else {
+ encoder = new HexEncoder();
}
-
- pos += bytesRead;
- return bytesRead;
+ DigestAlgorithm digestAlgorithm = getDigestAlgorithm(parts[0]);
+ digesters[i++] = new InputStreamDigester(markLimit, digestAlgorithm.getJavaName(),
+ digestAlgorithm.name(), encoder);
}
+ return digesters;
+ }
- /**
- * Invokes the delegate's <code>skip(long)</code> method.
- *
- * @param n the number of bytes to skip
- * @return the actual number of bytes skipped
- * @throws IOException if an I/O error occurs
- */
- @Override
- public long skip(final long n) throws IOException {
- final long toSkip = max >= 0 ? Math.min(n, max - pos) : n;
- final long skippedBytes = in.skip(toSkip);
- pos += skippedBytes;
- return skippedBytes;
- }
+ private static class HexEncoder implements DigestingParser.Encoder {
@Override
- public void reset() throws IOException {
- in.reset();
- pos = 0;
+ public String encode(byte[] bytes) {
+ return Hex.encodeHexString(bytes);
}
+ }
+ private static class Base32Encoder implements DigestingParser.Encoder {
@Override
- public void mark(int readLimit) {
- in.mark(readLimit);
- }
-
- public boolean hasHitBound() {
- return pos >= max;
+ public String encode(byte[] bytes) {
+ return new Base32().encodeToString(bytes);
}
}
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/BouncyCastleDigestingParserTest.java
similarity index 52%
copy from tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
copy to tika-parsers/src/test/java/org/apache/tika/parser/BouncyCastleDigestingParserTest.java
index 8b198a3..1d2861b 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/BouncyCastleDigestingParserTest.java
@@ -19,7 +19,6 @@ package org.apache.tika.parser;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
@@ -34,15 +33,16 @@ import java.util.Map;
import java.util.Random;
import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.commons.lang.StringUtils;
import org.apache.tika.TikaTest;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.utils.CommonsDigester;
+import org.apache.tika.parser.utils.BouncyCastleDigester;
import org.junit.Test;
-public class DigestingParserTest extends TikaTest {
+public class BouncyCastleDigestingParserTest extends TikaTest {
private final static String P = TikaCoreProperties.TIKA_META_PREFIX+
"digest"+Metadata.NAMESPACE_PREFIX_DELIMITER;
@@ -56,46 +56,69 @@ public class DigestingParserTest extends TikaTest {
@Test
public void testBasic() throws Exception {
- Map<CommonsDigester.DigestAlgorithm, String> expected =
+ Map<String, String> expected =
new HashMap<>();
- expected.put(CommonsDigester.DigestAlgorithm.MD2,"d768c8e27b0b52c6eaabfaa7122d1d4f");
- expected.put(CommonsDigester.DigestAlgorithm.MD5,"59f626e09a8c16ab6dbc2800c685f772");
- expected.put(CommonsDigester.DigestAlgorithm.SHA1,"7a1f001d163ac90d8ea54c050faf5a38079788a6");
- expected.put(CommonsDigester.DigestAlgorithm.SHA256,"c4b7fab030a8b6a9d6691f6699ac8e6f" +
- "82bc53764a0f1430d134ae3b70c32654");
- expected.put(CommonsDigester.DigestAlgorithm.SHA384,"ebe368b9326fef44408290724d187553"+
- "8b8a6923fdf251ddab72c6e4b5d54160" +
- "9db917ba4260d1767995a844d8d654df");
- expected.put(CommonsDigester.DigestAlgorithm.SHA512,"ee46d973ee1852c018580c242955974d"+
- "da4c21f36b54d7acd06fcf68e974663b"+
- "fed1d256875be58d22beacf178154cc3"+
- "a1178cb73443deaa53aa0840324708bb");
+ expected.put("MD2", "d768c8e27b0b52c6eaabfaa7122d1d4f");
+ expected.put("MD5", "59f626e09a8c16ab6dbc2800c685f772");
+ expected.put("SHA1", "7a1f001d163ac90d8ea54c050faf5a38079788a6");
+ expected.put("SHA256", "c4b7fab030a8b6a9d6691f6699ac8e6f" +
+ "82bc53764a0f1430d134ae3b70c32654");
+ expected.put("SHA384", "ebe368b9326fef44408290724d187553" +
+ "8b8a6923fdf251ddab72c6e4b5d54160" +
+ "9db917ba4260d1767995a844d8d654df");
+ expected.put("SHA512", "ee46d973ee1852c018580c242955974d" +
+ "da4c21f36b54d7acd06fcf68e974663b" +
+ "fed1d256875be58d22beacf178154cc3" +
+ "a1178cb73443deaa53aa0840324708bb");
//test each one
- for (CommonsDigester.DigestAlgorithm algo : CommonsDigester.DigestAlgorithm.values()) {
+ for (String algo : expected.keySet()) {
Metadata m = new Metadata();
XMLResult xml = getXML("test_recursive_embedded.docx",
- new DigestingParser(p, new CommonsDigester(UNLIMITED, algo)), m);
- assertEquals(algo.toString(), expected.get(algo), m.get(P + algo.toString()));
+ new DigestingParser(p, new BouncyCastleDigester(UNLIMITED, algo)), m);
+ assertEquals(algo, expected.get(algo), m.get(P + algo));
}
+ }
+
+ @Test
+ public void testCommaSeparated() throws Exception {
+ Map<String, String> expected =
+ new HashMap<>();
+ expected.put("MD2", "d768c8e27b0b52c6eaabfaa7122d1d4f");
+ expected.put("MD5", "59f626e09a8c16ab6dbc2800c685f772");
+ expected.put("SHA1", "7a1f001d163ac90d8ea54c050faf5a38079788a6");
+ expected.put("SHA256", "c4b7fab030a8b6a9d6691f6699ac8e6f" +
+ "82bc53764a0f1430d134ae3b70c32654");
+ expected.put("SHA384", "ebe368b9326fef44408290724d187553" +
+ "8b8a6923fdf251ddab72c6e4b5d54160" +
+ "9db917ba4260d1767995a844d8d654df");
+ expected.put("SHA512",
+ "ee46d973ee1852c018580c242955974d" +
+ "da4c21f36b54d7acd06fcf68e974663b" +
+ "fed1d256875be58d22beacf178154cc3" +
+ "a1178cb73443deaa53aa0840324708bb");
+ expected.put("SHA3-512",
+ "04337f667a250348a1acb992863b3ddc"+
+ "eab38365c206c18d356d2b31675ad669"+
+ "5fb5497f4e79b11640aefbb8042a5dbb"+
+ "7ec6c2c6c1b6e19210453591c52cb6eb");
+ expected.put("SHA1", "PIPQAHIWHLEQ3DVFJQCQ7L22HADZPCFG");
//test comma separated
- CommonsDigester.DigestAlgorithm[] algos = CommonsDigester.parse("md5,sha256,sha384,sha512");
Metadata m = new Metadata();
XMLResult xml = getXML("test_recursive_embedded.docx",
- new DigestingParser(p, new CommonsDigester(UNLIMITED, algos)), m);
- for (CommonsDigester.DigestAlgorithm algo : new CommonsDigester.DigestAlgorithm[]{
- CommonsDigester.DigestAlgorithm.MD5,
- CommonsDigester.DigestAlgorithm.SHA256,
- CommonsDigester.DigestAlgorithm.SHA384,
- CommonsDigester.DigestAlgorithm.SHA512}) {
- assertEquals(algo.toString(), expected.get(algo), m.get(P + algo.toString()));
+ new DigestingParser(p, new BouncyCastleDigester(UNLIMITED,
+ "MD5,SHA256,SHA384,SHA512,SHA3-512,SHA1:32")), m);
+ for (String algo : new String[]{
+ "MD5", "SHA256", "SHA384", "SHA512", "SHA3-512",
+ "SHA1"
+ }) {
+ assertEquals(algo, expected.get(algo), m.get(P + algo));
}
- assertNull(m.get(P+CommonsDigester.DigestAlgorithm.MD2.toString()));
- assertNull(m.get(P+CommonsDigester.DigestAlgorithm.SHA1.toString()));
+ assertNull(m.get(P+"MD2"));
}
@@ -104,21 +127,23 @@ public class DigestingParserTest extends TikaTest {
String expectedMD5 = "59f626e09a8c16ab6dbc2800c685f772";
Metadata m = new Metadata();
XMLResult xml = getXML("test_recursive_embedded.docx",
- new DigestingParser(p, new CommonsDigester(100, CommonsDigester.DigestAlgorithm.MD5)), m);
+ new DigestingParser(p, new BouncyCastleDigester(100, "MD5")), m);
assertEquals(expectedMD5, m.get(P+"MD5"));
}
- @Test
+ @Test(expected = IllegalArgumentException.class)
public void testNegativeMaxMarkLength() throws Exception {
- Metadata m = new Metadata();
- boolean ex = false;
- try {
- XMLResult xml = getXML("test_recursive_embedded.docx",
- new DigestingParser(p, new CommonsDigester(-1, CommonsDigester.DigestAlgorithm.MD5)), m);
- } catch (IllegalArgumentException e) {
- ex = true;
- }
- assertTrue("Exception not thrown", ex);
+ getXML("test_recursive_embedded.docx",
+ new DigestingParser(p,
+ new BouncyCastleDigester(-1, "MD5")));
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void testUnrecognizedEncodingOptions() throws Exception {
+ getXML("test_recursive_embedded.docx",
+ new DigestingParser(p,
+ new BouncyCastleDigester(100000,
+ "MD5:33")));
}
@Test
@@ -169,85 +194,74 @@ public class DigestingParserTest extends TikaTest {
os.close();
Metadata truth = new Metadata();
- addTruth(tmp, CommonsDigester.DigestAlgorithm.MD5, truth);
- addTruth(tmp, CommonsDigester.DigestAlgorithm.SHA1, truth);
- addTruth(tmp, CommonsDigester.DigestAlgorithm.SHA512, truth);
+ addTruth(tmp, "MD5", truth);
+ addTruth(tmp, "SHA1", truth);
+ addTruth(tmp, "SHA512", truth);
checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream,
- CommonsDigester.DigestAlgorithm.SHA512,
- CommonsDigester.DigestAlgorithm.SHA1,
- CommonsDigester.DigestAlgorithm.MD5);
-
+ "SHA512",
+ "SHA1", "MD5");
checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream,
- CommonsDigester.DigestAlgorithm.MD5,
- CommonsDigester.DigestAlgorithm.SHA1);
+ "MD5", "SHA1");
checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream,
- CommonsDigester.DigestAlgorithm.SHA1,
- CommonsDigester.DigestAlgorithm.SHA512,
- CommonsDigester.DigestAlgorithm.MD5);
-
+ "SHA1", "SHA512", "MD5");
checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream,
- CommonsDigester.DigestAlgorithm.SHA1);
+ "SHA1");
checkMulti(truth, tmp, fileLength, markLimit, useTikaInputStream,
- CommonsDigester.DigestAlgorithm.MD5);
+ "MD5");
}
private void checkMulti(Metadata truth, Path tmp,
int fileLength, int markLimit,
- boolean useTikaInputStream, CommonsDigester.DigestAlgorithm... algos) throws IOException {
+ boolean useTikaInputStream,
+ String... algos) throws IOException {
Metadata result = new Metadata();
- CommonsDigester digester = new CommonsDigester(markLimit, algos);
+ BouncyCastleDigester digester = new BouncyCastleDigester(markLimit,
+ StringUtils.join(algos, ","));
try (InputStream is = useTikaInputStream ? TikaInputStream.get(tmp) :
new BufferedInputStream(Files.newInputStream(tmp))) {
digester.digest(is, result, new ParseContext());
}
- for (CommonsDigester.DigestAlgorithm algo : algos) {
- String truthValue = truth.get(P+algo.name());
- String resultValue = result.get(P+algo.name());
+ for (String algo : algos) {
+ String truthValue = truth.get(P+algo);
+ String resultValue = result.get(P+algo);
assertNotNull("truth", truthValue);
assertNotNull("result (fileLength="+fileLength+", markLimit="+markLimit+")",
resultValue);
-
assertEquals("fileLength("+fileLength+") markLimit("+
- markLimit+") useTikaInputStream("+useTikaInputStream+")"+
- "algorithm("+algo.name()+") seed("+SEED+")",
+ markLimit+") useTikaInputStream("+useTikaInputStream+") "+
+ "algorithm("+algo+") seed("+SEED+")",
truthValue, resultValue);
}
}
- private void addTruth(Path tmp, CommonsDigester.DigestAlgorithm algo, Metadata truth) throws IOException {
+ private void addTruth(Path tmp, String algo, Metadata truth) throws IOException {
String digest = null;
+ //for now, rely on CommonsDigest for truth
try (InputStream is = Files.newInputStream(tmp)) {
- switch (algo) {
- case MD2:
- digest = DigestUtils.md2Hex(is);
- break;
- case MD5:
- digest = DigestUtils.md5Hex(is);
- break;
- case SHA1:
- digest = DigestUtils.sha1Hex(is);
- break;
- case SHA256:
- digest = DigestUtils.sha256Hex(is);
- break;
- case SHA384:
- digest = DigestUtils.sha384Hex(is);
- break;
- case SHA512:
- digest = DigestUtils.sha512Hex(is);
- break;
- default:
- throw new IllegalArgumentException("Sorry, not aware of algorithm: " + algo.toString());
+ if ("MD2".equals(algo)) {
+ digest = DigestUtils.md2Hex(is);
+ } else if ("MD5".equals(algo)) {
+ digest = DigestUtils.md5Hex(is);
+ } else if ("SHA1".equals(algo)) {
+ digest = DigestUtils.sha1Hex(is);
+ } else if ("SHA256".equals(algo)) {
+ digest = DigestUtils.sha256Hex(is);
+ } else if ("SHA384".equals(algo)) {
+ digest = DigestUtils.sha384Hex(is);
+ } else if ("SHA512".equals(algo)) {
+ digest = DigestUtils.sha512Hex(is);
+ } else {
+ throw new IllegalArgumentException("Sorry, not aware of algorithm: " + algo);
}
}
- truth.set(P+algo.name(), digest);
+ truth.set(P+algo, digest);
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
index 8b198a3..931718e 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
@@ -59,18 +59,18 @@ public class DigestingParserTest extends TikaTest {
Map<CommonsDigester.DigestAlgorithm, String> expected =
new HashMap<>();
- expected.put(CommonsDigester.DigestAlgorithm.MD2,"d768c8e27b0b52c6eaabfaa7122d1d4f");
- expected.put(CommonsDigester.DigestAlgorithm.MD5,"59f626e09a8c16ab6dbc2800c685f772");
- expected.put(CommonsDigester.DigestAlgorithm.SHA1,"7a1f001d163ac90d8ea54c050faf5a38079788a6");
- expected.put(CommonsDigester.DigestAlgorithm.SHA256,"c4b7fab030a8b6a9d6691f6699ac8e6f" +
- "82bc53764a0f1430d134ae3b70c32654");
- expected.put(CommonsDigester.DigestAlgorithm.SHA384,"ebe368b9326fef44408290724d187553"+
- "8b8a6923fdf251ddab72c6e4b5d54160" +
- "9db917ba4260d1767995a844d8d654df");
- expected.put(CommonsDigester.DigestAlgorithm.SHA512,"ee46d973ee1852c018580c242955974d"+
- "da4c21f36b54d7acd06fcf68e974663b"+
- "fed1d256875be58d22beacf178154cc3"+
- "a1178cb73443deaa53aa0840324708bb");
+ expected.put(CommonsDigester.DigestAlgorithm.MD2, "d768c8e27b0b52c6eaabfaa7122d1d4f");
+ expected.put(CommonsDigester.DigestAlgorithm.MD5, "59f626e09a8c16ab6dbc2800c685f772");
+ expected.put(CommonsDigester.DigestAlgorithm.SHA1, "7a1f001d163ac90d8ea54c050faf5a38079788a6");
+ expected.put(CommonsDigester.DigestAlgorithm.SHA256, "c4b7fab030a8b6a9d6691f6699ac8e6f" +
+ "82bc53764a0f1430d134ae3b70c32654");
+ expected.put(CommonsDigester.DigestAlgorithm.SHA384, "ebe368b9326fef44408290724d187553" +
+ "8b8a6923fdf251ddab72c6e4b5d54160" +
+ "9db917ba4260d1767995a844d8d654df");
+ expected.put(CommonsDigester.DigestAlgorithm.SHA512, "ee46d973ee1852c018580c242955974d" +
+ "da4c21f36b54d7acd06fcf68e974663b" +
+ "fed1d256875be58d22beacf178154cc3" +
+ "a1178cb73443deaa53aa0840324708bb");
//test each one
for (CommonsDigester.DigestAlgorithm algo : CommonsDigester.DigestAlgorithm.values()) {
@@ -80,14 +80,35 @@ public class DigestingParserTest extends TikaTest {
assertEquals(algo.toString(), expected.get(algo), m.get(P + algo.toString()));
}
+ }
+
+ @Test
+ public void testCommaSeparated() throws Exception {
+ Map<CommonsDigester.DigestAlgorithm, String> expected =
+ new HashMap<>();
+
+
+ expected.put(CommonsDigester.DigestAlgorithm.MD2, "d768c8e27b0b52c6eaabfaa7122d1d4f");
+ expected.put(CommonsDigester.DigestAlgorithm.MD5, "59f626e09a8c16ab6dbc2800c685f772");
+ expected.put(CommonsDigester.DigestAlgorithm.SHA1, "PIPQAHIWHLEQ3DVFJQCQ7L22HADZPCFG");
+ expected.put(CommonsDigester.DigestAlgorithm.SHA256, "c4b7fab030a8b6a9d6691f6699ac8e6f" +
+ "82bc53764a0f1430d134ae3b70c32654");
+ expected.put(CommonsDigester.DigestAlgorithm.SHA384, "ebe368b9326fef44408290724d187553" +
+ "8b8a6923fdf251ddab72c6e4b5d54160" +
+ "9db917ba4260d1767995a844d8d654df");
+ expected.put(CommonsDigester.DigestAlgorithm.SHA512, "ee46d973ee1852c018580c242955974d" +
+ "da4c21f36b54d7acd06fcf68e974663b" +
+ "fed1d256875be58d22beacf178154cc3" +
+ "a1178cb73443deaa53aa0840324708bb");
//test comma separated
- CommonsDigester.DigestAlgorithm[] algos = CommonsDigester.parse("md5,sha256,sha384,sha512");
Metadata m = new Metadata();
XMLResult xml = getXML("test_recursive_embedded.docx",
- new DigestingParser(p, new CommonsDigester(UNLIMITED, algos)), m);
+ new DigestingParser(p, new CommonsDigester(UNLIMITED,
+ "md5,sha256,sha384,sha512,sha1:32")), m);
for (CommonsDigester.DigestAlgorithm algo : new CommonsDigester.DigestAlgorithm[]{
CommonsDigester.DigestAlgorithm.MD5,
+ CommonsDigester.DigestAlgorithm.SHA1,
CommonsDigester.DigestAlgorithm.SHA256,
CommonsDigester.DigestAlgorithm.SHA384,
CommonsDigester.DigestAlgorithm.SHA512}) {
@@ -95,8 +116,6 @@ public class DigestingParserTest extends TikaTest {
}
assertNull(m.get(P+CommonsDigester.DigestAlgorithm.MD2.toString()));
- assertNull(m.get(P+CommonsDigester.DigestAlgorithm.SHA1.toString()));
-
}
@Test
@@ -212,9 +231,8 @@ public class DigestingParserTest extends TikaTest {
assertNotNull("truth", truthValue);
assertNotNull("result (fileLength="+fileLength+", markLimit="+markLimit+")",
resultValue);
-
assertEquals("fileLength("+fileLength+") markLimit("+
- markLimit+") useTikaInputStream("+useTikaInputStream+")"+
+ markLimit+") useTikaInputStream("+useTikaInputStream+") "+
"algorithm("+algo.name()+") seed("+SEED+")",
truthValue, resultValue);
}
diff --git a/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java b/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
index b5d94d9..03d582e 100644
--- a/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
+++ b/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
@@ -37,6 +37,7 @@ import org.apache.cxf.rs.security.cors.CrossOriginResourceSharingFilter;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.parser.DigestingParser;
+import org.apache.tika.parser.utils.BouncyCastleDigester;
import org.apache.tika.parser.utils.CommonsDigester;
import org.apache.tika.server.resource.DetectorResource;
import org.apache.tika.server.resource.LanguageResource;
@@ -80,7 +81,7 @@ public class TikaServerCli {
options.addOption("h", "host", true, "host name (default = " + DEFAULT_HOST + ", use * for all)");
options.addOption("p", "port", true, "listen port (default = " + DEFAULT_PORT + ')');
options.addOption("c", "config", true, "Tika Configuration file to override default config with.");
- options.addOption("d", "digest", true, "include digest in metadata, e.g. md5,sha256");
+ options.addOption("d", "digest", true, "include digest in metadata, e.g. md5,sha1:32,sha256");
options.addOption("dml", "digestMarkLimit", true, "max number of bytes to mark on stream for digest");
options.addOption("l", "log", true, "request URI log level ('debug' or 'info')");
options.addOption("s", "includeStack", false, "whether or not to return a stack trace\nif there is an exception during 'parse'");
@@ -168,8 +169,16 @@ public class TikaServerCli {
throw new RuntimeException("Must have parseable int after digestMarkLimit(dml): "+dmlS);
}
}
- digester = new CommonsDigester(digestMarkLimit,
- CommonsDigester.parse(line.getOptionValue("digest")));
+ try {
+ digester = new CommonsDigester(digestMarkLimit, line.getOptionValue("digest"));
+ } catch (IllegalArgumentException commonsException) {
+ try {
+ digester = new BouncyCastleDigester(digestMarkLimit, line.getOptionValue("digest"));
+ } catch (IllegalArgumentException bcException) {
+ throw new IllegalArgumentException("Tried both CommonsDigester ("+commonsException.getMessage()+
+ ") and BouncyCastleDigester ("+bcException.getMessage()+")", bcException);
+ }
+ }
}
if (line.hasOption("enableFileUrl") &&
diff --git a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
index 2a09968..7b35fec 100644
--- a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
+++ b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
@@ -83,7 +83,7 @@ public abstract class CXFTestBase {
public void setUp() {
this.tika = TikaConfig.getDefaultConfig();
TikaResource.init(tika,
- new CommonsDigester(DIGESTER_READ_LIMIT, CommonsDigester.DigestAlgorithm.MD5),
+ new CommonsDigester(DIGESTER_READ_LIMIT, "md5,sha1:32"),
new DefaultInputStreamFactory());
JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
setUpResources(sf);
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
index 50d5356..5d112ff 100644
--- a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
@@ -138,6 +138,8 @@ public class TikaResourceTest extends CXFTestBase {
assertTrue(responseMsg.contains("test"));
assertContains("<meta name=\"X-TIKA:digest:MD5\" content=\"f8be45c34e8919eedba48cc8d207fbf0\"/>",
responseMsg);
+ assertContains("<meta name=\"X-TIKA:digest:SHA1\" content=\"N4EBCE7EGTIGZWETEJ6WD3W4KN32TLPG\"/>",
+ responseMsg);
}
@Test
--
To stop receiving notification emails like this one, please contact
['"commits@tika.apache.org" <co...@tika.apache.org>'].