You are viewing a plain text version of this content; the hyperlink to the canonical version was removed during text extraction and is available in the original mailing-list archive.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/06/02 15:20:44 UTC
[tika] branch master updated: TIKA-2384 - TikaResource can close an
InputStream twice;
this revealed a bug in CommonsDigester, which this also fixes. (via Haris
Osmanagic).
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 6d710da TIKA-2384 - TikaResource can close an InputStream twice; this revealed a bug in CommonsDigester, which this also fixes. (via Haris Osmanagic).
6d710da is described below
commit 6d710da272285e04d6632b501e0752d020063182
Author: tballison <ta...@mitre.org>
AuthorDate: Fri Jun 2 11:20:28 2017 -0400
TIKA-2384 - TikaResource can close an InputStream twice;
this revealed a bug in CommonsDigester, which this also fixes.
(via Haris Osmanagic).
---
.../org/apache/tika/parser/DigestingParser.java | 14 ++-
.../apache/tika/parser/utils/CommonsDigester.java | 102 ++++++++++-----------
.../apache/tika/parser/DigestingParserTest.java | 3 +-
.../apache/tika/server/resource/TikaResource.java | 27 ++++--
.../org/apache/tika/server/TikaResourceTest.java | 24 +++++
5 files changed, 106 insertions(+), 64 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
index 2115001..08b028e 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
@@ -22,6 +22,8 @@ import java.io.IOException;
import java.io.InputStream;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -68,9 +70,15 @@ public class DigestingParser extends ParserDecorator {
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
- if (digester != null) {
- digester.digest(stream, metadata, context);
+ TemporaryResources tmp = new TemporaryResources();
+ TikaInputStream tis = TikaInputStream.get(stream, tmp);
+ try {
+ if (digester != null) {
+ digester.digest(tis, metadata, context);
+ }
+ super.parse(tis, handler, metadata, context);
+ } finally {
+ tmp.dispose();
}
- super.parse(stream, handler, metadata, context);
}
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java b/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java
index 0d2c5df..846ab72 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java
@@ -29,7 +29,6 @@ import java.util.Locale;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.IOUtils;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.io.IOExceptionWithCause;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
@@ -51,7 +50,6 @@ import org.slf4j.LoggerFactory;
* <p>
* If a TikaInputStream is passed in and it has an underlying file that is longer
* than the {@link #markLimit}, then this digester digests the file directly.
- *
*/
public class CommonsDigester implements DigestingParser.Digester {
@@ -68,8 +66,8 @@ public class CommonsDigester implements DigestingParser.Digester {
SHA512;
String getMetadataKey() {
- return TikaCoreProperties.TIKA_META_PREFIX+
- "digest"+Metadata.NAMESPACE_PREFIX_DELIMITER+this.toString();
+ return TikaCoreProperties.TIKA_META_PREFIX +
+ "digest" + Metadata.NAMESPACE_PREFIX_DELIMITER + this.toString();
}
}
@@ -86,55 +84,55 @@ public class CommonsDigester implements DigestingParser.Digester {
@Override
public void digest(InputStream is, Metadata m, ParseContext parseContext) throws IOException {
- //if this is already a TikaInputStream, rely on the caller to close
- //the stream and free the tmp file.
- TikaInputStream tis = TikaInputStream.cast(is);
- TemporaryResources tmp = null;
- if (tis == null) {
- //if this isn't a TikaInputStream, create a new TempResources
- //and make sure to release it!!!
- tmp = new TemporaryResources();
- tis = TikaInputStream.get(new CloseShieldInputStream(is), tmp);
- }
- try {
+ TikaInputStream tis = TikaInputStream.cast(is);
+ if (tis != null && tis.hasFile()) {
long sz = -1;
if (tis.hasFile()) {
sz = tis.getLength();
}
- //if the file is definitely a file,
+ //if the inputstream has a file,
//and its size is greater than its mark limit,
//just digest the underlying file.
if (sz > markLimit) {
digestFile(tis.getFile(), m);
return;
}
+ }
- //try the usual mark/reset stuff.
- //however, if you actually hit the bound,
- //then stop and spool to file via TikaInputStream
- SimpleBoundedInputStream bis = new SimpleBoundedInputStream(markLimit, tis);
- boolean finishedStream = false;
- for (DigestAlgorithm algorithm : algorithms) {
- bis.mark(markLimit + 1);
- finishedStream = digestEach(algorithm, bis, m);
- bis.reset();
- if (!finishedStream) {
- break;
- }
- }
+ //try the usual mark/reset stuff.
+ //however, if you actually hit the bound,
+ //then stop and spool to file via TikaInputStream
+ SimpleBoundedInputStream bis = new SimpleBoundedInputStream(markLimit, is);
+ boolean finishedStream = false;
+ for (DigestAlgorithm algorithm : algorithms) {
+ bis.mark(markLimit + 1);
+ finishedStream = digestEach(algorithm, bis, m);
+ bis.reset();
if (!finishedStream) {
- digestFile(tis.getFile(), m);
+ break;
}
- } finally {
- try {
- if (tmp != null) {
- tmp.dispose();
+ }
+ //if the stream wasn't finished -- if the stream was longer than the mark limit --
+ //spool to File and digest that.
+ if (!finishedStream) {
+ if (tis != null) {
+ digestFile(tis.getFile(), m);
+ } else {
+ TemporaryResources tmp = new TemporaryResources();
+ try {
+ TikaInputStream tmpTikaInputStream = TikaInputStream.get(is, tmp);
+ digestFile(tmpTikaInputStream.getFile(), m);
+ } finally {
+ try {
+ tmp.dispose();
+ } catch (TikaException e) {
+ throw new IOExceptionWithCause(e);
+ }
}
- } catch (TikaException e) {
- throw new IOExceptionWithCause(e);
}
}
+
}
private void digestFile(File f, Metadata m) throws IOException {
@@ -149,15 +147,14 @@ public class CommonsDigester implements DigestingParser.Digester {
}
/**
- *
* @param algorithm algo to use
- * @param is input stream to read from
- * @param metadata metadata for reporting the digest
+ * @param is input stream to read from
+ * @param metadata metadata for reporting the digest
* @return whether or not this finished the input stream
* @throws IOException
*/
private boolean digestEach(DigestAlgorithm algorithm,
- InputStream is, Metadata metadata) throws IOException {
+ InputStream is, Metadata metadata) throws IOException {
String digest = null;
try {
switch (algorithm) {
@@ -187,7 +184,7 @@ public class CommonsDigester implements DigestingParser.Digester {
//swallow, or should we throw this?
}
if (is instanceof SimpleBoundedInputStream) {
- if (((SimpleBoundedInputStream)is).hasHitBound()) {
+ if (((SimpleBoundedInputStream) is).hasHitBound()) {
return false;
}
}
@@ -196,12 +193,11 @@ public class CommonsDigester implements DigestingParser.Digester {
}
/**
- *
* @param s comma-delimited (no space) list of algorithms to use: md5,sha256
* @return
*/
public static DigestAlgorithm[] parse(String s) {
- assert(s != null);
+ assert (s != null);
List<DigestAlgorithm> ret = new ArrayList<>();
for (String algoString : s.split(",")) {
@@ -260,6 +256,7 @@ public class CommonsDigester implements DigestingParser.Digester {
/**
* Invokes the delegate's <code>read(byte[])</code> method.
+ *
* @param b the buffer to read the bytes into
* @return the number of bytes read or -1 if the end of stream or
* the limit has been reached.
@@ -272,7 +269,8 @@ public class CommonsDigester implements DigestingParser.Digester {
/**
* Invokes the delegate's <code>read(byte[], int, int)</code> method.
- * @param b the buffer to read the bytes into
+ *
+ * @param b the buffer to read the bytes into
* @param off The start offset
* @param len The number of bytes to read
* @return the number of bytes read or -1 if the end of stream or
@@ -281,37 +279,39 @@ public class CommonsDigester implements DigestingParser.Digester {
*/
@Override
public int read(final byte[] b, final int off, final int len) throws IOException {
- if (max>=0 && pos>=max) {
+ if (max >= 0 && pos >= max) {
return EOF;
}
- final long maxRead = max>=0 ? Math.min(len, max-pos) : len;
- final int bytesRead = in.read(b, off, (int)maxRead);
+ final long maxRead = max >= 0 ? Math.min(len, max - pos) : len;
+ final int bytesRead = in.read(b, off, (int) maxRead);
- if (bytesRead==EOF) {
+ if (bytesRead == EOF) {
return EOF;
}
- pos+=bytesRead;
+ pos += bytesRead;
return bytesRead;
}
/**
* Invokes the delegate's <code>skip(long)</code> method.
+ *
* @param n the number of bytes to skip
* @return the actual number of bytes skipped
* @throws IOException if an I/O error occurs
*/
@Override
public long skip(final long n) throws IOException {
- final long toSkip = max>=0 ? Math.min(n, max-pos) : n;
+ final long toSkip = max >= 0 ? Math.min(n, max - pos) : n;
final long skippedBytes = in.skip(toSkip);
- pos+=skippedBytes;
+ pos += skippedBytes;
return skippedBytes;
}
@Override
public void reset() throws IOException {
in.reset();
+ pos = 0;
}
@Override
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
index 7986efe..8b198a3 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
@@ -210,7 +210,8 @@ public class DigestingParserTest extends TikaTest {
String truthValue = truth.get(P+algo.name());
String resultValue = result.get(P+algo.name());
assertNotNull("truth", truthValue);
- assertNotNull("result", resultValue);
+ assertNotNull("result (fileLength="+fileLength+", markLimit="+markLimit+")",
+ resultValue);
assertEquals("fileLength("+fileLength+") markLimit("+
markLimit+") useTikaInputStream("+useTikaInputStream+")"+
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
index 6d75d38..10412be 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
@@ -57,7 +57,6 @@ import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.EncryptedDocumentException;
-import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.mime.MediaType;
@@ -303,10 +302,24 @@ public class TikaResource {
}
+ /**
+ * Use this to call a parser and unify exception handling.
+ * NOTE: This call to parse closes the InputStream. DO NOT surround
+ * the call in an auto-close block.
+ *
+ * @param parser parser to use
+ * @param logger logger to use
+ * @param path file path
+ * @param inputStream inputStream (which is closed by this call!)
+ * @param handler handler to use
+ * @param metadata metadata
+ * @param parseContext parse context
+ * @throws IOException wrapper for all exceptions
+ */
public static void parse(Parser parser, Logger logger, String path, InputStream inputStream,
ContentHandler handler, Metadata metadata, ParseContext parseContext) throws IOException {
- try (TikaInputStream tikaInputStream = TikaInputStream.get(inputStream)) {
- parser.parse(tikaInputStream, handler, metadata, parseContext);
+ try {
+ parser.parse(inputStream, handler, metadata, parseContext);
} catch (SAXException e) {
throw new TikaServerParseException(e);
} catch (EncryptedDocumentException e) {
@@ -374,9 +387,7 @@ public class TikaResource {
ContentHandler handler = new BoilerpipeContentHandler(writer);
- try (InputStream inputStream = is) {
- parse(parser, LOG, info.getPath(), inputStream, handler, metadata, context);
- }
+ parse(parser, LOG, info.getPath(), is, handler, metadata, context);
}
};
}
@@ -405,9 +416,7 @@ public class TikaResource {
BodyContentHandler body = new BodyContentHandler(new RichTextContentHandler(writer));
- try (InputStream inputStream = is) {
- parse(parser, LOG, info.getPath(), inputStream, body, metadata, context);
- }
+ parse(parser, LOG, info.getPath(), is, body, metadata, context);
}
};
}
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
index 6302d90..50d5356 100644
--- a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
@@ -40,6 +40,8 @@ public class TikaResourceTest extends CXFTestBase {
public static final String TEST_PASSWORD_PROTECTED = "password.xls";
private static final String TEST_RECURSIVE_DOC = "test_recursive_embedded.docx";
+ private static final String STREAM_CLOSED_FAULT = "java.io.IOException: Stream Closed";
+
private static final String TIKA_PATH = "/tika";
private static final int UNPROCESSEABLE = 422;
@@ -253,4 +255,26 @@ public class TikaResourceTest extends CXFTestBase {
.put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));
assertEquals(500, response.getStatus());
}
+
+ @Test
+ public void testExtractTextAcceptPlainText() throws Exception {
+ //TIKA-2384
+ Attachment attachmentPart = new Attachment(
+ "my-docx-file",
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ ClassLoader.getSystemResourceAsStream("2pic.docx")
+ );
+
+ Response response = WebClient.create(endPoint + TIKA_PATH + "/form")
+ .type("multipart/form-data")
+ .accept("text/plain")
+ .post(attachmentPart);
+
+ String responseMsg = getStringFromInputStream((InputStream) response.getEntity());
+ assertTrue(responseMsg.contains("P1040893.JPG"));
+ assertNotFound(
+ STREAM_CLOSED_FAULT,
+ responseMsg
+ );
+ }
}
--
To stop receiving notification emails like this one, please contact
commits@tika.apache.org.