You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by tallison@apache.org on 2017/06/02 15:20:44 UTC

[tika] branch master updated: TIKA-2384 - TikaResource can close an InputStream twice; this revealed a bug in CommonsDigester, which this also fixes. (via Haris Osmanagic).

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

The following commit(s) were added to refs/heads/master by this push:
       new  6d710da   TIKA-2384 - TikaResource can close an InputStream twice; this revealed a bug in CommonsDigester, which this also fixes. (via Haris Osmanagic).
6d710da is described below

commit 6d710da272285e04d6632b501e0752d020063182
Author: tballison <ta...@mitre.org>
AuthorDate: Fri Jun 2 11:20:28 2017 -0400

    TIKA-2384 - TikaResource can close an InputStream twice;
    this revealed a bug in CommonsDigester, which this also fixes.
    (via Haris Osmanagic).
---
 .../org/apache/tika/parser/DigestingParser.java    |  14 ++-
 .../apache/tika/parser/utils/CommonsDigester.java  | 102 ++++++++++-----------
 .../apache/tika/parser/DigestingParserTest.java    |   3 +-
 .../apache/tika/server/resource/TikaResource.java  |  27 ++++--
 .../org/apache/tika/server/TikaResourceTest.java   |  24 +++++
 5 files changed, 106 insertions(+), 64 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
index 2115001..08b028e 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
@@ -22,6 +22,8 @@ import java.io.IOException;
 import java.io.InputStream;
 
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -68,9 +70,15 @@ public class DigestingParser extends ParserDecorator {
 
     @Override
     public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
-        if (digester != null) {
-            digester.digest(stream, metadata, context);
+        TemporaryResources tmp = new TemporaryResources();
+        TikaInputStream tis = TikaInputStream.get(stream, tmp);
+        try {
+            if (digester != null) {
+                digester.digest(tis, metadata, context);
+            }
+            super.parse(tis, handler, metadata, context);
+        } finally {
+            tmp.dispose();
         }
-        super.parse(stream, handler, metadata, context);
     }
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java b/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java
index 0d2c5df..846ab72 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java
@@ -29,7 +29,6 @@ import java.util.Locale;
 import org.apache.commons.codec.digest.DigestUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.CloseShieldInputStream;
 import org.apache.tika.io.IOExceptionWithCause;
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
@@ -51,7 +50,6 @@ import org.slf4j.LoggerFactory;
  * <p>
  * If a TikaInputStream is passed in and it has an underlying file that is longer
  * than the {@link #markLimit}, then this digester digests the file directly.
- *
  */
 public class CommonsDigester implements DigestingParser.Digester {
 
@@ -68,8 +66,8 @@ public class CommonsDigester implements DigestingParser.Digester {
         SHA512;
 
         String getMetadataKey() {
-            return TikaCoreProperties.TIKA_META_PREFIX+
-                    "digest"+Metadata.NAMESPACE_PREFIX_DELIMITER+this.toString();
+            return TikaCoreProperties.TIKA_META_PREFIX +
+                    "digest" + Metadata.NAMESPACE_PREFIX_DELIMITER + this.toString();
         }
     }
 
@@ -86,55 +84,55 @@ public class CommonsDigester implements DigestingParser.Digester {
 
     @Override
     public void digest(InputStream is, Metadata m, ParseContext parseContext) throws IOException {
-        //if this is already a TikaInputStream, rely on the caller to close
-        //the stream and free the tmp file.
-        TikaInputStream tis = TikaInputStream.cast(is);
 
-        TemporaryResources tmp = null;
-        if (tis == null) {
-            //if this isn't a TikaInputStream, create a new TempResources
-            //and make sure to release it!!!
-            tmp = new TemporaryResources();
-            tis = TikaInputStream.get(new CloseShieldInputStream(is), tmp);
-        }
-        try {
+        TikaInputStream tis = TikaInputStream.cast(is);
+        if (tis != null && tis.hasFile()) {
             long sz = -1;
             if (tis.hasFile()) {
                 sz = tis.getLength();
             }
-            //if the file is definitely a file,
+            //if the inputstream has a file,
             //and its size is greater than its mark limit,
             //just digest the underlying file.
             if (sz > markLimit) {
                 digestFile(tis.getFile(), m);
                 return;
             }
+        }
 
-            //try the usual mark/reset stuff.
-            //however, if you actually hit the bound,
-            //then stop and spool to file via TikaInputStream
-            SimpleBoundedInputStream bis = new SimpleBoundedInputStream(markLimit, tis);
-            boolean finishedStream = false;
-            for (DigestAlgorithm algorithm : algorithms) {
-                bis.mark(markLimit + 1);
-                finishedStream = digestEach(algorithm, bis, m);
-                bis.reset();
-                if (!finishedStream) {
-                    break;
-                }
-            }
+        //try the usual mark/reset stuff.
+        //however, if you actually hit the bound,
+        //then stop and spool to file via TikaInputStream
+        SimpleBoundedInputStream bis = new SimpleBoundedInputStream(markLimit, is);
+        boolean finishedStream = false;
+        for (DigestAlgorithm algorithm : algorithms) {
+            bis.mark(markLimit + 1);
+            finishedStream = digestEach(algorithm, bis, m);
+            bis.reset();
             if (!finishedStream) {
-                digestFile(tis.getFile(), m);
+                break;
             }
-        } finally {
-            try {
-                if (tmp != null) {
-                    tmp.dispose();
+        }
+        //if the stream wasn't finished -- if the stream was longer than the mark limit --
+        //spool to File and digest that.
+        if (!finishedStream) {
+            if (tis != null) {
+                digestFile(tis.getFile(), m);
+            } else {
+                TemporaryResources tmp = new TemporaryResources();
+                try {
+                    TikaInputStream tmpTikaInputStream = TikaInputStream.get(is, tmp);
+                    digestFile(tmpTikaInputStream.getFile(), m);
+                } finally {
+                    try {
+                        tmp.dispose();
+                    } catch (TikaException e) {
+                        throw new IOExceptionWithCause(e);
+                    }
                 }
-            } catch (TikaException e) {
-                throw new IOExceptionWithCause(e);
             }
         }
+
     }
 
     private void digestFile(File f, Metadata m) throws IOException {
@@ -149,15 +147,14 @@ public class CommonsDigester implements DigestingParser.Digester {
     }
 
     /**
-     *
      * @param algorithm algo to use
-     * @param is input stream to read from
-     * @param metadata metadata for reporting the digest
+     * @param is        input stream to read from
+     * @param metadata  metadata for reporting the digest
      * @return whether or not this finished the input stream
      * @throws IOException
      */
     private boolean digestEach(DigestAlgorithm algorithm,
-                            InputStream is, Metadata metadata) throws IOException {
+                               InputStream is, Metadata metadata) throws IOException {
         String digest = null;
         try {
             switch (algorithm) {
@@ -187,7 +184,7 @@ public class CommonsDigester implements DigestingParser.Digester {
             //swallow, or should we throw this?
         }
         if (is instanceof SimpleBoundedInputStream) {
-            if (((SimpleBoundedInputStream)is).hasHitBound()) {
+            if (((SimpleBoundedInputStream) is).hasHitBound()) {
                 return false;
             }
         }
@@ -196,12 +193,11 @@ public class CommonsDigester implements DigestingParser.Digester {
     }
 
     /**
-     *
      * @param s comma-delimited (no space) list of algorithms to use: md5,sha256
      * @return
      */
     public static DigestAlgorithm[] parse(String s) {
-        assert(s != null);
+        assert (s != null);
 
         List<DigestAlgorithm> ret = new ArrayList<>();
         for (String algoString : s.split(",")) {
@@ -260,6 +256,7 @@ public class CommonsDigester implements DigestingParser.Digester {
 
         /**
          * Invokes the delegate's <code>read(byte[])</code> method.
+         *
          * @param b the buffer to read the bytes into
          * @return the number of bytes read or -1 if the end of stream or
          * the limit has been reached.
@@ -272,7 +269,8 @@ public class CommonsDigester implements DigestingParser.Digester {
 
         /**
          * Invokes the delegate's <code>read(byte[], int, int)</code> method.
-         * @param b the buffer to read the bytes into
+         *
+         * @param b   the buffer to read the bytes into
          * @param off The start offset
          * @param len The number of bytes to read
          * @return the number of bytes read or -1 if the end of stream or
@@ -281,37 +279,39 @@ public class CommonsDigester implements DigestingParser.Digester {
          */
         @Override
         public int read(final byte[] b, final int off, final int len) throws IOException {
-            if (max>=0 && pos>=max) {
+            if (max >= 0 && pos >= max) {
                 return EOF;
             }
-            final long maxRead = max>=0 ? Math.min(len, max-pos) : len;
-            final int bytesRead = in.read(b, off, (int)maxRead);
+            final long maxRead = max >= 0 ? Math.min(len, max - pos) : len;
+            final int bytesRead = in.read(b, off, (int) maxRead);
 
-            if (bytesRead==EOF) {
+            if (bytesRead == EOF) {
                 return EOF;
             }
 
-            pos+=bytesRead;
+            pos += bytesRead;
             return bytesRead;
         }
 
         /**
          * Invokes the delegate's <code>skip(long)</code> method.
+         *
          * @param n the number of bytes to skip
          * @return the actual number of bytes skipped
          * @throws IOException if an I/O error occurs
          */
         @Override
         public long skip(final long n) throws IOException {
-            final long toSkip = max>=0 ? Math.min(n, max-pos) : n;
+            final long toSkip = max >= 0 ? Math.min(n, max - pos) : n;
             final long skippedBytes = in.skip(toSkip);
-            pos+=skippedBytes;
+            pos += skippedBytes;
             return skippedBytes;
         }
 
         @Override
         public void reset() throws IOException {
             in.reset();
+            pos = 0;
         }
 
         @Override
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
index 7986efe..8b198a3 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
@@ -210,7 +210,8 @@ public class DigestingParserTest extends TikaTest {
             String truthValue = truth.get(P+algo.name());
             String resultValue = result.get(P+algo.name());
             assertNotNull("truth", truthValue);
-            assertNotNull("result", resultValue);
+            assertNotNull("result (fileLength="+fileLength+", markLimit="+markLimit+")",
+                    resultValue);
 
             assertEquals("fileLength("+fileLength+") markLimit("+
                     markLimit+") useTikaInputStream("+useTikaInputStream+")"+
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
index 6d75d38..10412be 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
@@ -57,7 +57,6 @@ import org.apache.tika.Tika;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.EncryptedDocumentException;
-import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaMetadataKeys;
 import org.apache.tika.mime.MediaType;
@@ -303,10 +302,24 @@ public class TikaResource {
 
     }
 
+    /**
+     * Use this to call a parser and unify exception handling.
+     * NOTE: This call to parse closes the InputStream. DO NOT surround
+     * the call in an auto-close block.
+     *
+     * @param parser parser to use
+     * @param logger logger to use
+     * @param path file path
+     * @param inputStream inputStream (which is closed by this call!)
+     * @param handler handler to use
+     * @param metadata metadata
+     * @param parseContext parse context
+     * @throws IOException wrapper for all exceptions
+     */
     public static void parse(Parser parser, Logger logger, String path, InputStream inputStream,
                              ContentHandler handler, Metadata metadata, ParseContext parseContext) throws IOException {
-        try (TikaInputStream tikaInputStream = TikaInputStream.get(inputStream)) {
-            parser.parse(tikaInputStream, handler, metadata, parseContext);
+        try {
+            parser.parse(inputStream, handler, metadata, parseContext);
         } catch (SAXException e) {
             throw new TikaServerParseException(e);
         } catch (EncryptedDocumentException e) {
@@ -374,9 +387,7 @@ public class TikaResource {
 
                 ContentHandler handler = new BoilerpipeContentHandler(writer);
 
-                try (InputStream inputStream = is) {
-                    parse(parser, LOG, info.getPath(), inputStream, handler, metadata, context);
-                }
+                parse(parser, LOG, info.getPath(), is, handler, metadata, context);
             }
         };
     }
@@ -405,9 +416,7 @@ public class TikaResource {
 
                 BodyContentHandler body = new BodyContentHandler(new RichTextContentHandler(writer));
 
-                try (InputStream inputStream = is) {
-                    parse(parser, LOG, info.getPath(), inputStream, body, metadata, context);
-                }
+                parse(parser, LOG, info.getPath(), is, body, metadata, context);
             }
         };
     }
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
index 6302d90..50d5356 100644
--- a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
@@ -40,6 +40,8 @@ public class TikaResourceTest extends CXFTestBase {
     public static final String TEST_PASSWORD_PROTECTED = "password.xls";
     private static final String TEST_RECURSIVE_DOC = "test_recursive_embedded.docx";
 
+    private static final String STREAM_CLOSED_FAULT = "java.io.IOException: Stream Closed";
+
     private static final String TIKA_PATH = "/tika";
     private static final int UNPROCESSEABLE = 422;
 
@@ -253,4 +255,26 @@ public class TikaResourceTest extends CXFTestBase {
                 .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));
         assertEquals(500, response.getStatus());
     }
+
+    @Test
+    public void testExtractTextAcceptPlainText() throws Exception {
+        //TIKA-2384
+        Attachment attachmentPart = new Attachment(
+                "my-docx-file",
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                ClassLoader.getSystemResourceAsStream("2pic.docx")
+        );
+
+        Response response = WebClient.create(endPoint + TIKA_PATH + "/form")
+                .type("multipart/form-data")
+                .accept("text/plain")
+                .post(attachmentPart);
+
+        String responseMsg = getStringFromInputStream((InputStream) response.getEntity());
+        assertTrue(responseMsg.contains("P1040893.JPG"));
+        assertNotFound(
+                STREAM_CLOSED_FAULT,
+                responseMsg
+        );
+    }
 }

-- 
To stop receiving notification emails like this one, please contact
commits@tika.apache.org.