You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/04/14 21:06:06 UTC

[tika] branch master updated: TIKA-3092 -- fix multithreaded skipfully in HWP parser

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new ae60b20  TIKA-3092 -- fix multithreaded skipfully in HWP parser
ae60b20 is described below

commit ae60b205b45e89c7cde7792937faa37a03dbf9db
Author: tallison <ta...@apache.org>
AuthorDate: Tue Apr 14 16:53:27 2020 -0400

    TIKA-3092 -- fix multithreaded skipfully in HWP parser
---
 .../src/main/java/org/apache/tika/io/IOUtils.java    | 20 ++++++++++++++++++++
 .../java/org/apache/tika/io/TikaInputStream.java     |  9 ++++++++-
 .../java/org/apache/tika/MultiThreadedTikaTest.java  |  1 -
 .../org/apache/tika/parser/hwp/HwpStreamReader.java  | 18 ++++++++----------
 .../apache/tika/parser/hwp/HwpTextExtractorV5.java   |  6 +++---
 .../org/apache/tika/parser/hwp/HwpV5ParserTest.java  | 19 ++++++++++++++++++-
 6 files changed, 57 insertions(+), 16 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/io/IOUtils.java b/tika-core/src/main/java/org/apache/tika/io/IOUtils.java
index 18e86f1..b6c18c1 100644
--- a/tika-core/src/main/java/org/apache/tika/io/IOUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/IOUtils.java
@@ -1264,4 +1264,24 @@ public class IOUtils {
         return toSkip - remain;
     }
 
+    public static long skip(final InputStream input, final long toSkip, byte[] buffer) throws IOException {
+        if (toSkip < 0) {
+            throw new IllegalArgumentException("Skip count must be non-negative, actual: " + toSkip);
+        }
+        /*
+         * N.B. no need to synchronize this because: - we don't care if the buffer is created multiple times (the data
+         * is ignored) - we always use the same size buffer, so if it it is recreated it will still be OK (if the buffer
+         * size were variable, we would need to synch. to ensure some other thread did not create a smaller one)
+         */
+        long remain = toSkip;
+        while (remain > 0) {
+            // See https://issues.apache.org/jira/browse/IO-203 for why we use read() rather than delegating to skip()
+            final long n = input.read(buffer, 0, (int) Math.min(remain, buffer.length));
+            if (n < 0) { // EOF
+                break;
+            }
+            remain -= n;
+        }
+        return toSkip - remain;
+    }
 }
diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
index 3997f9e..a0d0526 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
@@ -512,6 +512,7 @@ public class TikaInputStream extends TaggedInputStream {
 
     private int consecutiveEOFs = 0;
 
+    private byte[] skipBuffer;
     /**
      * Creates a TikaInputStream instance. This private constructor is used
      * by the static factory methods based on the available information.
@@ -750,7 +751,13 @@ public class TikaInputStream extends TaggedInputStream {
      */
     @Override
     public long skip(long ln) throws IOException {
-        long n = IOUtils.skip(super.in, ln);
+        //On TIKA-3092, we found that using the static byte array buffer
+        //caused problems with multithreading with the FlateInputStream
+        //from a POIFS document stream
+        if (skipBuffer == null) {
+            skipBuffer = new byte[4096];
+        }
+        long n = IOUtils.skip(super.in, ln, skipBuffer);
         if (n != ln) {
             throw new IOException("tried to skip "+ln + " but actually skipped: "+n);
         }
diff --git a/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java b/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java
index 9450f24..67f1ffd 100644
--- a/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java
@@ -25,7 +25,6 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
 import org.apache.tika.sax.BasicContentHandlerFactory;
 import org.apache.tika.sax.RecursiveParserWrapperHandler;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpStreamReader.java b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpStreamReader.java
index 978b361..2ba548c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpStreamReader.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpStreamReader.java
@@ -24,6 +24,7 @@ import org.apache.poi.util.IOUtils;
 import org.apache.poi.util.LittleEndian;
 
 public class HwpStreamReader {
+    private byte[] skipBuffer = new byte[4096];
     private InputStream input;
     private byte[] buf;
 
@@ -32,15 +33,6 @@ public class HwpStreamReader {
         buf = new byte[4];
     }
 
-    /**
-     * More data to read ?
-     *
-     * @return
-     * @throws IOException
-     */
-    public boolean available() throws IOException {
-        return input.available() > 0;
-    }
 
     /**
      * unsigned 1 byte
@@ -129,6 +121,12 @@ public class HwpStreamReader {
      * @throws IOException
      */
     public void ensureSkip(long n) throws IOException {
-        IOUtils.skipFully(input, n);
+        //Leaving this for anyone who can figure out why this doesn't
+        //work.  See HwpV5ParserTest#testMultiThreadedSkipFully
+        //long skipped = org.apache.tika.io.IOUtils.skip(input, n);
+        long skipped = org.apache.tika.io.IOUtils.skip(input, n, skipBuffer);
+        if (skipped != n) {
+            throw new EOFException();
+        }
     }
 }
\ No newline at end of file
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
index 8f95831..48b8c02 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
@@ -37,6 +37,7 @@ import java.util.Locale;
 import java.util.zip.Inflater;
 import java.util.zip.InflaterInputStream;
 
+import org.apache.commons.compress.compressors.deflate.DeflateParameters;
 import org.apache.poi.hpsf.NoPropertySetStreamException;
 import org.apache.poi.hpsf.Property;
 import org.apache.poi.hpsf.PropertySet;
@@ -261,7 +262,6 @@ public class HwpTextExtractorV5 implements Serializable {
             if (entry.getName().startsWith("Section")
                     && entry instanceof DocumentEntry) {
                 LOG.debug("extract {}", entry.getName());
-
                 InputStream input = new DocumentInputStream(
                         (DocumentEntry) entry);
 
@@ -384,7 +384,7 @@ public class HwpTextExtractorV5 implements Serializable {
      */
     private void parse(HwpStreamReader reader, XHTMLContentHandler xhtml)
             throws IOException, SAXException {
-        StringBuffer buf = new StringBuffer(1024);
+        StringBuilder buf = new StringBuilder();
         TagInfo tag = new TagInfo();
 
         while (true) {
@@ -421,7 +421,7 @@ public class HwpTextExtractorV5 implements Serializable {
      * @throws IOException
      */
     private void writeParaText(HwpStreamReader reader, long datasize,
-                               StringBuffer buf) throws IOException {
+                               StringBuilder buf) throws IOException {
         int[] chars = reader.uint16((int) (datasize / 2));
 
         for (int index = 0; index < chars.length; index++) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java
index 76dc470..523f17c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java
@@ -18,13 +18,19 @@ package org.apache.tika.parser.hwp;
 
 import static org.junit.Assert.assertEquals;
 
+import org.apache.commons.io.filefilter.RegexFileFilter;
+import org.apache.tika.MultiThreadedTikaTest;
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.RecursiveParserWrapper;
 import org.junit.Test;
 
-public class HwpV5ParserTest extends TikaTest {
+import java.nio.file.Paths;
+
+public class HwpV5ParserTest extends MultiThreadedTikaTest {
 
     @Test
     public void testHwpV5Parser() throws Exception {
@@ -65,4 +71,15 @@ public class HwpV5ParserTest extends TikaTest {
         assertEquals("next1009", metadata.get(TikaCoreProperties.CREATOR));
         assertEquals("\uD14C\uC2A4\uD2B8", metadata.get(TikaCoreProperties.TITLE));
     }
+
+    @Test
+    public void testMultiThreadedSkipFully() throws Exception {
+        //TIKA-3092
+        int numThreads = 2;
+        int numIterations = 50;
+        ParseContext[] parseContexts = new ParseContext[numThreads];
+
+        testMultiThreaded(new RecursiveParserWrapper(AUTO_DETECT_PARSER), parseContexts, numThreads, numIterations,
+                new RegexFileFilter(".*\\.hwp"));
+    }
 }