You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/04/14 21:06:06 UTC
[tika] branch master updated: TIKA-3092 -- fix multithreaded
skipfully in HWP parser
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new ae60b20 TIKA-3092 -- fix multithreaded skipfully in HWP parser
ae60b20 is described below
commit ae60b205b45e89c7cde7792937faa37a03dbf9db
Author: tallison <ta...@apache.org>
AuthorDate: Tue Apr 14 16:53:27 2020 -0400
TIKA-3092 -- fix multithreaded skipfully in HWP parser
---
.../src/main/java/org/apache/tika/io/IOUtils.java | 20 ++++++++++++++++++++
.../java/org/apache/tika/io/TikaInputStream.java | 9 ++++++++-
.../java/org/apache/tika/MultiThreadedTikaTest.java | 1 -
.../org/apache/tika/parser/hwp/HwpStreamReader.java | 18 ++++++++----------
.../apache/tika/parser/hwp/HwpTextExtractorV5.java | 6 +++---
.../org/apache/tika/parser/hwp/HwpV5ParserTest.java | 19 ++++++++++++++++++-
6 files changed, 57 insertions(+), 16 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/io/IOUtils.java b/tika-core/src/main/java/org/apache/tika/io/IOUtils.java
index 18e86f1..b6c18c1 100644
--- a/tika-core/src/main/java/org/apache/tika/io/IOUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/IOUtils.java
@@ -1264,4 +1264,24 @@ public class IOUtils {
return toSkip - remain;
}
+ public static long skip(final InputStream input, final long toSkip, byte[] buffer) throws IOException {
+ if (toSkip < 0) {
+ throw new IllegalArgumentException("Skip count must be non-negative, actual: " + toSkip);
+ }
+ /*
+ * N.B. no need to synchronize this because: - we don't care if the buffer is created multiple times (the data
+ * is ignored) - we always use the same size buffer, so if it it is recreated it will still be OK (if the buffer
+ * size were variable, we would need to synch. to ensure some other thread did not create a smaller one)
+ */
+ long remain = toSkip;
+ while (remain > 0) {
+ // See https://issues.apache.org/jira/browse/IO-203 for why we use read() rather than delegating to skip()
+ final long n = input.read(buffer, 0, (int) Math.min(remain, buffer.length));
+ if (n < 0) { // EOF
+ break;
+ }
+ remain -= n;
+ }
+ return toSkip - remain;
+ }
}
diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
index 3997f9e..a0d0526 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
@@ -512,6 +512,7 @@ public class TikaInputStream extends TaggedInputStream {
private int consecutiveEOFs = 0;
+ private byte[] skipBuffer;
/**
* Creates a TikaInputStream instance. This private constructor is used
* by the static factory methods based on the available information.
@@ -750,7 +751,13 @@ public class TikaInputStream extends TaggedInputStream {
*/
@Override
public long skip(long ln) throws IOException {
- long n = IOUtils.skip(super.in, ln);
+ //On TIKA-3092, we found that using the static byte array buffer
+ //caused problems with multithreading with the FlateInputStream
+ //from a POIFS document stream
+ if (skipBuffer == null) {
+ skipBuffer = new byte[4096];
+ }
+ long n = IOUtils.skip(super.in, ln, skipBuffer);
if (n != ln) {
throw new IOException("tried to skip "+ln + " but actually skipped: "+n);
}
diff --git a/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java b/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java
index 9450f24..67f1ffd 100644
--- a/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java
@@ -25,7 +25,6 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpStreamReader.java b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpStreamReader.java
index 978b361..2ba548c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpStreamReader.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpStreamReader.java
@@ -24,6 +24,7 @@ import org.apache.poi.util.IOUtils;
import org.apache.poi.util.LittleEndian;
public class HwpStreamReader {
+ private byte[] skipBuffer = new byte[4096];
private InputStream input;
private byte[] buf;
@@ -32,15 +33,6 @@ public class HwpStreamReader {
buf = new byte[4];
}
- /**
- * More data to read ?
- *
- * @return
- * @throws IOException
- */
- public boolean available() throws IOException {
- return input.available() > 0;
- }
/**
* unsigned 1 byte
@@ -129,6 +121,12 @@ public class HwpStreamReader {
* @throws IOException
*/
public void ensureSkip(long n) throws IOException {
- IOUtils.skipFully(input, n);
+ //Leaving this for anyone who can figure out why this doesn't
+ //work. See HwpV5ParserTest#testMultiThreadedSkipFully
+ //long skipped = org.apache.tika.io.IOUtils.skip(input, n);
+ long skipped = org.apache.tika.io.IOUtils.skip(input, n, skipBuffer);
+ if (skipped != n) {
+ throw new EOFException();
+ }
}
}
\ No newline at end of file
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
index 8f95831..48b8c02 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
@@ -37,6 +37,7 @@ import java.util.Locale;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;
+import org.apache.commons.compress.compressors.deflate.DeflateParameters;
import org.apache.poi.hpsf.NoPropertySetStreamException;
import org.apache.poi.hpsf.Property;
import org.apache.poi.hpsf.PropertySet;
@@ -261,7 +262,6 @@ public class HwpTextExtractorV5 implements Serializable {
if (entry.getName().startsWith("Section")
&& entry instanceof DocumentEntry) {
LOG.debug("extract {}", entry.getName());
-
InputStream input = new DocumentInputStream(
(DocumentEntry) entry);
@@ -384,7 +384,7 @@ public class HwpTextExtractorV5 implements Serializable {
*/
private void parse(HwpStreamReader reader, XHTMLContentHandler xhtml)
throws IOException, SAXException {
- StringBuffer buf = new StringBuffer(1024);
+ StringBuilder buf = new StringBuilder();
TagInfo tag = new TagInfo();
while (true) {
@@ -421,7 +421,7 @@ public class HwpTextExtractorV5 implements Serializable {
* @throws IOException
*/
private void writeParaText(HwpStreamReader reader, long datasize,
- StringBuffer buf) throws IOException {
+ StringBuilder buf) throws IOException {
int[] chars = reader.uint16((int) (datasize / 2));
for (int index = 0; index < chars.length; index++) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java
index 76dc470..523f17c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java
@@ -18,13 +18,19 @@ package org.apache.tika.parser.hwp;
import static org.junit.Assert.assertEquals;
+import org.apache.commons.io.filefilter.RegexFileFilter;
+import org.apache.tika.MultiThreadedTikaTest;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.RecursiveParserWrapper;
import org.junit.Test;
-public class HwpV5ParserTest extends TikaTest {
+import java.nio.file.Paths;
+
+public class HwpV5ParserTest extends MultiThreadedTikaTest {
@Test
public void testHwpV5Parser() throws Exception {
@@ -65,4 +71,15 @@ public class HwpV5ParserTest extends TikaTest {
assertEquals("next1009", metadata.get(TikaCoreProperties.CREATOR));
assertEquals("\uD14C\uC2A4\uD2B8", metadata.get(TikaCoreProperties.TITLE));
}
+
+ @Test
+ public void testMultiThreadedSkipFully() throws Exception {
+ //TIKA-3092
+ int numThreads = 2;
+ int numIterations = 50;
+ ParseContext[] parseContexts = new ParseContext[numThreads];
+
+ testMultiThreaded(new RecursiveParserWrapper(AUTO_DETECT_PARSER), parseContexts, numThreads, numIterations,
+ new RegexFileFilter(".*\\.hwp"));
+ }
}