You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by lf...@apache.org on 2020/11/26 19:38:05 UTC
[tika] branch branch_1x updated: TIKA-3237: great optimization in ForkParser
This is an automated email from the ASF dual-hosted git repository.
lfcnassif pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new 0207bd0 TIKA-3237: great optimization in ForkParser
0207bd0 is described below
commit 0207bd0053b6824fda168d8a0a66282b7199951f
Author: Luis Nassif <lf...@gmail.com>
AuthorDate: Thu Nov 26 16:37:28 2020 -0300
TIKA-3237: great optimization in ForkParser
---
CHANGES.txt | 4 ++++
.../org/apache/tika/fork/ContentHandlerProxy.java | 23 +++++++++++++++++-----
.../apache/tika/fork/ContentHandlerResource.java | 16 +++++++++------
.../java/org/apache/tika/fork/ForkParserTest.java | 18 +++++++++++++++--
4 files changed, 48 insertions(+), 13 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index ff2a4ef..c620efc 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,7 @@
+Release 1.26 - xx/xx/xxxx
+
+ * Great optimization in ForkParser (TIKA-3237).
+
Release 1.25 - 11/25/2020
* Fix inconsistent license in xmpcore (TIKA-3204).
diff --git a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java
index 32f7e83..e7d8572 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java
@@ -67,7 +67,7 @@ class ContentHandlerProxy implements ContentHandler, ForkProxy {
try {
if (string != null) {
output.writeBoolean(true);
- output.writeUTF(string);
+ writeString(string);
} else {
output.writeBoolean(false);
}
@@ -75,14 +75,27 @@ class ContentHandlerProxy implements ContentHandler, ForkProxy {
throw new SAXException("Unexpected fork proxy problem", e);
}
}
+
+ /**
+ * Splits the string into chunks of at most 21,845 characters so that writeUTF does not throw UTFDataFormatException (observed at least on Oracle JDK 8).
+ */
+ private void writeString(String string) throws IOException {
+ int max = 65535 / 3;
+ int frags = (int) Math.ceil((double) string.length() / max);
+ output.writeInt(frags);
+ int i = 0;
+ while (i < frags) {
+ int end = (i < frags - 1) ? (i + 1) * max : string.length();
+ output.writeUTF(string.substring(i * max, end));
+ i++;
+ }
+ }
private void sendCharacters(char[] ch, int start, int length)
throws SAXException {
try {
- output.writeInt(length);
- for (int i = 0; i < length; i++) {
- output.writeChar(ch[start + i]);
- }
+ writeString(new String(ch, start, length));
+
} catch (IOException e) {
throw new SAXException("Unexpected fork proxy problem", e);
}
diff --git a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java
index d6ef5b4..a43fbea 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java
@@ -89,19 +89,23 @@ class ContentHandlerResource implements ForkResource {
private String readString(DataInputStream input) throws IOException {
if (input.readBoolean()) {
- return input.readUTF();
+ return readStringUTF(input);
} else {
return null;
}
}
private char[] readCharacters(DataInputStream input) throws IOException {
- int n = input.readInt();
- char[] ch = new char[n];
- for (int i = 0; i < n; i++) {
- ch[i] = input.readChar();
+ return readStringUTF(input).toCharArray();
+ }
+
+ private String readStringUTF(DataInputStream input) throws IOException {
+ int frags = input.readInt();
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < frags; i++) {
+ sb.append(input.readUTF());
}
- return ch;
+ return sb.toString();
}
}
diff --git a/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java b/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java
index b5ea825..d450825 100644
--- a/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java
@@ -21,6 +21,8 @@ import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
@@ -41,8 +43,8 @@ import java.util.concurrent.Semaphore;
import org.apache.tika.TikaTest;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.DublinCore;
import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
@@ -55,7 +57,6 @@ import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
-import org.junit.Ignore;
import org.junit.Test;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -81,6 +82,8 @@ public class ForkParserTest extends TikaTest {
}
}
+
+
@Test
public void testSerialParsing() throws Exception {
ForkParser parser = new ForkParser(
@@ -445,6 +448,17 @@ public class ForkParserTest extends TikaTest {
assertEquals("/embed1.xml", m1.get(RecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH));
}
+ @Test
+ public void testNoUTFDataFormatException() throws Exception {
+ ContentHandlerProxy proxy = new ContentHandlerProxy(0);
+ DataOutputStream output = new DataOutputStream(new ByteArrayOutputStream());
+ proxy.init(null, output);
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < 65536; i++) {
+ sb.append(1);
+ }
+ proxy.skippedEntity(sb.toString());
+ }
//use this to test that the wrapper handler is acted upon by the server but not proxied back
private static class ToFileHandler extends AbstractRecursiveParserWrapperHandler {