You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by lf...@apache.org on 2020/11/26 20:22:42 UTC
[tika] branch main updated: TIKA-3237: great optimization in ForkParser
This is an automated email from the ASF dual-hosted git repository.
lfcnassif pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new d8b0d1f TIKA-3237: great optimization in ForkParser
d8b0d1f is described below
commit d8b0d1fe33942ef49dd34aa58c14b73e27daeadd
Author: Luis Nassif <lf...@gmail.com>
AuthorDate: Thu Nov 26 16:37:28 2020 -0300
TIKA-3237: great optimization in ForkParser
---
CHANGES.txt | 6 +++++-
.../org/apache/tika/fork/ContentHandlerProxy.java | 23 +++++++++++++++++-----
.../apache/tika/fork/ContentHandlerResource.java | 16 +++++++++------
.../java/org/apache/tika/fork/ForkParserTest.java | 14 +++++++++++++
4 files changed, 47 insertions(+), 12 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 4f89a05..6591f41 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -12,8 +12,12 @@ Release 2.0.0 - ???
* General code cleanup (PeterAlfredLee)
Other changes
+
+Release 1.26 - ???
+
+ * Great optimization in ForkParser (TIKA-3237).
-Release 1.25 - 11/23/2020
+Release 1.25 - 11/25/2020
* Fix inconsistent license in xmpcore (TIKA-3204).
diff --git a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java
index 32f7e83..e7d8572 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java
@@ -67,7 +67,7 @@ class ContentHandlerProxy implements ContentHandler, ForkProxy {
try {
if (string != null) {
output.writeBoolean(true);
- output.writeUTF(string);
+ writeString(string);
} else {
output.writeBoolean(false);
}
@@ -75,14 +75,27 @@ class ContentHandlerProxy implements ContentHandler, ForkProxy {
throw new SAXException("Unexpected fork proxy problem", e);
}
}
+
+ /**
+ * Splits the string into chunks of at most 21,845 chars (65535 / 3) so that no single writeUTF call throws UTFDataFormatException, at least in Oracle JDK 8.
+ */
+ private void writeString(String string) throws IOException {
+ int max = 65535 / 3;
+ int frags = (int) Math.ceil((double) string.length() / max);
+ output.writeInt(frags);
+ int i = 0;
+ while (i < frags) {
+ int end = (i < frags - 1) ? (i + 1) * max : string.length();
+ output.writeUTF(string.substring(i * max, end));
+ i++;
+ }
+ }
private void sendCharacters(char[] ch, int start, int length)
throws SAXException {
try {
- output.writeInt(length);
- for (int i = 0; i < length; i++) {
- output.writeChar(ch[start + i]);
- }
+ writeString(new String(ch, start, length));
+
} catch (IOException e) {
throw new SAXException("Unexpected fork proxy problem", e);
}
diff --git a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java
index d6ef5b4..a43fbea 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java
@@ -89,19 +89,23 @@ class ContentHandlerResource implements ForkResource {
private String readString(DataInputStream input) throws IOException {
if (input.readBoolean()) {
- return input.readUTF();
+ return readStringUTF(input);
} else {
return null;
}
}
private char[] readCharacters(DataInputStream input) throws IOException {
- int n = input.readInt();
- char[] ch = new char[n];
- for (int i = 0; i < n; i++) {
- ch[i] = input.readChar();
+ return readStringUTF(input).toCharArray();
+ }
+
+ private String readStringUTF(DataInputStream input) throws IOException {
+ int frags = input.readInt();
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < frags; i++) {
+ sb.append(input.readUTF());
}
- return ch;
+ return sb.toString();
}
}
diff --git a/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java b/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java
index e12e69b..659df76 100644
--- a/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java
@@ -21,6 +21,8 @@ import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
@@ -413,6 +415,18 @@ public class ForkParserTest extends TikaTest {
assertEquals("/embed1.xml", m1.get(RecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH));
}
+ @Test
+ public void testNoUTFDataFormatException() throws Exception {
+ ContentHandlerProxy proxy = new ContentHandlerProxy(0);
+ DataOutputStream output = new DataOutputStream(new ByteArrayOutputStream());
+ proxy.init(null, output);
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < 65536; i++) {
+ sb.append(1);
+ }
+ proxy.skippedEntity(sb.toString());
+ }
+
//use this to test that the wrapper handler is acted upon by the server but not proxied back
private static class ToFileHandler extends AbstractRecursiveParserWrapperHandler {