You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by lf...@apache.org on 2020/11/26 20:22:42 UTC

[tika] branch main updated: TIKA-3237: great optimization in ForkParser

This is an automated email from the ASF dual-hosted git repository.

lfcnassif pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new d8b0d1f  TIKA-3237: great optimization in ForkParser
d8b0d1f is described below

commit d8b0d1fe33942ef49dd34aa58c14b73e27daeadd
Author: Luis Nassif <lf...@gmail.com>
AuthorDate: Thu Nov 26 16:37:28 2020 -0300

    TIKA-3237: great optimization in ForkParser
---
 CHANGES.txt                                        |  6 +++++-
 .../org/apache/tika/fork/ContentHandlerProxy.java  | 23 +++++++++++++++++-----
 .../apache/tika/fork/ContentHandlerResource.java   | 16 +++++++++------
 .../java/org/apache/tika/fork/ForkParserTest.java  | 14 +++++++++++++
 4 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 4f89a05..6591f41 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -12,8 +12,12 @@ Release 2.0.0 - ???
    * General code cleanup (PeterAlfredLee)
 
    Other changes
+   
+Release 1.26 - ???
+
+   * Great optimization in ForkParser (TIKA-3237).
 
-Release 1.25 - 11/23/2020
+Release 1.25 - 11/25/2020
 
    * Fix inconsistent license in xmpcore (TIKA-3204).
 
diff --git a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java
index 32f7e83..e7d8572 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java
@@ -67,7 +67,7 @@ class ContentHandlerProxy implements ContentHandler, ForkProxy {
         try {
             if (string != null) {
                 output.writeBoolean(true);
-                output.writeUTF(string);
+                writeString(string);
             } else {
                 output.writeBoolean(false);
             }
@@ -75,14 +75,27 @@ class ContentHandlerProxy implements ContentHandler, ForkProxy {
             throw new SAXException("Unexpected fork proxy problem", e);
         }
     }
+    
+    /**
+     * Breaks the string into chunks of at most 21,845 chars (65535 / 3) so that writeUTF does not throw UTFDataFormatException, at least in Oracle JDK 8.
+     */
+    private void writeString(String string) throws IOException {
+        int max = 65535 / 3;
+        int frags = (int) Math.ceil((double) string.length() / max);
+        output.writeInt(frags);
+        int i = 0;
+        while (i < frags) {
+            int end = (i < frags - 1) ? (i + 1) * max : string.length();
+            output.writeUTF(string.substring(i * max, end));
+            i++;
+        }
+    }
 
     private void sendCharacters(char[] ch, int start, int length)
             throws SAXException {
         try {
-            output.writeInt(length);
-            for (int i = 0; i < length; i++) {
-                output.writeChar(ch[start + i]);
-            }
+            writeString(new String(ch, start, length));
+            
         } catch (IOException e) {
             throw new SAXException("Unexpected fork proxy problem", e);
         }
diff --git a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java
index d6ef5b4..a43fbea 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java
@@ -89,19 +89,23 @@ class ContentHandlerResource implements ForkResource {
 
     private String readString(DataInputStream input) throws IOException {
         if (input.readBoolean()) {
-            return input.readUTF();
+            return readStringUTF(input);
         } else {
             return null;
         }
     }
 
     private char[] readCharacters(DataInputStream input) throws IOException {
-        int n = input.readInt();
-        char[] ch = new char[n];
-        for (int i = 0; i < n; i++) {
-            ch[i] = input.readChar();
+        return readStringUTF(input).toCharArray();
+    }
+    
+    private String readStringUTF(DataInputStream input) throws IOException {
+        int frags = input.readInt();
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < frags; i++) {
+            sb.append(input.readUTF());
         }
-        return ch;
+        return sb.toString();
     }
 
 }
diff --git a/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java b/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java
index e12e69b..659df76 100644
--- a/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java
@@ -21,6 +21,8 @@ import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
 import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataOutputStream;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
@@ -413,6 +415,18 @@ public class ForkParserTest extends TikaTest {
         assertEquals("/embed1.xml", m1.get(RecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH));
     }
 
+    @Test
+    public void testNoUTFDataFormatException() throws Exception {
+        ContentHandlerProxy proxy = new ContentHandlerProxy(0);
+        DataOutputStream output = new DataOutputStream(new ByteArrayOutputStream());
+        proxy.init(null, output);
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < 65536; i++) {
+            sb.append(1);
+        }
+        proxy.skippedEntity(sb.toString());
+    }
+
 
     //use this to test that the wrapper handler is acted upon by the server but not proxied back
     private static class ToFileHandler extends AbstractRecursiveParserWrapperHandler {