You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by lf...@apache.org on 2020/11/26 19:38:05 UTC

[tika] branch branch_1x updated: TIKA-3237: great optimization in ForkParser

This is an automated email from the ASF dual-hosted git repository.

lfcnassif pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new 0207bd0  TIKA-3237: great optimization in ForkParser
0207bd0 is described below

commit 0207bd0053b6824fda168d8a0a66282b7199951f
Author: Luis Nassif <lf...@gmail.com>
AuthorDate: Thu Nov 26 16:37:28 2020 -0300

    TIKA-3237: great optimization in ForkParser
---
 CHANGES.txt                                        |  4 ++++
 .../org/apache/tika/fork/ContentHandlerProxy.java  | 23 +++++++++++++++++-----
 .../apache/tika/fork/ContentHandlerResource.java   | 16 +++++++++------
 .../java/org/apache/tika/fork/ForkParserTest.java  | 18 +++++++++++++++--
 4 files changed, 48 insertions(+), 13 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index ff2a4ef..c620efc 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,7 @@
+Release 1.26 - xx/xx/xxxx
+
+   * Great optimization in ForkParser (TIKA-3237).
+
 Release 1.25 - 11/25/2020
 
    * Fix inconsistent license in xmpcore (TIKA-3204).
diff --git a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java
index 32f7e83..e7d8572 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java
@@ -67,7 +67,7 @@ class ContentHandlerProxy implements ContentHandler, ForkProxy {
         try {
             if (string != null) {
                 output.writeBoolean(true);
-                output.writeUTF(string);
+                writeString(string);
             } else {
                 output.writeBoolean(false);
             }
@@ -75,14 +75,27 @@ class ContentHandlerProxy implements ContentHandler, ForkProxy {
             throw new SAXException("Unexpected fork proxy problem", e);
         }
     }
+    
+    /**
+     * Splits the string into chunks of at most 21,845 chars (65,535 / 3) so that writeUTF does not throw UTFDataFormatException, as seen at least in Oracle JDK 8.
+     */
+    private void writeString(String string) throws IOException {
+        int max = 65535 / 3;
+        int frags = (int) Math.ceil((double) string.length() / max);
+        output.writeInt(frags);
+        int i = 0;
+        while (i < frags) {
+            int end = (i < frags - 1) ? (i + 1) * max : string.length();
+            output.writeUTF(string.substring(i * max, end));
+            i++;
+        }
+    }
 
     private void sendCharacters(char[] ch, int start, int length)
             throws SAXException {
         try {
-            output.writeInt(length);
-            for (int i = 0; i < length; i++) {
-                output.writeChar(ch[start + i]);
-            }
+            writeString(new String(ch, start, length));
+            
         } catch (IOException e) {
             throw new SAXException("Unexpected fork proxy problem", e);
         }
diff --git a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java
index d6ef5b4..a43fbea 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java
@@ -89,19 +89,23 @@ class ContentHandlerResource implements ForkResource {
 
     private String readString(DataInputStream input) throws IOException {
         if (input.readBoolean()) {
-            return input.readUTF();
+            return readStringUTF(input);
         } else {
             return null;
         }
     }
 
     private char[] readCharacters(DataInputStream input) throws IOException {
-        int n = input.readInt();
-        char[] ch = new char[n];
-        for (int i = 0; i < n; i++) {
-            ch[i] = input.readChar();
+        return readStringUTF(input).toCharArray();
+    }
+    
+    private String readStringUTF(DataInputStream input) throws IOException {
+        int frags = input.readInt();
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < frags; i++) {
+            sb.append(input.readUTF());
         }
-        return ch;
+        return sb.toString();
     }
 
 }
diff --git a/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java b/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java
index b5ea825..d450825 100644
--- a/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java
@@ -21,6 +21,8 @@ import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
 import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataOutputStream;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
@@ -41,8 +43,8 @@ import java.util.concurrent.Semaphore;
 
 import org.apache.tika.TikaTest;
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.DublinCore;
 import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.DublinCore;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.AutoDetectParser;
@@ -55,7 +57,6 @@ import org.apache.tika.sax.BasicContentHandlerFactory;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.ContentHandlerFactory;
 import org.apache.tika.sax.RecursiveParserWrapperHandler;
-import org.junit.Ignore;
 import org.junit.Test;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -81,6 +82,8 @@ public class ForkParserTest extends TikaTest {
         }
     }
 
+
+
     @Test
     public void testSerialParsing() throws Exception {
         ForkParser parser = new ForkParser(
@@ -445,6 +448,17 @@ public class ForkParserTest extends TikaTest {
         assertEquals("/embed1.xml", m1.get(RecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH));
     }
 
+    @Test
+    public void testNoUTFDataFormatException() throws Exception {
+        ContentHandlerProxy proxy = new ContentHandlerProxy(0);
+        DataOutputStream output = new DataOutputStream(new ByteArrayOutputStream());
+        proxy.init(null, output);
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < 65536; i++) {
+            sb.append(1);
+        }
+        proxy.skippedEntity(sb.toString());
+    }
 
     //use this to test that the wrapper handler is acted upon by the server but not proxied back
     private static class ToFileHandler extends AbstractRecursiveParserWrapperHandler {