You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/04/08 15:34:38 UTC

[tika] 01/14: use byte buffers when reading the legacy OneNote 2007 files (#314)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 18b6645f7af34aca883ee53a59673b7ae162b005
Author: Nicholas DiPiazza <ni...@lucidworks.com>
AuthorDate: Mon Mar 30 08:58:11 2020 -0500

    use byte buffers when reading the legacy OneNote 2007 files (#314)
---
 .../onenote/OneNoteLegacyDumpStrings.java          | 87 +++++++++++++---------
 1 file changed, 53 insertions(+), 34 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteLegacyDumpStrings.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteLegacyDumpStrings.java
index 27b011e..bdcff02 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteLegacyDumpStrings.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteLegacyDumpStrings.java
@@ -22,6 +22,7 @@ import org.xml.sax.SAXException;
 
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
+import java.nio.ByteBuffer;
 import java.nio.charset.StandardCharsets;
 
 /**
@@ -38,7 +39,8 @@ class OneNoteLegacyDumpStrings {
     public static int MIN_STRING_LENGTH = 8;
     // TODO - parameterize this
     public static float ACCEPTABLE_ALPHA_TO_OTHER_CHAR_RATIO = 0.6f;
-
+    // TODO - parameterize this
+    public static long BUFFER_SIZE = 1000000L;
     OneNoteDirectFileResource oneNoteDirectFileResource;
     XHTMLContentHandler xhtml;
 
@@ -63,62 +65,79 @@ class OneNoteLegacyDumpStrings {
     private void dumpAscii() throws SAXException, TikaException {
         try {
             oneNoteDirectFileResource.position(0);
-
             ByteArrayOutputStream os = new ByteArrayOutputStream();
-
-            for (int b = oneNoteDirectFileResource.read(); b != -1; b = oneNoteDirectFileResource.read()) {
-                if (b >= 0x20 && b < 0x7F) {
-                    os.write(b);
-                } else {
-                    if (os.size() >= MIN_STRING_LENGTH) {
-                        writeIfUseful(os);
+            long sz = oneNoteDirectFileResource.size();
+            long pos;
+            while ((pos = oneNoteDirectFileResource.position()) != sz) {
+                long nextBufferSize = BUFFER_SIZE;
+                if (sz - pos < BUFFER_SIZE) {
+                    nextBufferSize = sz - pos;
+                }
+                ByteBuffer byteBuffer = ByteBuffer.allocate((int)nextBufferSize);
+                oneNoteDirectFileResource.read(byteBuffer);
+                for (long i = 0; i < nextBufferSize - 1; ++i) {
+                    int b = byteBuffer.get((int) i);
+                    if (b >= 0x20 && b < 0x7F) {
+                        os.write(b);
+                    } else {
+                        if (os.size() >= MIN_STRING_LENGTH) {
+                            writeIfUseful(os);
+                        }
+                        os.reset();
                     }
-                    os.reset();
                 }
-            }
-            if (os.size() >= MIN_STRING_LENGTH) {
-                writeIfUseful(os);
+                if (os.size() >= MIN_STRING_LENGTH) {
+                    writeIfUseful(os);
+                }
             }
         } catch (IOException e) {
             throw new TikaException("Could not extract text from legacy OneNote document", e);
         }
     }
-
     /**
      * Based on GNU "strings" implementation. Pulls out UTF16 LE text segments and writes them to the XHTMLContentHandler.
      */
     private void dumpUtf16LE() throws SAXException, TikaException {
         try {
             oneNoteDirectFileResource.position(0);
-
             ByteArrayOutputStream os = new ByteArrayOutputStream();
-
             long sz = oneNoteDirectFileResource.size();
+            long bufSize = BUFFER_SIZE;
+            // Make sure the buffer size is a multiple of 2.
+            if (bufSize % 2 == 1) {
+                bufSize += 1L;
+            }
 
-            for (long i = 0; i < sz - 1; ++i) {
-                oneNoteDirectFileResource.position(i);
-
-                int c1 = oneNoteDirectFileResource.read();
-                int c2 = oneNoteDirectFileResource.read();
-
-                if (c1 == 0x00 && c2 >= 0x20 && c2 < 0x7F) {
-                    ++i;
-                    os.write(c2);
-                } else {
-                    if (os.size() >= MIN_STRING_LENGTH) {
-                        writeIfUseful(os);
+            long pos;
+            while ((pos = oneNoteDirectFileResource.position()) != sz) {
+                long nextBufferSize = bufSize;
+                if (sz - pos < bufSize) {
+                    nextBufferSize = sz - pos;
+                }
+                ByteBuffer byteBuffer = ByteBuffer.allocate((int)nextBufferSize);
+                oneNoteDirectFileResource.read(byteBuffer);
+
+                for (long i = 0; i < nextBufferSize - 1; ++i) {
+                    int c1 = byteBuffer.get((int)i);
+                    int c2 = byteBuffer.get((int)i+1);
+                    if (c1 == 0x00 && c2 >= 0x20 && c2 < 0x7F) {
+                        ++i;
+                        os.write(c2);
+                    } else {
+                        if (os.size() >= MIN_STRING_LENGTH) {
+                            writeIfUseful(os);
+                        }
+                        os.reset();
                     }
-                    os.reset();
                 }
-            }
-            if (os.size() >= MIN_STRING_LENGTH) {
-                writeIfUseful(os);
+                if (os.size() >= MIN_STRING_LENGTH) {
+                    writeIfUseful(os);
+                }
             }
         } catch (IOException e) {
             throw new TikaException("Could not extract text from legacy OneNote document", e);
         }
     }
-
     /**
      * Writes a buffer of output characters if the (num alpha chars in the buffer) / (number of chars in the buffer) >
      * ACCEPTABLE_ALPHA_TO_OTHER_CHAR_RATIO.
@@ -141,4 +160,4 @@ class OneNoteLegacyDumpStrings {
             }
         }
     }
-}
+}
\ No newline at end of file