You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/04/08 15:34:38 UTC
[tika] 01/14: use byte buffers when reading the legacy OneNote 2007
files (#314)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 18b6645f7af34aca883ee53a59673b7ae162b005
Author: Nicholas DiPiazza <ni...@lucidworks.com>
AuthorDate: Mon Mar 30 08:58:11 2020 -0500
use byte buffers when reading the legacy OneNote 2007 files (#314)
---
.../onenote/OneNoteLegacyDumpStrings.java | 87 +++++++++++++---------
1 file changed, 53 insertions(+), 34 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteLegacyDumpStrings.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteLegacyDumpStrings.java
index 27b011e..bdcff02 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteLegacyDumpStrings.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteLegacyDumpStrings.java
@@ -22,6 +22,7 @@ import org.xml.sax.SAXException;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
+import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
/**
@@ -38,7 +39,8 @@ class OneNoteLegacyDumpStrings {
public static int MIN_STRING_LENGTH = 8;
// TODO - parameterize this
public static float ACCEPTABLE_ALPHA_TO_OTHER_CHAR_RATIO = 0.6f;
-
+ // TODO - make this read-buffer size (in bytes) configurable instead of hard-coded
+ public static long BUFFER_SIZE = 1000000L;
OneNoteDirectFileResource oneNoteDirectFileResource;
XHTMLContentHandler xhtml;
@@ -63,62 +65,79 @@ class OneNoteLegacyDumpStrings {
private void dumpAscii() throws SAXException, TikaException {
try {
oneNoteDirectFileResource.position(0);
-
ByteArrayOutputStream os = new ByteArrayOutputStream();
-
- for (int b = oneNoteDirectFileResource.read(); b != -1; b = oneNoteDirectFileResource.read()) {
- if (b >= 0x20 && b < 0x7F) {
- os.write(b);
- } else {
- if (os.size() >= MIN_STRING_LENGTH) {
- writeIfUseful(os);
+ long sz = oneNoteDirectFileResource.size();
+ long pos;
+ while ((pos = oneNoteDirectFileResource.position()) != sz) {
+ long nextBufferSize = BUFFER_SIZE;
+ if (sz - pos < BUFFER_SIZE) {
+ nextBufferSize = sz - pos;
+ }
+ ByteBuffer byteBuffer = ByteBuffer.allocate((int)nextBufferSize);
+ oneNoteDirectFileResource.read(byteBuffer);
+ for (long i = 0; i < nextBufferSize - 1; ++i) {
+ int b = byteBuffer.get((int) i);
+ if (b >= 0x20 && b < 0x7F) {
+ os.write(b);
+ } else {
+ if (os.size() >= MIN_STRING_LENGTH) {
+ writeIfUseful(os);
+ }
+ os.reset();
}
- os.reset();
}
- }
- if (os.size() >= MIN_STRING_LENGTH) {
- writeIfUseful(os);
+ if (os.size() >= MIN_STRING_LENGTH) {
+ writeIfUseful(os);
+ }
}
} catch (IOException e) {
throw new TikaException("Could not extract text from legacy OneNote document", e);
}
}
-
/**
* Based on GNU "strings" implementation. Pulls out UTF16 LE text segments and writes them to the XHTMLContentHandler.
*/
private void dumpUtf16LE() throws SAXException, TikaException {
try {
oneNoteDirectFileResource.position(0);
-
ByteArrayOutputStream os = new ByteArrayOutputStream();
-
long sz = oneNoteDirectFileResource.size();
+ long bufSize = BUFFER_SIZE;
+ // Make sure the buffer size is a multiple of 2, since the scan below consumes bytes in pairs.
+ if (bufSize % 2 == 1) {
+ bufSize += 1L;
+ }
- for (long i = 0; i < sz - 1; ++i) {
- oneNoteDirectFileResource.position(i);
-
- int c1 = oneNoteDirectFileResource.read();
- int c2 = oneNoteDirectFileResource.read();
-
- if (c1 == 0x00 && c2 >= 0x20 && c2 < 0x7F) {
- ++i;
- os.write(c2);
- } else {
- if (os.size() >= MIN_STRING_LENGTH) {
- writeIfUseful(os);
+ long pos;
+ while ((pos = oneNoteDirectFileResource.position()) != sz) {
+ long nextBufferSize = bufSize;
+ if (sz - pos < bufSize) {
+ nextBufferSize = sz - pos;
+ }
+ ByteBuffer byteBuffer = ByteBuffer.allocate((int)nextBufferSize);
+ oneNoteDirectFileResource.read(byteBuffer);
+
+ for (long i = 0; i < nextBufferSize - 1; ++i) {
+ int c1 = byteBuffer.get((int)i);
+ int c2 = byteBuffer.get((int)i+1);
+ if (c1 == 0x00 && c2 >= 0x20 && c2 < 0x7F) {
+ ++i;
+ os.write(c2);
+ } else {
+ if (os.size() >= MIN_STRING_LENGTH) {
+ writeIfUseful(os);
+ }
+ os.reset();
}
- os.reset();
}
- }
- if (os.size() >= MIN_STRING_LENGTH) {
- writeIfUseful(os);
+ if (os.size() >= MIN_STRING_LENGTH) {
+ writeIfUseful(os);
+ }
}
} catch (IOException e) {
throw new TikaException("Could not extract text from legacy OneNote document", e);
}
}
-
/**
* Writes a buffer of output characters if the (num alpha chars in the buffer) / (number of chars in the buffer) >
* ACCEPTABLE_ALPHA_TO_OTHER_CHAR_RATIO.
@@ -141,4 +160,4 @@ class OneNoteLegacyDumpStrings {
}
}
}
-}
+}
\ No newline at end of file