You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/08/05 13:54:32 UTC
[tika] branch main updated: TIKA-3827 -- override image mime if raw bitmap in RTF
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 99533c971 TIKA-3827 -- override image mime if raw bitmap in RTF
99533c971 is described below
commit 99533c971d5db7d7f3c501bc6cf67082a8d7f0cc
Author: tallison <ta...@apache.org>
AuthorDate: Fri Aug 5 09:54:13 2022 -0400
TIKA-3827 -- override image mime if raw bitmap in RTF
---
.../apache/tika/parser/microsoft/rtf/RTFEmbObjHandler.java | 11 +++++++++++
.../org/apache/tika/parser/microsoft/rtf/TextExtractor.java | 2 ++
2 files changed, 13 insertions(+)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFEmbObjHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFEmbObjHandler.java
index a927f5da6..096a6f66c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFEmbObjHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFEmbObjHandler.java
@@ -63,6 +63,8 @@ class RTFEmbObjHandler {
private final EmbeddedDocumentUtil embeddedDocumentUtil;
private final ByteArrayOutputStream os;
private final int memoryLimitInKb;
+
+ private boolean isPictBitmap = false;
//high hex cached for writing hexpair chars (data)
private int hi = -1;
private int thumbCount = 0;
@@ -127,6 +129,10 @@ class RTFEmbObjHandler {
sb.append(c);
}
+ protected void setPictBitmap(boolean isPictBitmap) {
+ this.isPictBitmap = isPictBitmap;
+ }
+
protected void writeHexChar(int b) throws IOException, TikaException {
//if not hexchar, ignore
//white space is common
@@ -189,6 +195,10 @@ class RTFEmbObjHandler {
metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, filePath);
}
metadata.set(RTFMetadata.THUMBNAIL, Boolean.toString(inObject));
+ if (isPictBitmap) {
+ metadata.set(
+ TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, "image/x-rtf-raw-bitmap");
+ }
extractObj(bytes, handler, metadata);
} else if (state == EMB_STATE.NADA) {
@@ -243,6 +253,7 @@ class RTFEmbObjHandler {
sv = EMPTY_STRING;
sn = EMPTY_STRING;
sb.setLength(0);
+ isPictBitmap = false;
}
private enum EMB_STATE {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
index 0bec3c5c8..9388b8461 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
@@ -999,6 +999,8 @@ final class TextExtractor {
groupState.list = param;
} else if (equals("lslvl")) {
groupState.listLevel = param;
+ } else if (equals("wbitmap")) {
+ embObjHandler.setPictBitmap(true);
}
}