You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/08/05 13:54:32 UTC

[tika] branch main updated: TIKA-3827 -- override image mime if raw bitmap in RTF

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 99533c971 TIKA-3827 -- override image mime if raw bitmap in RTF
99533c971 is described below

commit 99533c971d5db7d7f3c501bc6cf67082a8d7f0cc
Author: tallison <ta...@apache.org>
AuthorDate: Fri Aug 5 09:54:13 2022 -0400

    TIKA-3827 -- override image mime if raw bitmap in RTF
---
 .../apache/tika/parser/microsoft/rtf/RTFEmbObjHandler.java    | 11 +++++++++++
 .../org/apache/tika/parser/microsoft/rtf/TextExtractor.java   |  2 ++
 2 files changed, 13 insertions(+)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFEmbObjHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFEmbObjHandler.java
index a927f5da6..096a6f66c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFEmbObjHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFEmbObjHandler.java
@@ -63,6 +63,8 @@ class RTFEmbObjHandler {
     private final EmbeddedDocumentUtil embeddedDocumentUtil;
     private final ByteArrayOutputStream os;
     private final int memoryLimitInKb;
+
+    private boolean isPictBitmap = false;
     //high hex cached for writing hexpair chars (data)
     private int hi = -1;
     private int thumbCount = 0;
@@ -127,6 +129,10 @@ class RTFEmbObjHandler {
         sb.append(c);
     }
 
+    protected void setPictBitmap(boolean isPictBitmap) {
+        this.isPictBitmap = isPictBitmap;
+    }
+
     protected void writeHexChar(int b) throws IOException, TikaException {
         //if not hexchar, ignore
         //white space is common
@@ -189,6 +195,10 @@ class RTFEmbObjHandler {
                 metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, filePath);
             }
             metadata.set(RTFMetadata.THUMBNAIL, Boolean.toString(inObject));
+            if (isPictBitmap) {
+                metadata.set(
+                        TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, "image/x-rtf-raw-bitmap");
+            }
             extractObj(bytes, handler, metadata);
 
         } else if (state == EMB_STATE.NADA) {
@@ -243,6 +253,7 @@ class RTFEmbObjHandler {
         sv = EMPTY_STRING;
         sn = EMPTY_STRING;
         sb.setLength(0);
+        isPictBitmap = false;
     }
 
     private enum EMB_STATE {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
index 0bec3c5c8..9388b8461 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
@@ -999,6 +999,8 @@ final class TextExtractor {
                 groupState.list = param;
             } else if (equals("lslvl")) {
                 groupState.listLevel = param;
+            } else if (equals("wbitmap")) {
+                embObjHandler.setPictBitmap(true);
             }
         }