You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2020/11/12 03:09:58 UTC

[tika] branch main updated: TIKA-3227: Added ability to skip parsing of embedded files in Tika Server through X-Tika-Skip-Embedded HTTP Header

This is an automated email from the ASF dual-hosted git repository.

dmeikle pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 7712894  TIKA-3227: Added ability to skip parsing of embedded files in Tika Server through X-Tika-Skip-Embedded HTTP Header
7712894 is described below

commit 771289445e261210e48fba34f3e298febf19b1df
Author: David Meikle <dm...@apache.org>
AuthorDate: Thu Nov 12 02:42:01 2020 +0000

    TIKA-3227: Added ability to skip parsing of embedded files in Tika Server through X-Tika-Skip-Embedded HTTP Header
---
 .../org/apache/tika/server/resource/TikaResource.java | 11 +++++++++++
 .../java/org/apache/tika/server/TikaResourceTest.java | 19 +++++++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
index 0275b7e..015ffbd 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
@@ -26,6 +26,7 @@ import org.apache.tika.Tika;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.extractor.DocumentSelector;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
@@ -88,6 +89,7 @@ public class TikaResource {
     public static final String GREETING = "This is Tika Server (" + new Tika().toString() + "). Please PUT\n";
     public static final String X_TIKA_OCR_HEADER_PREFIX = "X-Tika-OCR";
     public static final String X_TIKA_PDF_HEADER_PREFIX = "X-Tika-PDF";
+    public static final String X_TIKA_SKIP_EMBEDDED_HEADER = "X-Tika-Skip-Embedded";
     public static final String PASSWORD = "Password";
     public static final String PASSWORD_BASE64_UTF8 = "Password_Base64_UTF-8";
 
@@ -150,6 +152,7 @@ public class TikaResource {
         //upon server startup will be ignored.
         TesseractOCRConfig ocrConfig = null;
         PDFParserConfig pdfParserConfig = null;
+        DocumentSelector documentSelector = null;
         for (String key : httpHeaders.keySet()) {
             if (StringUtils.startsWith(key, X_TIKA_OCR_HEADER_PREFIX)) {
                 ocrConfig = (ocrConfig == null) ? new TesseractOCRConfig() : ocrConfig;
@@ -157,6 +160,11 @@ public class TikaResource {
             } else if (StringUtils.startsWith(key, X_TIKA_PDF_HEADER_PREFIX)) {
                 pdfParserConfig = (pdfParserConfig == null) ? new PDFParserConfig() : pdfParserConfig;
                 processHeaderConfig(httpHeaders, pdfParserConfig, key, X_TIKA_PDF_HEADER_PREFIX);
+            } else if (StringUtils.endsWithIgnoreCase(key, X_TIKA_SKIP_EMBEDDED_HEADER)) {
+                String skipEmbedded = httpHeaders.getFirst(key);
+                if (Boolean.parseBoolean(skipEmbedded)) {
+                    documentSelector = metadata -> false;
+                }
             }
         }
         if (ocrConfig != null) {
@@ -168,6 +176,9 @@ public class TikaResource {
         if (embeddedParser != null) {
             parseContext.set(Parser.class, embeddedParser);
         }
+        if (documentSelector != null) {
+            parseContext.set(DocumentSelector.class, documentSelector);
+        }
     }
 
     public static InputStream getInputStream(InputStream is, Metadata metadata, HttpHeaders headers) {
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
index 6b6fa23..81e3ed5 100644
--- a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
@@ -505,4 +505,23 @@ public class TikaResourceTest extends CXFTestBase {
                 .getEntity());
         assertContains("Just some text.", responseMsg);
     }
+
+    // TIKA-3227: X-Tika-Skip-Embedded "false" keeps embedded file content in the output; "true" omits it
+    @Test
+    public void testSkipEmbedded() throws Exception {
+        Response response = WebClient.create(endPoint + TIKA_PATH)
+                .accept("text/plain")
+                .header(TikaResource.X_TIKA_SKIP_EMBEDDED_HEADER, "false")
+                .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+        String responseMsg = getStringFromInputStream((InputStream) response.getEntity());
+        assertContains("embed4.txt", responseMsg);
+
+        response = WebClient.create(endPoint + TIKA_PATH)
+                .accept("text/plain")
+                .header(TikaResource.X_TIKA_SKIP_EMBEDDED_HEADER, "true")
+                .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+        responseMsg = getStringFromInputStream((InputStream) response.getEntity());
+        assertNotFound("embed4.txt", responseMsg);
+    }
+
 }