You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/03/15 21:07:38 UTC

[tika] branch branch_1x updated: TIKA-3320 -- allow case-insensitive header request matching for pdf/ocr config in tika-server

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new f5a2aed  TIKA-3320 -- allow case-insensitive header request matching for pdf/ocr config in tika-server
     new 7ff6a49  Merge remote-tracking branch 'origin/branch_1x' into branch_1x
f5a2aed is described below

commit f5a2aed8ecd209353db831f2013941aed1d5447e
Author: tallison <ta...@apache.org>
AuthorDate: Mon Mar 15 17:07:10 2021 -0400

    TIKA-3320 -- allow case-insensitive header request matching for pdf/ocr config in tika-server
---
 CHANGES.txt                                        |  3 ++
 .../apache/tika/server/resource/TikaResource.java  | 21 ++++++++----
 .../org/apache/tika/server/TikaResourceTest.java   | 39 ++++++++++++++++++++++
 3 files changed, 57 insertions(+), 6 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 57ca53c..c6718f1 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 1.26 - 03/09/2021
 
+   * Allow for case insensitive headers for configuration of the PDFParser
+     and the TesseractOCRParser in tika-server via Subhajit Das (TIKA-3320).
+
    * Improve detection and parsing of XPS files (TIKA-3316).
 
    * General dependency upgrades (TIKA-3244).
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
index 973fa82..47763d1 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
@@ -38,6 +38,7 @@ import org.apache.tika.parser.ParserDecorator;
 import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.parser.html.BoilerpipeContentHandler;
 import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.parser.ocr.TesseractOCRParser;
 import org.apache.tika.parser.pdf.PDFParserConfig;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.ExpandedTitleContentHandler;
@@ -156,10 +157,10 @@ public class TikaResource {
         PDFParserConfig pdfParserConfig = null;
         DocumentSelector documentSelector = null;
         for (String key : httpHeaders.keySet()) {
-            if (StringUtils.startsWith(key, X_TIKA_OCR_HEADER_PREFIX)) {
+            if (StringUtils.startsWithIgnoreCase(key, X_TIKA_OCR_HEADER_PREFIX)) {
                 ocrConfig = (ocrConfig == null) ? new TesseractOCRConfig() : ocrConfig;
                 processHeaderConfig(httpHeaders, ocrConfig, key, X_TIKA_OCR_HEADER_PREFIX);
-            } else if (StringUtils.startsWith(key, X_TIKA_PDF_HEADER_PREFIX)) {
+            } else if (StringUtils.startsWithIgnoreCase(key, X_TIKA_PDF_HEADER_PREFIX)) {
                 pdfParserConfig = (pdfParserConfig == null) ? new PDFParserConfig() : pdfParserConfig;
                 processHeaderConfig(httpHeaders, pdfParserConfig, key, X_TIKA_PDF_HEADER_PREFIX);
             } else if (StringUtils.endsWithIgnoreCase(key, X_TIKA_SKIP_EMBEDDED_HEADER)) {
@@ -202,15 +203,23 @@ public class TikaResource {
      */
     private static void processHeaderConfig(MultivaluedMap<String, String> httpHeaders, Object object, String key, String prefix) {
 
-        try {String property = StringUtils.removeStart(key, prefix);
+        try {
+            String property = StringUtils.removeStartIgnoreCase(key, prefix);
             Field field = null;
             try {
                 field = object.getClass().getDeclaredField(StringUtils.uncapitalize(property));
             } catch (NoSuchFieldException e) {
-                //swallow
+                // try to match field case-insensitive way
+                for(Field aField : object.getClass().getDeclaredFields()) {
+                    if (aField.getName().equalsIgnoreCase(property)) {
+                        field = aField;
+                        break;
+                    }
+                }
             }
-            String setter = property;
-            setter = "set"+setter.substring(0,1).toUpperCase(Locale.US)+setter.substring(1);
+
+            String setter = field != null ? field.getName() : property;
+            setter = "set" + setter.substring(0, 1).toUpperCase(Locale.US) + setter.substring(1);
             //default assume string class
             //if there's a more specific type, e.g. double, int, boolean
             //try that.
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
index 81e3ed5..7a3816c 100644
--- a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
@@ -25,6 +25,7 @@ import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
 import org.apache.cxf.jaxrs.client.WebClient;
 import org.apache.cxf.jaxrs.ext.multipart.Attachment;
 import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.ocr.TesseractOCRConfig;
 import org.apache.tika.parser.ocr.TesseractOCRParser;
 import org.apache.tika.server.resource.TikaResource;
@@ -40,6 +41,7 @@ import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Locale;
 import java.util.Set;
 
 import static org.apache.cxf.helpers.HttpHeaderHelper.CONTENT_ENCODING;
@@ -524,4 +526,41 @@ public class TikaResourceTest extends CXFTestBase {
         assertNotFound("embed4.txt", responseMsg);
     }
 
+    // TIKA-3320
+    @Test
+    public void testPDFLowerCaseOCRConfig() throws Exception {
+        if (! new TesseractOCRParser().hasTesseract(new TesseractOCRConfig())) {
+            return;
+        }
+
+        Response response = WebClient.create(endPoint + TIKA_PATH)
+                .type("application/pdf")
+                .accept("text/plain")
+                .header(TikaResource.X_TIKA_PDF_HEADER_PREFIX.toLowerCase(Locale.ROOT)+"ocrstrategy", "no_ocr")
+                .put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
+        String responseMsg = getStringFromInputStream((InputStream) response
+                .getEntity());
+
+        assertTrue(responseMsg.trim().equals(""));
+
+
+
+        response = WebClient.create(endPoint + TIKA_PATH)
+                .type("application/pdf")
+                .accept("text/plain")
+                .header(TikaResource.X_TIKA_PDF_HEADER_PREFIX.toLowerCase(Locale.ROOT)+"ocrstrategy", "ocr_only")
+                .put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
+        responseMsg = getStringFromInputStream((InputStream) response
+                .getEntity());
+        assertContains("Happy New Year 2003!", responseMsg);
+
+        //now try a bad value
+        response = WebClient.create(endPoint + TIKA_PATH)
+                .type("application/pdf")
+                .accept("text/plain")
+                .header(TikaResource.X_TIKA_PDF_HEADER_PREFIX.toLowerCase(Locale.ROOT) + "ocrstrategy", "non-sense-value")
+                .put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
+        assertEquals(400, response.getStatus());
+    }
+
 }