You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/03/15 20:49:02 UTC

[tika] branch main updated: Added case-insensitivity to tika server ocr header names (#414)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 06769d3  Added case-insensitivity to tika server ocr header names (#414)
06769d3 is described below

commit 06769d336ff314d8243decd697a8e520c954afc6
Author: Subhajit Das <Su...@users.noreply.github.com>
AuthorDate: Tue Mar 16 02:18:52 2021 +0530

    Added case-insensitivity to tika server ocr header names (#414)
---
 .../server/classic/config/PDFServerConfig.java     |  2 +-
 .../classic/config/TesseractServerConfig.java      |  2 +-
 .../tika/server/classic/TikaResourceTest.java      | 47 ++++++++++++++++++++++
 .../tika/server/core/resource/TikaResource.java    | 14 +++++--
 4 files changed, 59 insertions(+), 6 deletions(-)

diff --git a/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/config/PDFServerConfig.java b/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/config/PDFServerConfig.java
index 9dcf61d..9058272 100644
--- a/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/config/PDFServerConfig.java
+++ b/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/config/PDFServerConfig.java
@@ -40,7 +40,7 @@ public class PDFServerConfig implements ParseContextConfig {
         //upon server startup will be ignored.
         PDFParserConfig pdfParserConfig = null;
         for (String key : httpHeaders.keySet()) {
-            if (StringUtils.startsWith(key, X_TIKA_PDF_HEADER_PREFIX)) {
+            if (StringUtils.startsWithIgnoreCase(key, X_TIKA_PDF_HEADER_PREFIX)) {
                 pdfParserConfig = (pdfParserConfig == null) ? new PDFParserConfig() : pdfParserConfig;
                 processHeaderConfig(httpHeaders, pdfParserConfig, key, X_TIKA_PDF_HEADER_PREFIX);
             }
diff --git a/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/config/TesseractServerConfig.java b/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/config/TesseractServerConfig.java
index 3db0859..3041400 100644
--- a/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/config/TesseractServerConfig.java
+++ b/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/config/TesseractServerConfig.java
@@ -40,7 +40,7 @@ public class TesseractServerConfig implements ParseContextConfig {
         TesseractOCRConfig ocrConfig = null;
         DocumentSelector documentSelector = null;
         for (String key : httpHeaders.keySet()) {
-            if (StringUtils.startsWith(key, X_TIKA_OCR_HEADER_PREFIX)) {
+            if (StringUtils.startsWithIgnoreCase(key, X_TIKA_OCR_HEADER_PREFIX)) {
                 ocrConfig = (ocrConfig == null) ? new TesseractOCRConfig() : ocrConfig;
                 processHeaderConfig(httpHeaders, ocrConfig, key, X_TIKA_OCR_HEADER_PREFIX);
             }
diff --git a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java
index d80798a..7104484 100644
--- a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java
+++ b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java
@@ -42,6 +42,7 @@ import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Locale;
 
 import static org.apache.cxf.helpers.HttpHeaderHelper.CONTENT_ENCODING;
 import static org.junit.Assert.assertEquals;
@@ -351,6 +352,52 @@ public class TikaResourceTest extends CXFTestBase {
         assertEquals(400, response.getStatus());
     }
 
+    // TIKA-3320
+    @Test
+    public void testPDFLowerCaseOCRConfig() throws Exception {
+        if (! new TesseractOCRParser().hasTesseract()) {
+            return;
+        }
+
+        Response response = WebClient.create(endPoint + TIKA_PATH)
+                .type("application/pdf")
+                .accept("text/plain")
+                .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX.toLowerCase(Locale.ROOT)+"ocrstrategy", "no_ocr")
+                .put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
+        String responseMsg = getStringFromInputStream((InputStream) response
+                .getEntity());
+
+        assertTrue(responseMsg.trim().equals(""));
+
+        response = WebClient.create(endPoint + TIKA_PATH)
+                .type("application/pdf")
+                .accept("text/plain")
+                .header(TesseractServerConfig.X_TIKA_OCR_HEADER_PREFIX.toLowerCase(Locale.ROOT)+"skipocr", "true")
+                .put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
+        responseMsg = getStringFromInputStream((InputStream) response
+                .getEntity());
+
+        assertTrue(responseMsg.trim().equals(""));
+
+
+        response = WebClient.create(endPoint + TIKA_PATH)
+                .type("application/pdf")
+                .accept("text/plain")
+                .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX.toLowerCase(Locale.ROOT)+"ocrstrategy", "ocr_only")
+                .put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
+        responseMsg = getStringFromInputStream((InputStream) response
+                .getEntity());
+        assertContains("Happy New Year 2003!", responseMsg);
+
+        //now try a bad value
+        response = WebClient.create(endPoint + TIKA_PATH)
+                .type("application/pdf")
+                .accept("text/plain")
+                .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX.toLowerCase(Locale.ROOT) + "ocrstrategy", "non-sense-value")
+                .put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
+        assertEquals(400, response.getStatus());
+    }
+
     //TIKA-2669
     @Test
     public void testPDFConfig() throws Exception {
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
index 26c6827..d1b25a6 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
@@ -165,15 +165,21 @@ public class TikaResource {
         val = val.trim();
 
         try {
-            String property = StringUtils.removeStart(key, prefix);
+            String property = StringUtils.removeStartIgnoreCase(key, prefix);
             Field field = null;
             try {
                 field = object.getClass().getDeclaredField(StringUtils.uncapitalize(property));
             } catch (NoSuchFieldException e) {
-                //swallow
+                // fall back to matching the field name case-insensitively
+                for(Field aField : object.getClass().getDeclaredFields()) {
+                    if (aField.getName().equalsIgnoreCase(property)) {
+                        field = aField;
+                        break;
+                    }
+                }
             }
-            String setter = property;
-            setter = "set"+setter.substring(0,1).toUpperCase(Locale.US)+setter.substring(1);
+            String setter = field != null ? field.getName() : property;
+            setter = "set" + setter.substring(0, 1).toUpperCase(Locale.US) + setter.substring(1);
             //default assume string class
             //if there's a more specific type, e.g. double, int, boolean
             //try that.