You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/03/15 20:49:02 UTC
[tika] branch main updated: Added case-insensitivity to tika server ocr header names (#414)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 06769d3 Added case-insensitivity to tika server ocr header names (#414)
06769d3 is described below
commit 06769d336ff314d8243decd697a8e520c954afc6
Author: Subhajit Das <Su...@users.noreply.github.com>
AuthorDate: Tue Mar 16 02:18:52 2021 +0530
Added case-insensitivity to tika server ocr header names (#414)
---
.../server/classic/config/PDFServerConfig.java | 2 +-
.../classic/config/TesseractServerConfig.java | 2 +-
.../tika/server/classic/TikaResourceTest.java | 47 ++++++++++++++++++++++
.../tika/server/core/resource/TikaResource.java | 14 +++++--
4 files changed, 59 insertions(+), 6 deletions(-)
diff --git a/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/config/PDFServerConfig.java b/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/config/PDFServerConfig.java
index 9dcf61d..9058272 100644
--- a/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/config/PDFServerConfig.java
+++ b/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/config/PDFServerConfig.java
@@ -40,7 +40,7 @@ public class PDFServerConfig implements ParseContextConfig {
//upon server startup will be ignored.
PDFParserConfig pdfParserConfig = null;
for (String key : httpHeaders.keySet()) {
- if (StringUtils.startsWith(key, X_TIKA_PDF_HEADER_PREFIX)) {
+ if (StringUtils.startsWithIgnoreCase(key, X_TIKA_PDF_HEADER_PREFIX)) {
pdfParserConfig = (pdfParserConfig == null) ? new PDFParserConfig() : pdfParserConfig;
processHeaderConfig(httpHeaders, pdfParserConfig, key, X_TIKA_PDF_HEADER_PREFIX);
}
diff --git a/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/config/TesseractServerConfig.java b/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/config/TesseractServerConfig.java
index 3db0859..3041400 100644
--- a/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/config/TesseractServerConfig.java
+++ b/tika-server/tika-server-classic/src/main/java/org/apache/tika/server/classic/config/TesseractServerConfig.java
@@ -40,7 +40,7 @@ public class TesseractServerConfig implements ParseContextConfig {
TesseractOCRConfig ocrConfig = null;
DocumentSelector documentSelector = null;
for (String key : httpHeaders.keySet()) {
- if (StringUtils.startsWith(key, X_TIKA_OCR_HEADER_PREFIX)) {
+ if (StringUtils.startsWithIgnoreCase(key, X_TIKA_OCR_HEADER_PREFIX)) {
ocrConfig = (ocrConfig == null) ? new TesseractOCRConfig() : ocrConfig;
processHeaderConfig(httpHeaders, ocrConfig, key, X_TIKA_OCR_HEADER_PREFIX);
}
diff --git a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java
index d80798a..7104484 100644
--- a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java
+++ b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java
@@ -42,6 +42,7 @@ import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
+import java.util.Locale;
import static org.apache.cxf.helpers.HttpHeaderHelper.CONTENT_ENCODING;
import static org.junit.Assert.assertEquals;
@@ -351,6 +352,52 @@ public class TikaResourceTest extends CXFTestBase {
assertEquals(400, response.getStatus());
}
+ // TIKA-3320
+ @Test
+ public void testPDFLowerCaseOCRConfig() throws Exception {
+ if (! new TesseractOCRParser().hasTesseract()) {
+ return;
+ }
+
+ Response response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/pdf")
+ .accept("text/plain")
+ .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX.toLowerCase(Locale.ROOT)+"ocrstrategy", "no_ocr")
+ .put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
+ String responseMsg = getStringFromInputStream((InputStream) response
+ .getEntity());
+
+ assertTrue(responseMsg.trim().equals(""));
+
+ response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/pdf")
+ .accept("text/plain")
+ .header(TesseractServerConfig.X_TIKA_OCR_HEADER_PREFIX.toLowerCase(Locale.ROOT)+"skipocr", "true")
+ .put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
+ responseMsg = getStringFromInputStream((InputStream) response
+ .getEntity());
+
+ assertTrue(responseMsg.trim().equals(""));
+
+
+ response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/pdf")
+ .accept("text/plain")
+ .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX.toLowerCase(Locale.ROOT)+"ocrstrategy", "ocr_only")
+ .put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
+ responseMsg = getStringFromInputStream((InputStream) response
+ .getEntity());
+ assertContains("Happy New Year 2003!", responseMsg);
+
+ //now try a bad value
+ response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/pdf")
+ .accept("text/plain")
+ .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX.toLowerCase(Locale.ROOT) + "ocrstrategy", "non-sense-value")
+ .put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
+ assertEquals(400, response.getStatus());
+ }
+
//TIKA-2669
@Test
public void testPDFConfig() throws Exception {
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
index 26c6827..d1b25a6 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
@@ -165,15 +165,21 @@ public class TikaResource {
val = val.trim();
try {
- String property = StringUtils.removeStart(key, prefix);
+ String property = StringUtils.removeStartIgnoreCase(key, prefix);
Field field = null;
try {
field = object.getClass().getDeclaredField(StringUtils.uncapitalize(property));
} catch (NoSuchFieldException e) {
- //swallow
+ // try to match field case-insensitive way
+ for(Field aField : object.getClass().getDeclaredFields()) {
+ if (aField.getName().equalsIgnoreCase(property)) {
+ field = aField;
+ break;
+ }
+ }
}
- String setter = property;
- setter = "set"+setter.substring(0,1).toUpperCase(Locale.US)+setter.substring(1);
+ String setter = field != null ? field.getName() : property;
+ setter = "set" + setter.substring(0, 1).toUpperCase(Locale.US) + setter.substring(1);
//default assume string class
//if there's a more specific type, e.g. double, int, boolean
//try that.