You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/03/15 21:07:38 UTC
[tika] branch branch_1x updated: TIKA-3320 -- allow case-insensitive header request matching for pdf/ocr config in tika-server
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new f5a2aed TIKA-3320 -- allow case-insensitive header request matching for pdf/ocr config in tika-server
new 7ff6a49 Merge remote-tracking branch 'origin/branch_1x' into branch_1x
f5a2aed is described below
commit f5a2aed8ecd209353db831f2013941aed1d5447e
Author: tallison <ta...@apache.org>
AuthorDate: Mon Mar 15 17:07:10 2021 -0400
TIKA-3320 -- allow case-insensitive header request matching for pdf/ocr config in tika-server
---
CHANGES.txt | 3 ++
.../apache/tika/server/resource/TikaResource.java | 21 ++++++++----
.../org/apache/tika/server/TikaResourceTest.java | 39 ++++++++++++++++++++++
3 files changed, 57 insertions(+), 6 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 57ca53c..c6718f1 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 1.26 - 03/09/2021
+ * Allow for case insensitive headers for configuration of the PDFParser
+ and the TesseractOCRParser in tika-server via Subhajit Das (TIKA-3320).
+
* Improve detection and parsing of XPS files (TIKA-3316).
* General dependency upgrades (TIKA-3244).
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
index 973fa82..47763d1 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
@@ -38,6 +38,7 @@ import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ExpandedTitleContentHandler;
@@ -156,10 +157,10 @@ public class TikaResource {
PDFParserConfig pdfParserConfig = null;
DocumentSelector documentSelector = null;
for (String key : httpHeaders.keySet()) {
- if (StringUtils.startsWith(key, X_TIKA_OCR_HEADER_PREFIX)) {
+ if (StringUtils.startsWithIgnoreCase(key, X_TIKA_OCR_HEADER_PREFIX)) {
ocrConfig = (ocrConfig == null) ? new TesseractOCRConfig() : ocrConfig;
processHeaderConfig(httpHeaders, ocrConfig, key, X_TIKA_OCR_HEADER_PREFIX);
- } else if (StringUtils.startsWith(key, X_TIKA_PDF_HEADER_PREFIX)) {
+ } else if (StringUtils.startsWithIgnoreCase(key, X_TIKA_PDF_HEADER_PREFIX)) {
pdfParserConfig = (pdfParserConfig == null) ? new PDFParserConfig() : pdfParserConfig;
processHeaderConfig(httpHeaders, pdfParserConfig, key, X_TIKA_PDF_HEADER_PREFIX);
} else if (StringUtils.endsWithIgnoreCase(key, X_TIKA_SKIP_EMBEDDED_HEADER)) {
@@ -202,15 +203,23 @@ public class TikaResource {
*/
private static void processHeaderConfig(MultivaluedMap<String, String> httpHeaders, Object object, String key, String prefix) {
- try {String property = StringUtils.removeStart(key, prefix);
+ try {
+ String property = StringUtils.removeStartIgnoreCase(key, prefix);
Field field = null;
try {
field = object.getClass().getDeclaredField(StringUtils.uncapitalize(property));
} catch (NoSuchFieldException e) {
- //swallow
+ // try to match field case-insensitive way
+ for(Field aField : object.getClass().getDeclaredFields()) {
+ if (aField.getName().equalsIgnoreCase(property)) {
+ field = aField;
+ break;
+ }
+ }
}
- String setter = property;
- setter = "set"+setter.substring(0,1).toUpperCase(Locale.US)+setter.substring(1);
+
+ String setter = field != null ? field.getName() : property;
+ setter = "set" + setter.substring(0, 1).toUpperCase(Locale.US) + setter.substring(1);
//default assume string class
//if there's a more specific type, e.g. double, int, boolean
//try that.
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
index 81e3ed5..7a3816c 100644
--- a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
@@ -25,6 +25,7 @@ import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
import org.apache.cxf.jaxrs.client.WebClient;
import org.apache.cxf.jaxrs.ext.multipart.Attachment;
import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.server.resource.TikaResource;
@@ -40,6 +41,7 @@ import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
+import java.util.Locale;
import java.util.Set;
import static org.apache.cxf.helpers.HttpHeaderHelper.CONTENT_ENCODING;
@@ -524,4 +526,41 @@ public class TikaResourceTest extends CXFTestBase {
assertNotFound("embed4.txt", responseMsg);
}
+ // TIKA-3320
+ @Test
+ public void testPDFLowerCaseOCRConfig() throws Exception {
+ if (! new TesseractOCRParser().hasTesseract(new TesseractOCRConfig())) {
+ return;
+ }
+
+ Response response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/pdf")
+ .accept("text/plain")
+ .header(TikaResource.X_TIKA_PDF_HEADER_PREFIX.toLowerCase(Locale.ROOT)+"ocrstrategy", "no_ocr")
+ .put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
+ String responseMsg = getStringFromInputStream((InputStream) response
+ .getEntity());
+
+ assertTrue(responseMsg.trim().equals(""));
+
+
+
+ response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/pdf")
+ .accept("text/plain")
+ .header(TikaResource.X_TIKA_PDF_HEADER_PREFIX.toLowerCase(Locale.ROOT)+"ocrstrategy", "ocr_only")
+ .put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
+ responseMsg = getStringFromInputStream((InputStream) response
+ .getEntity());
+ assertContains("Happy New Year 2003!", responseMsg);
+
+ //now try a bad value
+ response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/pdf")
+ .accept("text/plain")
+ .header(TikaResource.X_TIKA_PDF_HEADER_PREFIX.toLowerCase(Locale.ROOT) + "ocrstrategy", "non-sense-value")
+ .put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
+ assertEquals(400, response.getStatus());
+ }
+
}