You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/04/09 16:19:16 UTC

[tika] branch main updated: [TIKA-3344] [TIKA-3345] main (#424)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 6262305  [TIKA-3344] [TIKA-3345] main (#424)
6262305 is described below

commit 6262305632b84cb28a299b9543d2e50876fc9f6a
Author: Subhajit Das <Su...@users.noreply.github.com>
AuthorDate: Fri Apr 9 21:46:54 2021 +0530

    [TIKA-3344] [TIKA-3345] main (#424)
    
    * Added case-insensitivity to tika server ocr header names
    
    * Minor restructure and added missing javadoc
    
    * TIKA-3344 and TIKA-3345 for main
    
    * Fixed checkstyle
    
    * Fixed checkstyle
    
    * Fixed checkstyle
    
    * [TIKA-3344] [TIKA-3345] main - checkstyle violation fix
---
 .../tika/server/classic/TikaResourceTest.java      | 80 ++++++++++++++++++----
 .../tika/server/core/resource/TikaResource.java    | 58 ++++++++++++----
 2 files changed, 111 insertions(+), 27 deletions(-)

diff --git a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java
index 4dc3b21..54a5909 100644
--- a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java
+++ b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java
@@ -28,6 +28,7 @@ import java.util.ArrayList;
 import java.util.List;
 import java.util.Locale;
 import javax.ws.rs.ProcessingException;
+import javax.ws.rs.core.MediaType;
 import javax.ws.rs.core.Response;
 
 import org.apache.commons.codec.binary.Base64;
@@ -36,6 +37,8 @@ import org.apache.cxf.attachment.AttachmentUtil;
 import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
 import org.apache.cxf.jaxrs.client.WebClient;
 import org.apache.cxf.jaxrs.ext.multipart.Attachment;
+import org.apache.cxf.jaxrs.ext.multipart.ContentDisposition;
+import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
 import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
 import org.junit.Test;
 
@@ -57,6 +60,7 @@ public class TikaResourceTest extends CXFTestBase {
     private static final String STREAM_CLOSED_FAULT = "java.io.IOException: Stream Closed";
 
     private static final String TIKA_PATH = "/tika";
+    private static final String TIKA_POST_PATH = "/tika/form";
     private static final int UNPROCESSEABLE = 422;
 
     @Override
@@ -68,7 +72,7 @@ public class TikaResourceTest extends CXFTestBase {
 
     @Override
     protected void setUpProviders(JAXRSServerFactoryBean sf) {
-        List<Object> providers = new ArrayList<Object>();
+        List<Object> providers = new ArrayList<>();
         providers.add(new TikaServerParseExceptionMapper(false));
         sf.setProviders(providers);
     }
@@ -158,8 +162,9 @@ public class TikaResourceTest extends CXFTestBase {
 
 
     @Test
-    public void testPasswordXLS() throws Exception {
-        Response response = WebClient.create(endPoint + TIKA_PATH).type("application/vnd.ms-excel")
+    public void testPasswordXLS() {
+        Response response = WebClient.create(endPoint + TIKA_PATH)
+                .type("application/vnd.ms-excel")
                 .accept("text/plain")
                 .put(ClassLoader.getSystemResourceAsStream("test-documents/password.xls"));
 
@@ -181,8 +186,9 @@ public class TikaResourceTest extends CXFTestBase {
     }
 
     @Test
-    public void testPasswordXLSHTML() throws Exception {
-        Response response = WebClient.create(endPoint + TIKA_PATH).type("application/vnd.ms-excel")
+    public void testPasswordXLSHTML() {
+        Response response = WebClient.create(endPoint + TIKA_PATH)
+                .type("application/vnd.ms-excel")
                 .accept("text/html")
                 .put(ClassLoader.getSystemResourceAsStream("test-documents/password.xls"));
 
@@ -199,8 +205,9 @@ public class TikaResourceTest extends CXFTestBase {
     }
 
     @Test
-    public void testPasswordXLSXML() throws Exception {
-        Response response = WebClient.create(endPoint + TIKA_PATH).type("application/vnd.ms-excel")
+    public void testPasswordXLSXML() {
+        Response response = WebClient.create(endPoint + TIKA_PATH)
+                .type("application/vnd.ms-excel")
                 .accept("text/xml")
                 .put(ClassLoader.getSystemResourceAsStream("test-documents/password.xls"));
 
@@ -290,7 +297,7 @@ public class TikaResourceTest extends CXFTestBase {
                         .put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
         String responseMsg = getStringFromInputStream((InputStream) response.getEntity());
 
-        assertTrue(responseMsg.trim().equals(""));
+        assertEquals("", responseMsg.trim());
 
         response =
                 WebClient.create(endPoint + TIKA_PATH).type("application/pdf").accept("text/plain")
@@ -298,7 +305,7 @@ public class TikaResourceTest extends CXFTestBase {
                         .put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
         responseMsg = getStringFromInputStream((InputStream) response.getEntity());
 
-        assertTrue(responseMsg.trim().equals(""));
+        assertEquals("", responseMsg.trim());
 
 
         response =
@@ -332,7 +339,7 @@ public class TikaResourceTest extends CXFTestBase {
                         .put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
         String responseMsg = getStringFromInputStream((InputStream) response.getEntity());
 
-        assertTrue(responseMsg.trim().equals(""));
+        assertEquals("", responseMsg.trim());
 
         response =
                 WebClient.create(endPoint + TIKA_PATH).type("application/pdf").accept("text/plain")
@@ -341,7 +348,7 @@ public class TikaResourceTest extends CXFTestBase {
                         .put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
         responseMsg = getStringFromInputStream((InputStream) response.getEntity());
 
-        assertTrue(responseMsg.trim().equals(""));
+        assertEquals("", responseMsg.trim());
 
 
         response =
@@ -417,8 +424,8 @@ public class TikaResourceTest extends CXFTestBase {
     }
 
     @Test
-    public void testDataIntegrityCheck() throws Exception {
-        Response response = null;
+    public void testDataIntegrityCheck() {
+        Response response;
         try {
             response = WebClient.create(endPoint + TIKA_PATH).type("application/pdf")
                     .accept("text/plain")
@@ -465,7 +472,6 @@ public class TikaResourceTest extends CXFTestBase {
 
     }
 
-
     @Test
     public void testUnicodePasswordProtectedSpaces() throws Exception {
         //TIKA-2858
@@ -510,4 +516,50 @@ public class TikaResourceTest extends CXFTestBase {
         assertNotFound("embed4.txt", responseMsg);
     }
 
+    // TIKA-3344
+    @Test
+    public void testPDFLowerCaseOCRConfigPOST() throws Exception {
+        if (! new TesseractOCRParser().hasTesseract()) {
+            return;
+        }
+
+        Response response = WebClient.create(endPoint + TIKA_POST_PATH)
+                .type("application/pdf")
+                .accept(MediaType.TEXT_PLAIN).type(MediaType.MULTIPART_FORM_DATA)
+                .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX.toLowerCase(Locale.ROOT)
+                        + "ocrstrategy", "no_ocr")
+                .post(testPDFLowerCaseOCRConfigPOSTBody());
+        String responseMsg = getStringFromInputStream((InputStream) response
+                .getEntity());
+
+        assertEquals("", responseMsg.trim());
+
+        response = WebClient.create(endPoint + TIKA_POST_PATH)
+                .type("application/pdf")
+                .accept(MediaType.TEXT_PLAIN).type(MediaType.MULTIPART_FORM_DATA)
+                .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX.toLowerCase(Locale.ROOT)
+                        + "ocrstrategy", "ocr_only")
+                .post(testPDFLowerCaseOCRConfigPOSTBody());
+        responseMsg = getStringFromInputStream((InputStream) response
+                .getEntity());
+        assertContains("Happy New Year 2003!", responseMsg);
+
+        //now try a bad value
+        response = WebClient.create(endPoint + TIKA_POST_PATH)
+                .type("application/pdf")
+                .accept(MediaType.TEXT_PLAIN).type(MediaType.MULTIPART_FORM_DATA)
+                .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX.toLowerCase(Locale.ROOT)
+                        + "ocrstrategy", "non-sense-value")
+                .post(testPDFLowerCaseOCRConfigPOSTBody());
+        assertEquals(400, response.getStatus());
+    }
+
+    private MultipartBody testPDFLowerCaseOCRConfigPOSTBody() {
+        ContentDisposition cd = new ContentDisposition(
+                "form-data; name=\"input\"; filename=\"testOCR.pdf\"");
+        Attachment att = new Attachment("upload", ClassLoader
+                .getSystemResourceAsStream("test-documents/testOCR.pdf"), cd);
+        return new MultipartBody(att);
+    }
+
 }
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
index c83b5a2..e129da4 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
@@ -53,6 +53,7 @@ import javax.xml.transform.stream.StreamResult;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.cxf.attachment.ContentDisposition;
 import org.apache.cxf.jaxrs.ext.multipart.Attachment;
+import org.apache.cxf.jaxrs.impl.MetadataMap;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.xml.sax.ContentHandler;
@@ -231,9 +232,11 @@ public class TikaResource {
             }
 
         } catch (Throwable ex) {
-            throw new WebApplicationException(String.format(Locale.ROOT,
-                    "%s is an invalid %s header or has an invalid value: %s", key, prefix, val),
-                    Response.Status.BAD_REQUEST);
+            // TIKA-3345
+            String error = (!(ex.getCause() instanceof IllegalArgumentException)) ?
+                    String.format(Locale.ROOT, "%s is an invalid %s header", key, prefix) :
+                    String.format(Locale.ROOT, "%s is an invalid %s header value", val, key);
+            throw new WebApplicationException(error, Response.Status.BAD_REQUEST);
         }
     }
 
@@ -375,9 +378,11 @@ public class TikaResource {
     @Consumes("multipart/form-data")
     @Produces("text/plain")
     @Path("form")
-    public StreamingOutput getTextFromMultipart(Attachment att, @Context final UriInfo info) {
-        return produceText(att.getObject(InputStream.class), new Metadata(), att.getHeaders(),
-                info);
+    public StreamingOutput getTextFromMultipart(Attachment att,
+                                                @Context HttpHeaders httpHeaders,
+                                                @Context final UriInfo info) {
+        return produceText(att.getObject(InputStream.class), new Metadata(),
+                preparePostHeaderMap(att, httpHeaders), info);
     }
 
     //this is equivalent to text-main in tika-app
@@ -396,8 +401,10 @@ public class TikaResource {
     @Produces("text/plain")
     @Path("form/main")
     public StreamingOutput getTextMainFromMultipart(final Attachment att,
+                                                    @Context HttpHeaders httpHeaders,
                                                     @Context final UriInfo info) {
-        return produceTextMain(att.getObject(InputStream.class), att.getHeaders(), info);
+        return produceTextMain(att.getObject(InputStream.class),
+                preparePostHeaderMap(att, httpHeaders), info);
     }
 
     public StreamingOutput produceTextMain(final InputStream is,
@@ -463,9 +470,11 @@ public class TikaResource {
     @Consumes("multipart/form-data")
     @Produces("text/html")
     @Path("form")
-    public StreamingOutput getHTMLFromMultipart(Attachment att, @Context final UriInfo info) {
-        return produceOutput(att.getObject(InputStream.class), new Metadata(), att.getHeaders(),
-                info, "html");
+    public StreamingOutput getHTMLFromMultipart(Attachment att,
+                                                @Context HttpHeaders httpHeaders,
+                                                @Context final UriInfo info) {
+        return produceOutput(att.getObject(InputStream.class), new Metadata(),
+                preparePostHeaderMap(att, httpHeaders), info, "html");
     }
 
     @PUT
@@ -482,9 +491,11 @@ public class TikaResource {
     @Consumes("multipart/form-data")
     @Produces("text/xml")
     @Path("form")
-    public StreamingOutput getXMLFromMultipart(Attachment att, @Context final UriInfo info) {
-        return produceOutput(att.getObject(InputStream.class), new Metadata(), att.getHeaders(),
-                info, "xml");
+    public StreamingOutput getXMLFromMultipart(Attachment att,
+                                               @Context HttpHeaders httpHeaders,
+                                               @Context final UriInfo info) {
+        return produceOutput(att.getObject(InputStream.class),
+                new Metadata(), preparePostHeaderMap(att, httpHeaders), info, "xml");
     }
 
     @PUT
@@ -532,4 +543,25 @@ public class TikaResource {
             }
         };
     }
+
+    /**
+     * Prepares a multivalued map, combining attachment headers and request headers.
+     * Gives priority to attachment headers.
+     * @param att the attachment.
+     * @param httpHeaders the http headers, fetched from context.
+     * @return the case insensitive MetadataMap containing combined headers.
+     */
+    private MetadataMap<String, String> preparePostHeaderMap(Attachment att,
+                                                             HttpHeaders httpHeaders) {
+        if (att == null && httpHeaders == null) return null;
+        MetadataMap<String, String> finalHeaders = new MetadataMap<>(false,
+                true);
+        if (httpHeaders != null && httpHeaders.getRequestHeaders() != null) {
+            finalHeaders.putAll(httpHeaders.getRequestHeaders());
+        }
+        if (att != null && att.getHeaders() != null) {
+            finalHeaders.putAll(att.getHeaders());
+        }
+        return finalHeaders;
+    }
 }