You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/03/12 15:52:20 UTC

[tika] branch master updated: TIKA-3071 -- OCR on rendered PDF pages should work in /unpack endpoint in tika-server

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new 00eba49  TIKA-3071 -- OCR on rendered PDF pages should work in /unpack endpoint in tika-server
00eba49 is described below

commit 00eba49664049acc2b50889bff30308ff76d7342
Author: tallison <ta...@apache.org>
AuthorDate: Thu Mar 12 11:51:39 2020 -0400

    TIKA-3071 -- OCR on rendered PDF pages should work in /unpack endpoint in tika-server
---
 .../org/apache/tika/server/resource/UnpackerResource.java |  4 +++-
 .../java/org/apache/tika/server/UnpackerResourceTest.java | 15 +++++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
index 06eef3d..0789060 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
@@ -132,7 +132,9 @@ public class UnpackerResource {
         TikaResource.fillParseContext(pc, httpHeaders.getRequestHeaders(), null);
         TikaResource.fillMetadata(parser, metadata, pc, httpHeaders.getRequestHeaders());
         TikaResource.logRequest(LOG, info, metadata);
-
+        //Even though we aren't currently parsing embedded documents here,
+        //we need to set the parser in the ParseContext to allow "inline" use of other parsers (e.g. OCR on rendered PDF pages).
+        pc.set(Parser.class, parser);
         ContentHandler ch;
         ByteArrayOutputStream text = new ByteArrayOutputStream();
 
diff --git a/tika-server/src/test/java/org/apache/tika/server/UnpackerResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/UnpackerResourceTest.java
index 350257c..039d28f 100644
--- a/tika-server/src/test/java/org/apache/tika/server/UnpackerResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/UnpackerResourceTest.java
@@ -33,10 +33,13 @@ import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
 import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
 import org.apache.cxf.jaxrs.client.WebClient;
 import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.parser.ocr.TesseractOCRParser;
 import org.apache.tika.server.resource.TikaResource;
 import org.apache.tika.server.writer.TarWriter;
 import org.apache.tika.server.resource.UnpackerResource;
 import org.apache.tika.server.writer.ZipWriter;
+import org.junit.Assume;
 import org.junit.Test;
 
 public class UnpackerResourceTest extends CXFTestBase {
@@ -212,4 +215,16 @@ public class UnpackerResourceTest extends CXFTestBase {
                 //Java 11 -- underlying image libraries generate a diff image in Java 11
                 md5.equals("58b8269d1a584b7e8c1adcb936123923"));
     }
+
+    @Test
+    public void testPDFRenderOCR() throws Exception {
+        Assume.assumeTrue( new TesseractOCRParser().hasTesseract(new TesseractOCRConfig()));
+
+        Response response = WebClient.create(endPoint + ALL_PATH)
+                .header(TikaResource.X_TIKA_PDF_HEADER_PREFIX+"ocrStrategy", "ocr_only")
+                .accept("application/zip")
+                .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf"));
+        String txt = readArchiveText((InputStream)response.getEntity());
+        assertContains("Happy New Year", txt);
+    }
 }