You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/03/12 15:52:20 UTC
[tika] branch master updated: TIKA-3071 -- OCR on rendered PDF pages should work in /unpack endpoint in tika-server
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 00eba49 TIKA-3071 -- OCR on rendered PDF pages should work in /unpack endpoint in tika-server
00eba49 is described below
commit 00eba49664049acc2b50889bff30308ff76d7342
Author: tallison <ta...@apache.org>
AuthorDate: Thu Mar 12 11:51:39 2020 -0400
TIKA-3071 -- OCR on rendered PDF pages should work in /unpack endpoint in tika-server
---
.../org/apache/tika/server/resource/UnpackerResource.java | 4 +++-
.../java/org/apache/tika/server/UnpackerResourceTest.java | 15 +++++++++++++++
2 files changed, 18 insertions(+), 1 deletion(-)
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
index 06eef3d..0789060 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/UnpackerResource.java
@@ -132,7 +132,9 @@ public class UnpackerResource {
TikaResource.fillParseContext(pc, httpHeaders.getRequestHeaders(), null);
TikaResource.fillMetadata(parser, metadata, pc, httpHeaders.getRequestHeaders());
TikaResource.logRequest(LOG, info, metadata);
-
+ //even though we aren't currently parsing embedded documents,
+ //we need to add this to allow for "inline" use of other parsers.
+ pc.set(Parser.class, parser);
ContentHandler ch;
ByteArrayOutputStream text = new ByteArrayOutputStream();
diff --git a/tika-server/src/test/java/org/apache/tika/server/UnpackerResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/UnpackerResourceTest.java
index 350257c..039d28f 100644
--- a/tika-server/src/test/java/org/apache/tika/server/UnpackerResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/UnpackerResourceTest.java
@@ -33,10 +33,13 @@ import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
import org.apache.cxf.jaxrs.client.WebClient;
import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.server.resource.TikaResource;
import org.apache.tika.server.writer.TarWriter;
import org.apache.tika.server.resource.UnpackerResource;
import org.apache.tika.server.writer.ZipWriter;
+import org.junit.Assume;
import org.junit.Test;
public class UnpackerResourceTest extends CXFTestBase {
@@ -212,4 +215,16 @@ public class UnpackerResourceTest extends CXFTestBase {
//Java 11 -- underlying image libraries generate a diff image in Java 11
md5.equals("58b8269d1a584b7e8c1adcb936123923"));
}
+
+ @Test // end-to-end check: PUT a PDF with ocrStrategy=ocr_only and look for the expected text in the returned archive
+ public void testPDFRenderOCR() throws Exception {
+ Assume.assumeTrue( new TesseractOCRParser().hasTesseract(new TesseractOCRConfig())); // assumption fails (test does not proceed) unless hasTesseract(...) returns true
+ // Build the request against endPoint + ALL_PATH, send testOCR.pdf from the classpath, and ask for a zip response.
+ Response response = WebClient.create(endPoint + ALL_PATH)
+ .header(TikaResource.X_TIKA_PDF_HEADER_PREFIX+"ocrStrategy", "ocr_only") // header name = PDF header prefix + "ocrStrategy"
+ .accept("application/zip")
+ .put(ClassLoader.getSystemResourceAsStream("testOCR.pdf")); // NOTE(review): stream would be null if the resource is missing -- confirm testOCR.pdf is on the test classpath
+ String txt = readArchiveText((InputStream)response.getEntity()); // readArchiveText is defined outside this hunk; presumably extracts the text entry from the archive -- verify
+ assertContains("Happy New Year", txt); // the phrase expected in the text extracted from testOCR.pdf
+ }
}