You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2017/11/30 23:43:11 UTC
[tika] branch TIKA-2385 updated: Merge branch 'TIKA-2835' of
https://github.com/pmweiss/tika into TIKA-2385
This is an automated email from the ASF dual-hosted git repository.
dmeikle pushed a commit to branch TIKA-2385
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-2385 by this push:
new 537cfca Merge branch 'TIKA-2835' of https://github.com/pmweiss/tika into TIKA-2385
537cfca is described below
commit 537cfca9eae17a0270388810a557a8dc7faa0c30
Author: David Meikle <da...@meikle.io>
AuthorDate: Thu Nov 30 23:43:01 2017 +0000
Merge branch 'TIKA-2835' of https://github.com/pmweiss/tika into TIKA-2385
Added a check for the required Python dependencies, so that the environment is fully verified before attempting to run the rotation script
---
.../apache/tika/parser/ocr/TesseractOCRParser.java | 27 +++++++++++-----------
.../tika/parser/ocr/TesseractOCRParserTest.java | 15 ++++++++++++
2 files changed, 28 insertions(+), 14 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 2f85bc0..0b17393 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -170,21 +170,20 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
}
static boolean hasPython() {
- // check if python is installed and if the rotation program path has been specified correctly
-
- boolean hasPython = false;
-
- try {
- Process proc = Runtime.getRuntime().exec("python -h");
- BufferedReader stdInput = new BufferedReader(new InputStreamReader(proc.getInputStream(), "UTF-8"));
- if(stdInput.read() != -1) {
- hasPython = true;
- }
- } catch (IOException e) {
-
- }
+ // check if python is installed, it has the required dependencies for the rotation program to run
+ boolean hasPython = false;
+ DefaultExecutor executor = new DefaultExecutor();
+ CommandLine cmdLine = CommandLine.parse("python -c \"import numpy, matplotlib, skimage;\"");
+ try {
+ int returnCode = executor.execute(cmdLine);
+ if (returnCode != -1) {
+ hasPython = true;
+ }
- return hasPython;
+ } catch(Exception e) {
+ // Do nothing
+ }
+ return hasPython;
}
public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException,
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 63d9c96..b4648d1 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -287,4 +287,19 @@ public class TesseractOCRParserTest extends TikaTest {
String xml = getXML("testTIFF_multipage.tif").xml;
assertContains("Page 2", xml);
}
+
+ @Test
+ public void testRotatedOCR() throws Exception {
+ if (TesseractOCRParser.hasPython()) {
+ TesseractOCRConfig config = new TesseractOCRConfig();
+ config.setApplyRotation(true);
+ config.setEnableImageProcessing(1);
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(TesseractOCRConfig.class, config);
+ assumeTrue(canRun(config));
+
+ String ocr = getText(getResourceAsStream("/test-documents/testRotated.png"), new AutoDetectParser(), parseContext);
+ assertContains("Its had resolving otherwise she contented therefore", ocr);
+ }
+ }
}
--
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.