You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2017/11/30 23:43:11 UTC

[tika] branch TIKA-2385 updated: Merge branch 'TIKA-2835' of https://github.com/pmweiss/tika into TIKA-2385

This is an automated email from the ASF dual-hosted git repository.

dmeikle pushed a commit to branch TIKA-2385
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/TIKA-2385 by this push:
     new 537cfca  Merge branch 'TIKA-2835' of https://github.com/pmweiss/tika into TIKA-2385
537cfca is described below

commit 537cfca9eae17a0270388810a557a8dc7faa0c30
Author: David Meikle <da...@meikle.io>
AuthorDate: Thu Nov 30 23:43:01 2017 +0000

    Merge branch 'TIKA-2835' of https://github.com/pmweiss/tika into TIKA-2385
    
    Added check for Python dependencies to provide full check before attempting to run
---
 .../apache/tika/parser/ocr/TesseractOCRParser.java | 27 +++++++++++-----------
 .../tika/parser/ocr/TesseractOCRParserTest.java    | 15 ++++++++++++
 2 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 2f85bc0..0b17393 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -170,21 +170,20 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
     }
     
     static boolean hasPython() {
-    	// check if python is installed and if the rotation program path has been specified correctly
-        
-    	boolean hasPython = false;
-    	
-		try {
-			Process proc = Runtime.getRuntime().exec("python -h");
-			BufferedReader stdInput = new BufferedReader(new InputStreamReader(proc.getInputStream(), "UTF-8"));
-			if(stdInput.read() != -1) {
-				hasPython = true;
-			}
-		} catch (IOException e) {
-
-		} 
+    	// check if python is installed and has the required dependencies for the rotation program to run
+        boolean hasPython = false;
+        DefaultExecutor executor = new DefaultExecutor();
+        CommandLine cmdLine = CommandLine.parse("python -c \"import numpy, matplotlib, skimage;\"");
+        try {
+            int returnCode = executor.execute(cmdLine);
+            if (returnCode != -1) {
+                hasPython = true;
+            }
 
-		return hasPython;	
+        } catch(Exception e) {
+            // Do nothing
+        }
+        return hasPython;
     }
     
     public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException,
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 63d9c96..b4648d1 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -287,4 +287,19 @@ public class TesseractOCRParserTest extends TikaTest {
         String xml = getXML("testTIFF_multipage.tif").xml;
         assertContains("Page 2", xml);
     }
+
+    @Test
+    public void testRotatedOCR() throws Exception {
+        if (TesseractOCRParser.hasPython()) {
+            TesseractOCRConfig config = new TesseractOCRConfig();
+            config.setApplyRotation(true);
+            config.setEnableImageProcessing(1);
+            ParseContext parseContext = new ParseContext();
+            parseContext.set(TesseractOCRConfig.class, config);
+            assumeTrue(canRun(config));
+
+            String ocr = getText(getResourceAsStream("/test-documents/testRotated.png"), new AutoDetectParser(), parseContext);
+            assertContains("Its had resolving otherwise she contented therefore", ocr);
+        }
+    }
 }

-- 
To stop receiving notification emails like this one, please contact
['"commits@tika.apache.org" <co...@tika.apache.org>'].