You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@tika.apache.org by "ASF GitHub Bot (JIRA)" <ji...@apache.org> on 2017/12/01 00:02:00 UTC

[jira] [Commented] (TIKA-2385) Tesseract OCR rotation.py not run

    [ https://issues.apache.org/jira/browse/TIKA-2385?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16273690#comment-16273690 ] 

ASF GitHub Bot commented on TIKA-2385:
--------------------------------------

dameikle closed pull request #183: fix for TIKA-2385 contributed by pmweiss5
URL: https://github.com/apache/tika/pull/183
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index 624c97e39..c8c8bc93e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -94,6 +94,9 @@
     // whether or not to preserve interword spacing
     private boolean preserveInterwordSpacing = false;
 
+    // whether or not to apply rotation calculated by the rotation.py script
+    private boolean applyRotation = false;
+
 
     /**
      * Default contructor.
@@ -169,6 +172,8 @@ private void init(InputStream is) {
                 getProp(props, "filter", getFilter()));
         setResize(
                 getProp(props, "resize", getResize()));
+        setApplyRotation(
+        		getProp(props, "applyRotation", getApplyRotation()));
 
     }
 
@@ -472,6 +477,23 @@ public void setImageMagickPath(String ImageMagickPath) {
         this.ImageMagickPath = ImageMagickPath;
     }
 
+    /**
+     * @return Whether or not a rotation value should be calculated and passed to ImageMagick before performing OCR.
+     * (Requires that Python is installed).
+     */
+    public boolean getApplyRotation() {
+    	return this.applyRotation;
+    }
+
+    /**
+     * Sets whether or not a rotation value should be calculated and passed to ImageMagick.
+     * 
+     * @param applyRotation true to calculate and apply rotation, false to skip.  Default is false; true requires Python to be installed.
+     */
+    public void setApplyRotation(boolean applyRotation) {
+    	this.applyRotation = applyRotation;
+    }
+
     /**
      * Get property from the properties file passed in.
      *
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 121e096e1..c28f6e1aa 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -170,7 +170,7 @@ private boolean hasImageMagick(TesseractOCRConfig config) {
      
     }
     
-    private static boolean hasPython() {
+    static boolean hasPython() {
     	// check if python is installed and if the rotation program path has been specified correctly
         
     	boolean hasPython = false;
@@ -321,7 +321,7 @@ private void processImage(File streamingObject, TesseractOCRConfig config) throw
         
         // determine the angle of rotation required to make the text horizontal
         CommandLine cmdLine = CommandLine.parse(cmd);
-        if(hasPython()) {
+        if(config.getApplyRotation() && hasPython()) {
             try {
                 executor.execute(cmdLine);
                 angle = outputStream.toString("UTF-8").trim();
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
index 26b6031bc..73c9083ce 100644
--- a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
+++ b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
@@ -32,4 +32,5 @@ density=300
 depth=4
 colorspace=gray
 filter=triangle
-resize=900
\ No newline at end of file
+resize=900
+applyRotation=false
\ No newline at end of file
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/rotation.py b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/rotation.py
index fb391f1fe..c619325e3 100644
--- a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/rotation.py
+++ b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/rotation.py
@@ -46,12 +46,12 @@ def main(argv):
 	     filename = arg
 
 	try:
-      from parabolic import parabolic
+	  from parabolic import parabolic
 
 	  def argmax(x):
-	   	 return parabolic(x, numpy.argmax(x))[0]
-	  except ImportError:
-         from numpy import argmax
+	   	return parabolic(x, numpy.argmax(x))[0]
+	except ImportError:
+	  from numpy import argmax
 
 	# Load file, converting to grayscale
 	I = asarray(Image.open(filename).convert('L'))
@@ -69,4 +69,4 @@ def argmax(x):
 	print('{:.2f}'.format(-(90-rotation)))
 
 if __name__ == "__main__":
-	main(sys.argv[1:])
\ No newline at end of file
+	main(sys.argv[1:])
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
index adec5dbc8..8e22f21aa 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
@@ -47,6 +47,7 @@ public void testNoConfig() throws Exception {
         assertEquals("Invalid default colorpsace value", "gray" , config.getColorspace());
         assertEquals("Invalid default filter value", "triangle" , config.getFilter());
         assertEquals("Invalid default resize value", 900 , config.getResize());
+        assertEquals("Invalid default applyRotation value", false, config.getApplyRotation());
     }
 
     @Test
@@ -68,6 +69,7 @@ public void testPartialConfig() throws Exception {
         assertEquals("Invalid overridden depth value", 8 , config.getDepth());
         assertEquals("Invalid overridden filter value", "box" , config.getFilter());	
         assertEquals("Invalid overridden resize value", 300 , config.getResize());
+        assertEquals("Invalid default applyRotation value", false, config.getApplyRotation());
     }
 
     @Test
@@ -91,6 +93,7 @@ public void testFullConfig() throws Exception {
         assertEquals("Invalid overridden depth value", 8 , config.getDepth());
         assertEquals("Invalid overridden filter value", "box" , config.getFilter());
         assertEquals("Invalid overridden resize value", 300 , config.getResize());
+        assertEquals("Invalid overridden applyRotation value", true, config.getApplyRotation());
     }
 
     @Test
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 4c0ab76d7..4b210e10b 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -278,4 +278,20 @@ public void testInterwordSpacing() throws Exception {
         Matcher m = Pattern.compile("The\\s{5,20}quick").matcher(xml);
         assertTrue(m.find());
     }
+
+    @Test
+    public void testRotatedOCR() throws Exception {
+    	if (TesseractOCRParser.hasPython()) {
+    	
+    		TesseractOCRConfig config = new TesseractOCRConfig();
+    		config.setApplyRotation(true);
+    		config.setEnableImageProcessing(1);
+    		ParseContext parseContext = new ParseContext();
+    		parseContext.set(TesseractOCRConfig.class, config);
+    		assumeTrue(canRun(config));
+    		
+    		String ocr = getText(getResourceAsStream("/test-documents/testRotated.png"), new AutoDetectParser(), parseContext);
+    		assertContains("Its had resolving otherwise she contented therefore", ocr);
+    	}
+    }
 }
diff --git a/tika-parsers/src/test/resources/test-documents/testRotated.png b/tika-parsers/src/test/resources/test-documents/testRotated.png
new file mode 100644
index 000000000..f535b5017
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testRotated.png differ
diff --git a/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties b/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties
index 3a96ef192..ddc54b99a 100644
--- a/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties
+++ b/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties
@@ -25,4 +25,5 @@ ImageMagickPath=/usr/local/bin
 density=200
 depth=8
 filter=box
-resize=300
\ No newline at end of file
+resize=300
+applyRotation=true
\ No newline at end of file


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> Tesseract OCR rotation.py not run
> ---------------------------------
>
>                 Key: TIKA-2385
>                 URL: https://issues.apache.org/jira/browse/TIKA-2385
>             Project: Tika
>          Issue Type: Bug
>          Components: ocr
>    Affects Versions: 1.15
>            Reporter: Peter Weiss
>            Assignee: Dave Meikle
>             Fix For: 1.17
>
>
> It appears that even if Python is installed, the rotation.py script that calculates the rotation angle of the image does not run because of indentation/spacing errors in the Python script.
> I also recommend making this a configurable parameter, since it adds processing time and can produce unexpected results if the supplied image contains more than just plain text.



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)