You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2015/08/05 03:33:14 UTC

svn commit: r1694133 - in /tika/trunk: ./ tika-parsers/src/main/java/org/apache/tika/parser/ocr/ tika-parsers/src/test/java/org/apache/tika/parser/ocr/ tika-parsers/src/test/resources/test-properties/

Author: mattmann
Date: Wed Aug  5 01:33:13 2015
New Revision: 1694133

URL: http://svn.apache.org/r1694133
Log:
Fix for TIKA-1703: Can't Specify Tesseract Data Folder Distinct from Tesseract Executable Path. Contributed by Christian Wolfe <ta...@gmail.com>; this closes #56.

Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
    tika/trunk/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1694133&r1=1694132&r2=1694133&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Wed Aug  5 01:33:13 2015
@@ -1,3 +1,9 @@
+Release 1.11 - Current Development
+
+  * The ability to specify the Tesseract Config Path was added
+    to the OCR Parser (TIKA-1703).
+
+
 Release 1.10 - 8/1/2015
 
   * Tika Config XML can now be used to create composite detectors,

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java?rev=1694133&r1=1694132&r2=1694133&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java Wed Aug  5 01:33:13 2015
@@ -25,7 +25,7 @@ import java.util.Properties;
 
 /**
  * Configuration for TesseractOCRParser.
- * 
+ *
  * This allows to enable TesseractOCRParser and set its parameters:
  * <p>
  * TesseractOCRConfig config = new TesseractOCRConfig();<br>
@@ -36,27 +36,30 @@ import java.util.Properties;
  * Parameters can also be set by either editing the existing TesseractOCRConfig.properties file in,
  * tika-parser/src/main/resources/org/apache/tika/parser/ocr, or overriding it by creating your own
  * and placing it in the package org/apache/tika/parser/ocr on the classpath.
- * 
+ *
  */
 public class TesseractOCRConfig implements Serializable{
 
 	private static final long serialVersionUID = -4861942486845757891L;
-	
+
 	// Path to tesseract installation folder, if not on system path.
 	private  String tesseractPath = "";
-	
+
+    // Path to the 'tessdata' folder, which contains language files and config files.
+    private String tessdataPath = "";
+
 	// Language dictionary to be used.
 	private  String language = "eng";
-	
+
 	// Tesseract page segmentation mode.
 	private  String pageSegMode = "1";
-	
+
 	// Minimum file size to submit file to ocr.
 	private  int minFileSizeToOcr = 0;
-	
+
 	// Maximum file size to submit file to ocr.
 	private  int maxFileSizeToOcr = Integer.MAX_VALUE;
-	
+
 	// Maximum time (seconds) to wait for the ocring process termination
 	private int timeout = 120;
 
@@ -98,6 +101,8 @@ public class TesseractOCRConfig implemen
 
 		setTesseractPath(
 				getProp(props, "tesseractPath", getTesseractPath()));
+        setTessdataPath(
+                getProp(props, "tessdataPath", getTessdataPath()));
 		setLanguage(
 				getProp(props, "language", getLanguage()));
 		setPageSegMode(
@@ -107,7 +112,7 @@ public class TesseractOCRConfig implemen
 		setMaxFileSizeToOcr(
 				getProp(props, "maxFileSizeToOcr", getMaxFileSizeToOcr()));
 		setTimeout(
-				getProp(props, "timeout", getTimeout()));
+                getProp(props, "timeout", getTimeout()));
 
 	}
 
@@ -115,22 +120,43 @@ public class TesseractOCRConfig implemen
 	public String getTesseractPath() {
 		return tesseractPath;
 	}
-	
+
 	/**
-	 * Set tesseract installation folder, needed if it is not on system path.
+	 * Set the path to the Tesseract executable, needed if it is not on system path.
+     * <p>
+     * Note that if you set this value, it is highly recommended that you also
+     * set the path to the 'tessdata' folder using {@link #setTessdataPath}.
+     * </p>
 	 */
 	public void setTesseractPath(String tesseractPath) {
 		if(!tesseractPath.isEmpty() && !tesseractPath.endsWith(File.separator))
 			tesseractPath += File.separator;
-		
+
 		this.tesseractPath = tesseractPath;
 	}
-	
+
+    /** @see #setTessdataPath(String tessdataPath) */
+    public String getTessdataPath() {
+        return tessdataPath;
+    }
+
+    /**
+     * Set the path to the 'tessdata' folder, which contains language files and config files. In some cases (such
+     * as on Windows), this folder is found in the Tesseract installation, but in other cases
+     * (such as when Tesseract is built from source), it may be located elsewhere.
+     */
+    public void setTessdataPath(String tessdataPath) {
+        if(!tessdataPath.isEmpty() && !tessdataPath.endsWith(File.separator))
+            tessdataPath += File.separator;
+
+        this.tessdataPath = tessdataPath;
+    }
+
 	/** @see #setLanguage(String language)*/
 	public String getLanguage() {
 		return language;
 	}
-	
+
 	/**
 	 * Set tesseract language dictionary to be used. Default is "eng".
 	 * Multiple languages may be specified, separated by plus characters.
@@ -141,12 +167,12 @@ public class TesseractOCRConfig implemen
 		}
 		this.language = language;
 	}
-	
+
 	/** @see #setPageSegMode(String pageSegMode)*/
 	public String getPageSegMode() {
 		return pageSegMode;
 	}
-	
+
 	/**
 	 * Set tesseract page segmentation mode.
 	 * Default is 1 = Automatic page segmentation with OSD (Orientation and Script Detection)
@@ -157,12 +183,12 @@ public class TesseractOCRConfig implemen
 		}
 		this.pageSegMode = pageSegMode;
 	}
-	
+
 	/** @see #setMinFileSizeToOcr(int minFileSizeToOcr)*/
 	public int getMinFileSizeToOcr() {
 		return minFileSizeToOcr;
 	}
-	
+
 	/**
 	 * Set minimum file size to submit file to ocr.
 	 * Default is 0.
@@ -170,12 +196,12 @@ public class TesseractOCRConfig implemen
 	public void setMinFileSizeToOcr(int minFileSizeToOcr) {
 		this.minFileSizeToOcr = minFileSizeToOcr;
 	}
-	
+
 	/** @see #setMaxFileSizeToOcr(int maxFileSizeToOcr)*/
 	public int getMaxFileSizeToOcr() {
 		return maxFileSizeToOcr;
 	}
-	
+
 	/**
 	 * Set maximum file size to submit file to ocr.
 	 * Default is Integer.MAX_VALUE.

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java?rev=1694133&r1=1694132&r2=1694133&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java Wed Aug  5 01:33:13 2015
@@ -97,9 +97,14 @@ public class TesseractOCRParser extends
     }
 
     private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
-        if (!config.getTesseractPath().isEmpty()) {
-            Map<String, String> env = pb.environment();
-            env.put("TESSDATA_PREFIX", config.getTesseractPath());
+        String tessdataPrefix = "TESSDATA_PREFIX";
+        Map<String, String> env = pb.environment();
+
+        if (!config.getTessdataPath().isEmpty()) {
+            env.put(tessdataPrefix, config.getTessdataPath());
+        }
+        else if(!config.getTesseractPath().isEmpty()) {
+            env.put(tessdataPrefix, config.getTesseractPath());
         }
     }
 

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java?rev=1694133&r1=1694132&r2=1694133&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java Wed Aug  5 01:33:13 2015
@@ -32,6 +32,7 @@ public class TesseractOCRConfigTest exte
     public void testNoConfig() throws Exception {
         TesseractOCRConfig config = new TesseractOCRConfig();
         assertEquals("Invalid default tesseractPath value", "", config.getTesseractPath());
+        assertEquals("Invalid default tessdataPath value", "", config.getTessdataPath());
         assertEquals("Invalid default language value", "eng", config.getLanguage());
         assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode());
         assertEquals("Invalid default minFileSizeToOcr value", 0, config.getMinFileSizeToOcr());
@@ -47,6 +48,7 @@ public class TesseractOCRConfigTest exte
 
         TesseractOCRConfig config = new TesseractOCRConfig(stream);
         assertEquals("Invalid default tesseractPath value", "", config.getTesseractPath());
+        assertEquals("Invalid default tessdataPath value", "", config.getTessdataPath());
         assertEquals("Invalid overridden language value", "fra+deu", config.getLanguage());
         assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode());
         assertEquals("Invalid overridden minFileSizeToOcr value", 1, config.getMinFileSizeToOcr());
@@ -62,6 +64,7 @@ public class TesseractOCRConfigTest exte
 
         TesseractOCRConfig config = new TesseractOCRConfig(stream);
         assertEquals("Invalid overridden tesseractPath value", "/opt/tesseract" + File.separator, config.getTesseractPath());
+        assertEquals("Invalid overridden tesseractPath value", "/usr/local/share" + File.separator, config.getTessdataPath());
         assertEquals("Invalid overridden language value", "fra+deu", config.getLanguage());
         assertEquals("Invalid overridden pageSegMode value", "2", config.getPageSegMode());
         assertEquals("Invalid overridden minFileSizeToOcr value", 1, config.getMinFileSizeToOcr());

Modified: tika/trunk/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties?rev=1694133&r1=1694132&r2=1694133&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties (original)
+++ tika/trunk/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties Wed Aug  5 01:33:13 2015
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 tesseractPath=/opt/tesseract
+tessdataPath=/usr/local/share
 language=fra+deu
 pageSegMode=2
 maxFileSizeToOcr=2000000



Re: svn commit: r1694133 - in /tika/trunk: ./ tika-parsers/src/main/java/org/apache/tika/parser/ocr/ tika-parsers/src/test/java/org/apache/tika/parser/ocr/ tika-parsers/src/test/resources/test-properties/

Posted by Christian Wolfe <ta...@gmail.com>.
Sure, I can create a patch file that has all the changes without the
formatting issues, and I'll attach it to the original ticket.

On Wed, Aug 5, 2015 at 4:40 AM, Nick Burch <ap...@gagravarr.org> wrote:

> On Wed, 5 Aug 2015, mattmann@apache.org wrote:
>
>>           // Path to tesseract installation folder, if not on system path.
>>           private  String tesseractPath = "";
>> -
>> +
>> +    // Path to the 'tessdata' folder, which contains language files and config files.
>> +    private String tessdataPath = "";
>> +
>>           // Language dictionary to be used.
>>           private  String language = "eng";
>>
>
> Seems to be some inconsistent indents going on here. Any chance you could
> reformat the patch and/or classes to match
> http://tika.apache.org/contribute.html#Code_Formatting ?
>
> Thanks
> Nick
>

Re: svn commit: r1694133 - in /tika/trunk: ./ tika-parsers/src/main/java/org/apache/tika/parser/ocr/ tika-parsers/src/test/java/org/apache/tika/parser/ocr/ tika-parsers/src/test/resources/test-properties/

Posted by Nick Burch <ap...@gagravarr.org>.
On Wed, 5 Aug 2015, mattmann@apache.org wrote:
> 	  // Path to tesseract installation folder, if not on system path.
> 	  private  String tesseractPath = "";
> -
> +
> +    // Path to the 'tessdata' folder, which contains language files and config files.
> +    private String tessdataPath = "";
> +
> 	  // Language dictionary to be used.
> 	  private  String language = "eng";

Seems to be some inconsistent indents going on here. Any chance you could 
reformat the patch and/or classes to match 
http://tika.apache.org/contribute.html#Code_Formatting ?

Thanks
Nick