You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2015/08/05 03:33:14 UTC
svn commit: r1694133 - in /tika/trunk: ./
tika-parsers/src/main/java/org/apache/tika/parser/ocr/
tika-parsers/src/test/java/org/apache/tika/parser/ocr/
tika-parsers/src/test/resources/test-properties/
Author: mattmann
Date: Wed Aug 5 01:33:13 2015
New Revision: 1694133
URL: http://svn.apache.org/r1694133
Log:
Fix for TIKA-1703: Can't Specify Tesseract Data Folder Distinct from Tesseract Executable Path. Contributed by Christian Wolfe <ta...@gmail.com>; this closes #56.
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
tika/trunk/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1694133&r1=1694132&r2=1694133&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Wed Aug 5 01:33:13 2015
@@ -1,3 +1,9 @@
+Release 1.11 - Current Development
+
+ * The ability to specify the Tesseract 'tessdata' folder path, separately
+ from the Tesseract executable path, was added to the OCR Parser (TIKA-1703).
+
+
Release 1.10 - 8/1/2015
* Tika Config XML can now be used to create composite detectors,
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java?rev=1694133&r1=1694132&r2=1694133&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java Wed Aug 5 01:33:13 2015
@@ -25,7 +25,7 @@ import java.util.Properties;
/**
* Configuration for TesseractOCRParser.
- *
+ *
* This allows to enable TesseractOCRParser and set its parameters:
* <p>
* TesseractOCRConfig config = new TesseractOCRConfig();<br>
@@ -36,27 +36,30 @@ import java.util.Properties;
* Parameters can also be set by either editing the existing TesseractOCRConfig.properties file in,
* tika-parser/src/main/resources/org/apache/tika/parser/ocr, or overriding it by creating your own
* and placing it in the package org/apache/tika/parser/ocr on the classpath.
- *
+ *
*/
public class TesseractOCRConfig implements Serializable{
private static final long serialVersionUID = -4861942486845757891L;
-
+
// Path to tesseract installation folder, if not on system path.
private String tesseractPath = "";
-
+
+ // Path to the 'tessdata' folder, which contains language files and config files.
+ private String tessdataPath = "";
+
// Language dictionary to be used.
private String language = "eng";
-
+
// Tesseract page segmentation mode.
private String pageSegMode = "1";
-
+
// Minimum file size to submit file to ocr.
private int minFileSizeToOcr = 0;
-
+
// Maximum file size to submit file to ocr.
private int maxFileSizeToOcr = Integer.MAX_VALUE;
-
+
// Maximum time (seconds) to wait for the ocring process termination
private int timeout = 120;
@@ -98,6 +101,8 @@ public class TesseractOCRConfig implemen
setTesseractPath(
getProp(props, "tesseractPath", getTesseractPath()));
+ setTessdataPath(
+ getProp(props, "tessdataPath", getTessdataPath()));
setLanguage(
getProp(props, "language", getLanguage()));
setPageSegMode(
@@ -107,7 +112,7 @@ public class TesseractOCRConfig implemen
setMaxFileSizeToOcr(
getProp(props, "maxFileSizeToOcr", getMaxFileSizeToOcr()));
setTimeout(
- getProp(props, "timeout", getTimeout()));
+ getProp(props, "timeout", getTimeout()));
}
@@ -115,22 +120,43 @@ public class TesseractOCRConfig implemen
public String getTesseractPath() {
return tesseractPath;
}
-
+
/**
- * Set tesseract installation folder, needed if it is not on system path.
+ * Set the path to the Tesseract executable, needed if it is not on system path.
+ * <p>
+ * Note that if you set this value, it is highly recommended that you also
+ * set the path to the 'tessdata' folder using {@link #setTessdataPath}.
+ * </p>
*/
public void setTesseractPath(String tesseractPath) {
if(!tesseractPath.isEmpty() && !tesseractPath.endsWith(File.separator))
tesseractPath += File.separator;
-
+
this.tesseractPath = tesseractPath;
}
-
+
+ /** @see #setTessdataPath(String tessdataPath) */
+ public String getTessdataPath() {
+ return tessdataPath;
+ }
+
+ /**
+ * Set the path to the 'tessdata' folder, which contains language files and config files. In some cases (such
+ * as on Windows), this folder is found in the Tesseract installation, but in other cases
+ * (such as when Tesseract is built from source), it may be located elsewhere.
+ */
+ public void setTessdataPath(String tessdataPath) {
+ if(!tessdataPath.isEmpty() && !tessdataPath.endsWith(File.separator))
+ tessdataPath += File.separator;
+
+ this.tessdataPath = tessdataPath;
+ }
+
/** @see #setLanguage(String language)*/
public String getLanguage() {
return language;
}
-
+
/**
* Set tesseract language dictionary to be used. Default is "eng".
* Multiple languages may be specified, separated by plus characters.
@@ -141,12 +167,12 @@ public class TesseractOCRConfig implemen
}
this.language = language;
}
-
+
/** @see #setPageSegMode(String pageSegMode)*/
public String getPageSegMode() {
return pageSegMode;
}
-
+
/**
* Set tesseract page segmentation mode.
* Default is 1 = Automatic page segmentation with OSD (Orientation and Script Detection)
@@ -157,12 +183,12 @@ public class TesseractOCRConfig implemen
}
this.pageSegMode = pageSegMode;
}
-
+
/** @see #setMinFileSizeToOcr(int minFileSizeToOcr)*/
public int getMinFileSizeToOcr() {
return minFileSizeToOcr;
}
-
+
/**
* Set minimum file size to submit file to ocr.
* Default is 0.
@@ -170,12 +196,12 @@ public class TesseractOCRConfig implemen
public void setMinFileSizeToOcr(int minFileSizeToOcr) {
this.minFileSizeToOcr = minFileSizeToOcr;
}
-
+
/** @see #setMaxFileSizeToOcr(int maxFileSizeToOcr)*/
public int getMaxFileSizeToOcr() {
return maxFileSizeToOcr;
}
-
+
/**
* Set maximum file size to submit file to ocr.
* Default is Integer.MAX_VALUE.
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java?rev=1694133&r1=1694132&r2=1694133&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java Wed Aug 5 01:33:13 2015
@@ -97,9 +97,14 @@ public class TesseractOCRParser extends
}
private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
- if (!config.getTesseractPath().isEmpty()) {
- Map<String, String> env = pb.environment();
- env.put("TESSDATA_PREFIX", config.getTesseractPath());
+ String tessdataPrefix = "TESSDATA_PREFIX";
+ Map<String, String> env = pb.environment();
+
+ if (!config.getTessdataPath().isEmpty()) {
+ env.put(tessdataPrefix, config.getTessdataPath());
+ }
+ else if(!config.getTesseractPath().isEmpty()) {
+ env.put(tessdataPrefix, config.getTesseractPath());
}
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java?rev=1694133&r1=1694132&r2=1694133&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java Wed Aug 5 01:33:13 2015
@@ -32,6 +32,7 @@ public class TesseractOCRConfigTest exte
public void testNoConfig() throws Exception {
TesseractOCRConfig config = new TesseractOCRConfig();
assertEquals("Invalid default tesseractPath value", "", config.getTesseractPath());
+ assertEquals("Invalid default tessdataPath value", "", config.getTessdataPath());
assertEquals("Invalid default language value", "eng", config.getLanguage());
assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode());
assertEquals("Invalid default minFileSizeToOcr value", 0, config.getMinFileSizeToOcr());
@@ -47,6 +48,7 @@ public class TesseractOCRConfigTest exte
TesseractOCRConfig config = new TesseractOCRConfig(stream);
assertEquals("Invalid default tesseractPath value", "", config.getTesseractPath());
+ assertEquals("Invalid default tessdataPath value", "", config.getTessdataPath());
assertEquals("Invalid overridden language value", "fra+deu", config.getLanguage());
assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode());
assertEquals("Invalid overridden minFileSizeToOcr value", 1, config.getMinFileSizeToOcr());
@@ -62,6 +64,7 @@ public class TesseractOCRConfigTest exte
TesseractOCRConfig config = new TesseractOCRConfig(stream);
assertEquals("Invalid overridden tesseractPath value", "/opt/tesseract" + File.separator, config.getTesseractPath());
+ assertEquals("Invalid overridden tesseractPath value", "/usr/local/share" + File.separator, config.getTessdataPath());
assertEquals("Invalid overridden language value", "fra+deu", config.getLanguage());
assertEquals("Invalid overridden pageSegMode value", "2", config.getPageSegMode());
assertEquals("Invalid overridden minFileSizeToOcr value", 1, config.getMinFileSizeToOcr());
Modified: tika/trunk/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties?rev=1694133&r1=1694132&r2=1694133&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties (original)
+++ tika/trunk/tika-parsers/src/test/resources/test-properties/TesseractOCRConfig-full.properties Wed Aug 5 01:33:13 2015
@@ -14,6 +14,7 @@
# limitations under the License.
tesseractPath=/opt/tesseract
+tessdataPath=/usr/local/share
language=fra+deu
pageSegMode=2
maxFileSizeToOcr=2000000
Re: svn commit: r1694133 - in /tika/trunk: ./ tika-parsers/src/main/java/org/apache/tika/parser/ocr/
tika-parsers/src/test/java/org/apache/tika/parser/ocr/ tika-parsers/src/test/resources/test-properties/
Posted by Christian Wolfe <ta...@gmail.com>.
Sure, I can create a patch file that has all the changes without the
formatting issues, and I'll attach it to the original ticket.
On Wed, Aug 5, 2015 at 4:40 AM, Nick Burch <ap...@gagravarr.org> wrote:
> On Wed, 5 Aug 2015, mattmann@apache.org wrote:
>
>> // Path to tesseract installation folder, if not on system path.
>> private String tesseractPath = "";
>> -
>> +
>> + // Path to the 'tessdata' folder, which contains language files and
>> config files.
>> + private String tessdataPath = "";
>> +
>> // Language dictionary to be used.
>> private String language = "eng";
>>
>
> Seems to be some inconsistent indents going on here. Any chance you could
> reformat the patch and/or classes to match
> http://tika.apache.org/contribute.html#Code_Formatting ?
>
> Thanks
> Nick
>
Re: svn commit: r1694133 - in /tika/trunk: ./
tika-parsers/src/main/java/org/apache/tika/parser/ocr/
tika-parsers/src/test/java/org/apache/tika/parser/ocr/
tika-parsers/src/test/resources/test-properties/
Posted by Nick Burch <ap...@gagravarr.org>.
On Wed, 5 Aug 2015, mattmann@apache.org wrote:
> // Path to tesseract installation folder, if not on system path.
> private String tesseractPath = "";
> -
> +
> + // Path to the 'tessdata' folder, which contains language files and config files.
> + private String tessdataPath = "";
> +
> // Language dictionary to be used.
> private String language = "eng";
Seems to be some inconsistent indents going on here. Any chance you could
reformat the patch and/or classes to match
http://tika.apache.org/contribute.html#Code_Formatting ?
Thanks
Nick