You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2017/05/08 13:33:47 UTC
[tika] branch master updated: TIKA-2357: Increased support for Tesseract PSM up to 13 from Rafael Ferreira
This is an automated email from the ASF dual-hosted git repository.
dmeikle pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 0aaa121 TIKA-2357: Increased support for Tesseract PSM up to 13 from Rafael Ferreira
0aaa121 is described below
commit 0aaa1215fd11632c349e9bdebac9829578276cb1
Author: David Meikle <da...@logicalspark.com>
AuthorDate: Mon May 8 14:32:19 2017 +0100
TIKA-2357: Increased support for Tesseract PSM up to 13 from Rafael Ferreira
---
CHANGES.txt | 3 +++
.../src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java | 4 ++--
.../test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java | 2 +-
3 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index ce0e247..416311e 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -107,6 +107,9 @@ Release 1.15 - ??
* Further mime magic for WebVTT (TIKA-1772)
+ * Extend support for increased PSM options up to 13 for modern
+ versions of Tesseract (TIKA-2357).
+
Release 1.14 - 10/19/2016
* Extract all headers from MSG/RFC822 (TIKA-2122).
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index e861876..624c97e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -244,8 +244,8 @@ public class TesseractOCRConfig implements Serializable {
* Default is 1 = Automatic page segmentation with OSD (Orientation and Script Detection)
*/
public void setPageSegMode(String pageSegMode) {
- if (!pageSegMode.matches("[1-9]|10")) {
- throw new IllegalArgumentException("Invalid language code");
+ if (!pageSegMode.matches("[0-9]|10|11|12|13")) {
+ throw new IllegalArgumentException("Invalid page segmentation mode");
}
this.pageSegMode = pageSegMode;
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
index fcdd271..adec5db 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
@@ -129,7 +129,7 @@ public class TesseractOCRConfigTest extends TikaTest {
config.setPageSegMode("0");
config.setPageSegMode("10");
assertTrue("Couldn't set valid values", true);
- config.setPageSegMode("11");
+ config.setPageSegMode("14");
}
@Test(expected=IllegalArgumentException.class)
--
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.