You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2017/05/08 13:33:47 UTC

[tika] branch master updated: TIKA-2357: Increased support for Tesseract PSM up to 13 from Rafael Ferreira

This is an automated email from the ASF dual-hosted git repository.

dmeikle pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

The following commit(s) were added to refs/heads/master by this push:
       new  0aaa121   TIKA-2357: Increased support for Tesseract PSM up to 13 from Rafael Ferreira
0aaa121 is described below

commit 0aaa1215fd11632c349e9bdebac9829578276cb1
Author: David Meikle <da...@logicalspark.com>
AuthorDate: Mon May 8 14:32:19 2017 +0100

    TIKA-2357: Increased support for Tesseract PSM up to 13 from Rafael Ferreira
---
 CHANGES.txt                                                           | 3 +++
 .../src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java  | 4 ++--
 .../test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java  | 2 +-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index ce0e247..416311e 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -107,6 +107,9 @@ Release 1.15 - ??
 
   * Further mime magic for WebVTT (TIKA-1772)
 
+  * Extend support for increased PSM options up to 13 for modern 
+    versions of Tesseract (TIKA-2357).
+
 Release 1.14 - 10/19/2016
 
   * Extract all headers from MSG/RFC822 (TIKA-2122).
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index e861876..624c97e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -244,8 +244,8 @@ public class TesseractOCRConfig implements Serializable {
      * Default is 1 = Automatic page segmentation with OSD (Orientation and Script Detection)
      */
     public void setPageSegMode(String pageSegMode) {
-        if (!pageSegMode.matches("[1-9]|10")) {
-            throw new IllegalArgumentException("Invalid language code");
+        if (!pageSegMode.matches("[0-9]|10|11|12|13")) {
+            throw new IllegalArgumentException("Invalid page segmentation mode");
         }
         this.pageSegMode = pageSegMode;
     }
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
index fcdd271..adec5db 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
@@ -129,7 +129,7 @@ public class TesseractOCRConfigTest extends TikaTest {
         config.setPageSegMode("0");
         config.setPageSegMode("10");
         assertTrue("Couldn't set valid values", true);
-        config.setPageSegMode("11");
+        config.setPageSegMode("14");
     }
 
     @Test(expected=IllegalArgumentException.class)

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.