You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/20 21:32:55 UTC

[4/4] tika git commit: TIKA-2190 -- add configurability for preserve interword spacing

TIKA-2190 -- add configurability for preserve interword spacing


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/ae44b9e5
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/ae44b9e5
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/ae44b9e5

Branch: refs/heads/master
Commit: ae44b9e507dbb11b9b9f5c57cf342b47966ffb66
Parents: c83f87b
Author: tballison <ta...@mitre.org>
Authored: Tue Dec 20 16:32:44 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Dec 20 16:32:44 2016 -0500

----------------------------------------------------------------------
 CHANGES.txt                                     |   3 ++
 .../tika/parser/ocr/TesseractOCRConfig.java     |  38 +++++++++++++++++++
 .../tika/parser/ocr/TesseractOCRParser.java     |  10 +++--
 .../parser/ocr/TesseractOCRConfig.properties    |   1 +
 .../tika/parser/ocr/TesseractOCRParserTest.java |  22 +++++++++++
 .../test-documents/testOCR_spacing.png          | Bin 0 -> 7232 bytes
 6 files changed, 71 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/ae44b9e5/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index e215499..bd6a45e 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 1.15 - ??
 
+  * Add configurability of "preserveInterwordSpacing" to
+    TesseractOCRParser (TIKA-2190).
+
   * Added experimental SAX parser for .pptx files. To select this parser,
     set useSAXPptxExtractor(true) on OfficeParserConfig (TIKA-2210).
 

http://git-wip-us.apache.org/repos/asf/tika/blob/ae44b9e5/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index a08b0e2..27853d3 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -91,6 +91,10 @@ public class TesseractOCRConfig implements Serializable {
     // factor by which image is to be scaled.
     private int resize = 900;
 
+    // whether or not to preserve interword spacing
+    private boolean preserveInterwordSpacing = false;
+
+
     /**
      * Default contructor.
      */
@@ -148,6 +152,7 @@ public class TesseractOCRConfig implements Serializable {
         } else if ("hocr".equals(outputTypeString)) {
             setOutputType(OUTPUT_TYPE.HOCR);
         }
+        setPreserveInterwordSpacing(getProp(props, "preserveInterwordSpacing", false));
 
         // set parameters for ImageMagick
         setEnableImageProcessing(
@@ -244,6 +249,22 @@ public class TesseractOCRConfig implements Serializable {
     }
 
     /**
+     * Sets whether interword spacing is preserved in the OCR output.  Default is <code>false</code>.
+     * When <code>true</code>, Tesseract is invoked with <code>preserve_interword_spaces=1</code>.
+     * @param preserveInterwordSpacing <code>true</code> to preserve spacing, <code>false</code> otherwise
+     */
+    public void setPreserveInterwordSpacing(boolean preserveInterwordSpacing) {
+        this.preserveInterwordSpacing = preserveInterwordSpacing;
+    }
+
+    /**
+     * @see #setPreserveInterwordSpacing(boolean preserveInterwordSpacing)
+     * @return <code>true</code> if interword spacing is to be preserved; <code>false</code> (the default) otherwise.
+     */
+    public boolean getPreserveInterwordSpacing() {
+        return preserveInterwordSpacing;
+    }
+    /**
      * @see #setMinFileSizeToOcr(int minFileSizeToOcr)
      */
     public int getMinFileSizeToOcr() {
@@ -481,4 +502,21 @@ public class TesseractOCRConfig implements Serializable {
     private String getProp(Properties properties, String property, String defaultMissing) {
         return properties.getProperty(property, defaultMissing);
     }
+
+    /** Reads a boolean property; a missing key yields <code>defaultMissing</code>.
+     *  Any value other than true/false (trimmed, case-insensitive) throws a RuntimeException. */
+    private boolean getProp(Properties properties, String property, boolean defaultMissing) {
+        String propVal = properties.getProperty(property);
+        if (propVal == null) {
+            return defaultMissing;
+        }
+        String trimmed = propVal.trim(); // Properties.load() keeps trailing whitespace in values
+        if (trimmed.equalsIgnoreCase("true") || trimmed.equalsIgnoreCase("false")) {
+            return Boolean.parseBoolean(trimmed);
+        }
+        throw new RuntimeException(String.format(Locale.ROOT,
+                "Cannot parse TesseractOCRConfig variable %s, invalid boolean value: %s",
+                property, propVal));
+    }
+
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/ae44b9e5/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index ffbef1c..46a3f55 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -417,9 +417,12 @@ public class TesseractOCRParser extends AbstractParser {
      *           if an input error occurred
      */
     private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
-        String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
-                config.getLanguage(), "-psm", config.getPageSegMode(), config.getOutputType().name().toLowerCase(Locale.US)};
 
+        String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
+                config.getLanguage(), "-psm", config.getPageSegMode(),
+                config.getOutputType().name().toLowerCase(Locale.US),
+                "-c",
+                (config.getPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0"};
         ProcessBuilder pb = new ProcessBuilder(cmd);
         setEnv(config, pb);
         final Process process = pb.start();
@@ -479,8 +482,9 @@ public class TesseractOCRParser extends AbstractParser {
         try (Reader reader = new InputStreamReader(stream, UTF_8)) {
             char[] buffer = new char[1024];
             for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
-                if (n > 0)
+                if (n > 0) {
                     xhtml.characters(buffer, 0, n);
+                }
             }
         }
         xhtml.endElement("div");

http://git-wip-us.apache.org/repos/asf/tika/blob/ae44b9e5/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
index ddcd473..3cc2bcd 100644
--- a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
+++ b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
@@ -21,6 +21,7 @@ maxFileSizeToOcr=2147483647
 minFileSizeToOcr=0
 timeout=120
 outputType=txt
+preserveInterwordSpacing=false
 
 # properties for image processing
 # to enable processing, set enableImageProcessing to 1

http://git-wip-us.apache.org/repos/asf/tika/blob/ae44b9e5/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index e0f89ac..4c0ab76 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -23,6 +23,8 @@ import static org.junit.Assume.assumeTrue;
 
 import java.io.InputStream;
 import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
@@ -256,4 +258,24 @@ public class TesseractOCRParserTest extends TikaTest {
     }
 
     //TODO: add unit tests for jp2/jpx/ppm TIKA-2174
+
+    //Verifies that preserveInterwordSpacing=true keeps the wide gap between words in OCR output.
+    @Test
+    public void testInterwordSpacing() throws Exception {
+        assumeTrue(canRun());
+        //default: the gap is collapsed to a single space
+        String xml = getXML("testOCR_spacing.png").xml;
+        assertContains("The quick", xml);
+
+        TesseractOCRConfig config = new TesseractOCRConfig();
+        config.setPreserveInterwordSpacing(true);
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(TesseractOCRConfig.class, config);
+
+        //with preserve interword spacing "on"; allow some flexibility in case Tesseract
+        //computes spaces somewhat differently in different versions/OS's, etc.
+        xml = getXML("testOCR_spacing.png", parseContext).xml;
+        Matcher m = Pattern.compile("The\\s{5,20}quick").matcher(xml);
+        assertTrue("expected 5-20 whitespace chars between 'The' and 'quick' in: " + xml, m.find());
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/ae44b9e5/tika-parsers/src/test/resources/test-documents/testOCR_spacing.png
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testOCR_spacing.png b/tika-parsers/src/test/resources/test-documents/testOCR_spacing.png
new file mode 100644
index 0000000..10a7e40
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testOCR_spacing.png differ