You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/20 21:34:18 UTC
[2/2] tika git commit: TIKA-2190 -- add configurability for preserve interword spacing
TIKA-2190 -- add configurability for preserve interword spacing
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/0d30aa1b
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/0d30aa1b
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/0d30aa1b
Branch: refs/heads/2.x
Commit: 0d30aa1b216c3e047c77df8c0c32cdc648fa5072
Parents: 54154e0
Author: tballison <ta...@mitre.org>
Authored: Tue Dec 20 16:33:30 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Dec 20 16:33:30 2016 -0500
----------------------------------------------------------------------
CHANGES.txt | 3 ++
.../tika/parser/ocr/TesseractOCRConfig.java | 40 +++++++++++++++++++-
.../tika/parser/ocr/TesseractOCRParser.java | 10 +++--
.../parser/ocr/TesseractOCRConfig.properties | 3 +-
.../tika/parser/ocr/TesseractOCRParserTest.java | 22 +++++++++++
5 files changed, 73 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/0d30aa1b/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index daba7df..b34559f 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,6 +17,9 @@ Release 2.0 - ???
Release 1.15 -???
+ * Add configurability of "preserve-interword-spacing" to
+ TesseractOCRParser (TIKA-2190).
+
* Allow extraction of PDActions (including Javascript) from
PDFs (TIKA-2090).
http://git-wip-us.apache.org/repos/asf/tika/blob/0d30aa1b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index 5d06a7a..c39ac83 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -71,6 +71,9 @@ public class TesseractOCRConfig implements Serializable{
// The format of the ocr'ed output to be returned, txt or hocr.
private OUTPUT_TYPE outputType = OUTPUT_TYPE.TXT;
+ // whether or not to preserve interword spacing
+ private boolean preserveInterwordSpacing = false;
+
// enable image processing (optional)
private int enableImageProcessing = 0;
@@ -150,7 +153,9 @@ public class TesseractOCRConfig implements Serializable{
setOutputType(OUTPUT_TYPE.HOCR);
}
- // set parameters for ImageMagick
+ setPreserveInterwordSpacing(getProp(props, "preserveInterwordSpacing", false));
+
+ // set parameters for ImageMagick
setEnableImageProcessing(
getProp(props, "enableImageProcessing", isEnableImageProcessing()));
setImageMagickPath(
@@ -303,6 +308,23 @@ public class TesseractOCRConfig implements Serializable{
this.enableImageProcessing = enableImageProcessing;
}
+ /**
+ * Whether or not to maintain interword spacing. Default is <code>false</code>.
+ *
+ * @param preserveInterwordSpacing
+ */
+ public void setPreserveInterwordSpacing(boolean preserveInterwordSpacing) {
+ this.preserveInterwordSpacing = preserveInterwordSpacing;
+ }
+
+ /**
+ *
+ * @return whether or not to maintain interword spacing.
+ */
+ public boolean getPreserveInterwordSpacing() {
+ return preserveInterwordSpacing;
+ }
+
/**
* @return the density
*/
@@ -459,4 +481,20 @@ public class TesseractOCRConfig implements Serializable{
private String getProp(Properties properties, String property, String defaultMissing) {
return properties.getProperty(property, defaultMissing);
}
+
+ private boolean getProp(Properties properties, String property, boolean defaultMissing) {
+ String propVal = properties.getProperty(property);
+ if (propVal == null) {
+ return defaultMissing;
+ }
+ if (propVal.equalsIgnoreCase("true")) {
+ return true;
+ } else if (propVal.equalsIgnoreCase("false")) {
+ return false;
+ }
+
+ throw new RuntimeException(String.format(Locale.ROOT,
+ "Cannot parse TesseractOCRConfig variable %s, invalid boolean value: %s",
+ property, propVal));
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/0d30aa1b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 0ac2b6b..feaedd1 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -418,9 +418,12 @@ public class TesseractOCRParser extends AbstractParser {
* if an input error occurred
*/
private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
- String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
- config.getLanguage(), "-psm", config.getPageSegMode(), config.getOutputType().name().toLowerCase(Locale.US)};
+ String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
+ config.getLanguage(), "-psm", config.getPageSegMode(),
+ config.getOutputType().name().toLowerCase(Locale.US),
+ "-c",
+ (config.getPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0"};
ProcessBuilder pb = new ProcessBuilder(cmd);
setEnv(config, pb);
final Process process = pb.start();
@@ -480,8 +483,9 @@ public class TesseractOCRParser extends AbstractParser {
try (Reader reader = new InputStreamReader(stream, UTF_8)) {
char[] buffer = new char[1024];
for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
- if (n > 0)
+ if (n > 0) {
xhtml.characters(buffer, 0, n);
+ }
}
}
xhtml.endElement("div");
http://git-wip-us.apache.org/repos/asf/tika/blob/0d30aa1b/tika-parser-modules/tika-parser-multimedia-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties b/tika-parser-modules/tika-parser-multimedia-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
index cb2151c..d81c339 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
@@ -18,4 +18,5 @@ language=eng
pageSegMode=1
maxFileSizeToOcr=2147483647
minFileSizeToOcr=0
-timeout=120
\ No newline at end of file
+timeout=120
+preserveInterwordSpacing=false
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/0d30aa1b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 82414ef..b09e754 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -23,6 +23,8 @@ import static org.junit.Assume.assumeTrue;
import java.io.InputStream;
import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
@@ -256,4 +258,24 @@ public class TesseractOCRParserTest extends TikaTest {
}
//TODO: add unit tests for jp2/jpx and ppm
+
+ @Test
+ public void testInterwordSpacing() throws Exception {
+ assumeTrue(canRun());
+ //default
+ String xml = getXML("testOCR_spacing.png").xml;
+ assertContains("The quick", xml);
+
+ TesseractOCRConfig tesseractOCRConfigconfig = new TesseractOCRConfig();
+ tesseractOCRConfigconfig.setPreserveInterwordSpacing(true);
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(TesseractOCRConfig.class, tesseractOCRConfigconfig);
+
+ //with preserve interwordspacing "on"
+ //allow some flexibility in case Tesseract is computing spaces
+ //somewhat differently in different versions/OS's, etc.
+ xml = getXML("testOCR_spacing.png", parseContext).xml;
+ Matcher m = Pattern.compile("The\\s{5,20}quick").matcher(xml);
+ assertTrue(m.find());
+ }
}