You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/20 21:32:55 UTC
[4/4] tika git commit: TIKA-2190 -- add configurability for preserve
interword spacing
TIKA-2190 -- add configurability for preserve interword spacing
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/ae44b9e5
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/ae44b9e5
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/ae44b9e5
Branch: refs/heads/master
Commit: ae44b9e507dbb11b9b9f5c57cf342b47966ffb66
Parents: c83f87b
Author: tballison <ta...@mitre.org>
Authored: Tue Dec 20 16:32:44 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Dec 20 16:32:44 2016 -0500
----------------------------------------------------------------------
CHANGES.txt | 3 ++
.../tika/parser/ocr/TesseractOCRConfig.java | 38 +++++++++++++++++++
.../tika/parser/ocr/TesseractOCRParser.java | 10 +++--
.../parser/ocr/TesseractOCRConfig.properties | 1 +
.../tika/parser/ocr/TesseractOCRParserTest.java | 22 +++++++++++
.../test-documents/testOCR_spacing.png | Bin 0 -> 7232 bytes
6 files changed, 71 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/ae44b9e5/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index e215499..bd6a45e 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 1.15 - ??
+ * Add configurability of "preserve-interword-spacing" to
+ TesseractOCRParser (TIKA-2190).
+
* Added experimental SAX parser for .pptx files. To select this parser,
set useSAXPptxExtractor(true) on OfficeParserConfig (TIKA-2210).
http://git-wip-us.apache.org/repos/asf/tika/blob/ae44b9e5/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index a08b0e2..27853d3 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -91,6 +91,10 @@ public class TesseractOCRConfig implements Serializable {
// factor by which image is to be scaled.
private int resize = 900;
+ // whether or not to preserve interword spacing
+ private boolean preserveInterwordSpacing = false;
+
+
/**
* Default contructor.
*/
@@ -148,6 +152,7 @@ public class TesseractOCRConfig implements Serializable {
} else if ("hocr".equals(outputTypeString)) {
setOutputType(OUTPUT_TYPE.HOCR);
}
+ setPreserveInterwordSpacing(getProp(props, "preserveInterwordSpacing", false));
// set parameters for ImageMagick
setEnableImageProcessing(
@@ -244,6 +249,22 @@ public class TesseractOCRConfig implements Serializable {
}
/**
+ * Whether or not to maintain interword spacing. Default is <code>false</code>.
+ *
+ * @param preserveInterwordSpacing <code>true</code> to preserve interword spacing, <code>false</code> otherwise
+ */
+ public void setPreserveInterwordSpacing(boolean preserveInterwordSpacing) {
+ this.preserveInterwordSpacing = preserveInterwordSpacing;
+ }
+
+ /**
+ * Gets the interword spacing setting.
+ * @return whether or not to maintain interword spacing.
+ */
+ public boolean getPreserveInterwordSpacing() {
+ return preserveInterwordSpacing;
+ }
+ /**
* @see #setMinFileSizeToOcr(int minFileSizeToOcr)
*/
public int getMinFileSizeToOcr() {
@@ -481,4 +502,21 @@ public class TesseractOCRConfig implements Serializable {
private String getProp(Properties properties, String property, String defaultMissing) {
return properties.getProperty(property, defaultMissing);
}
+
+ private boolean getProp(Properties properties, String property, boolean defaultMissing) {
+ String propVal = properties.getProperty(property);
+ if (propVal == null) {
+ return defaultMissing;
+ }
+ if (propVal.equalsIgnoreCase("true")) {
+ return true;
+ } else if (propVal.equalsIgnoreCase("false")) {
+ return false;
+ }
+
+ throw new RuntimeException(String.format(Locale.ROOT,
+ "Cannot parse TesseractOCRConfig variable %s, invalid boolean value: %s",
+ property, propVal));
+ }
+
}
http://git-wip-us.apache.org/repos/asf/tika/blob/ae44b9e5/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index ffbef1c..46a3f55 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -417,9 +417,12 @@ public class TesseractOCRParser extends AbstractParser {
* if an input error occurred
*/
private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
- String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
- config.getLanguage(), "-psm", config.getPageSegMode(), config.getOutputType().name().toLowerCase(Locale.US)};
+ String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
+ config.getLanguage(), "-psm", config.getPageSegMode(),
+ config.getOutputType().name().toLowerCase(Locale.US),
+ "-c",
+ (config.getPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0"};
ProcessBuilder pb = new ProcessBuilder(cmd);
setEnv(config, pb);
final Process process = pb.start();
@@ -479,8 +482,9 @@ public class TesseractOCRParser extends AbstractParser {
try (Reader reader = new InputStreamReader(stream, UTF_8)) {
char[] buffer = new char[1024];
for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
- if (n > 0)
+ if (n > 0) {
xhtml.characters(buffer, 0, n);
+ }
}
}
xhtml.endElement("div");
http://git-wip-us.apache.org/repos/asf/tika/blob/ae44b9e5/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
index ddcd473..3cc2bcd 100644
--- a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
+++ b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
@@ -21,6 +21,7 @@ maxFileSizeToOcr=2147483647
minFileSizeToOcr=0
timeout=120
outputType=txt
+preserveInterwordSpacing=false
# properties for image processing
# to enable processing, set enableImageProcessing to 1
http://git-wip-us.apache.org/repos/asf/tika/blob/ae44b9e5/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index e0f89ac..4c0ab76 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -23,6 +23,8 @@ import static org.junit.Assume.assumeTrue;
import java.io.InputStream;
import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
@@ -256,4 +258,24 @@ public class TesseractOCRParserTest extends TikaTest {
}
//TODO: add unit tests for jp2/jpx/ppm TIKA-2174
+
+ @Test
+ public void testInterwordSpacing() throws Exception {
+ assumeTrue(canRun());
+ //default
+ String xml = getXML("testOCR_spacing.png").xml;
+ assertContains("The quick", xml);
+
+ TesseractOCRConfig tesseractOCRConfigconfig = new TesseractOCRConfig();
+ tesseractOCRConfigconfig.setPreserveInterwordSpacing(true);
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(TesseractOCRConfig.class, tesseractOCRConfigconfig);
+
+ //with preserve interword spacing "on"
+ //allow some flexibility in case Tesseract is computing spaces
+ //somewhat differently in different versions/OS's, etc.
+ xml = getXML("testOCR_spacing.png", parseContext).xml;
+ Matcher m = Pattern.compile("The\\s{5,20}quick").matcher(xml);
+ assertTrue(m.find());
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/ae44b9e5/tika-parsers/src/test/resources/test-documents/testOCR_spacing.png
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testOCR_spacing.png b/tika-parsers/src/test/resources/test-documents/testOCR_spacing.png
new file mode 100644
index 0000000..10a7e40
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testOCR_spacing.png differ