You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/08/07 17:54:36 UTC
[tika] branch branch_1x updated: TIKA-2705 -- allow parameter
configuration for tesseract via tika-config.xml
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new 375e3d7 TIKA-2705 -- allow parameter configuration for tesseract via tika-config.xml
375e3d7 is described below
commit 375e3d76d896656cbb5dd8c6eff6176f729336fb
Author: TALLISON <ta...@apache.org>
AuthorDate: Tue Aug 7 13:53:43 2018 -0400
TIKA-2705 -- allow parameter configuration for tesseract via tika-config.xml
---
.../apache/tika/parser/ocr/TesseractOCRConfig.java | 38 ++++-
.../apache/tika/parser/ocr/TesseractOCRParser.java | 161 ++++++++++++++++-----
.../tika/parser/ocr/TesseractOCRParserTest.java | 33 +++++
.../org/apache/tika/config/TIKA-2705-tesseract.xml | 33 +++++
4 files changed, 219 insertions(+), 46 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index c6ff21c..1c65ece 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -71,10 +71,10 @@ public class TesseractOCRConfig implements Serializable {
private String pageSegMode = "1";
// Minimum file size to submit file to ocr.
- private int minFileSizeToOcr = 0;
+ private long minFileSizeToOcr = 0;
// Maximum file size to submit file to ocr.
- private int maxFileSizeToOcr = Integer.MAX_VALUE;
+ private long maxFileSizeToOcr = Integer.MAX_VALUE;
// Maximum time (seconds) to wait for the ocring process termination
private int timeout = 120;
@@ -322,9 +322,9 @@ public class TesseractOCRConfig implements Serializable {
return preserveInterwordSpacing;
}
/**
- * @see #setMinFileSizeToOcr(int minFileSizeToOcr)
+ * @see #setMinFileSizeToOcr(long minFileSizeToOcr)
*/
- public int getMinFileSizeToOcr() {
+ public long getMinFileSizeToOcr() {
return minFileSizeToOcr;
}
@@ -332,14 +332,14 @@ public class TesseractOCRConfig implements Serializable {
* Set minimum file size to submit file to ocr.
* Default is 0.
*/
- public void setMinFileSizeToOcr(int minFileSizeToOcr) {
+ public void setMinFileSizeToOcr(long minFileSizeToOcr) {
this.minFileSizeToOcr = minFileSizeToOcr;
}
/**
- * @see #setMaxFileSizeToOcr(int maxFileSizeToOcr)
+ * @see #setMaxFileSizeToOcr(long maxFileSizeToOcr)
*/
- public int getMaxFileSizeToOcr() {
+ public long getMaxFileSizeToOcr() {
return maxFileSizeToOcr;
}
@@ -347,7 +347,7 @@ public class TesseractOCRConfig implements Serializable {
* Set maximum file size to submit file to ocr.
* Default is Integer.MAX_VALUE.
*/
- public void setMaxFileSizeToOcr(int maxFileSizeToOcr) {
+ public void setMaxFileSizeToOcr(long maxFileSizeToOcr) {
this.maxFileSizeToOcr = maxFileSizeToOcr;
}
@@ -630,6 +630,28 @@ public class TesseractOCRConfig implements Serializable {
* @param defaultMissing default parameter to use.
* @return the value.
*/
+ private long getProp(Properties properties, String property, long defaultMissing) {
+ String p = properties.getProperty(property);
+ if (p == null || p.isEmpty()) {
+ return defaultMissing;
+ }
+ try {
+ return Long.parseLong(p); // accept the full long range; Integer.parseInt rejected values above Integer.MAX_VALUE
+ } catch (NumberFormatException ex) {
+ throw new RuntimeException(String.format(Locale.ROOT, "Cannot parse TesseractOCRConfig variable %s, invalid integer value",
+ property), ex);
+ }
+ }
+
+
+ /**
+ * Get property from the properties file passed in.
+ *
+ * @param properties properties file to read from.
+ * @param property the property to fetch.
+ * @param defaultMissing default parameter to use.
+ * @return the value.
+ */
private String getProp(Properties properties, String property, String defaultMissing) {
return properties.getProperty(property, defaultMissing);
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index d3238c2..df46f00 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -16,46 +16,13 @@
*/
package org.apache.tika.parser.ocr;
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-import javax.imageio.ImageIO;
-import javax.xml.parsers.SAXParser;
-import java.awt.Image;
-import java.awt.image.BufferedImage;
-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.OutputStreamWriter;
-import java.io.Reader;
-import java.nio.charset.Charset;
-import java.nio.file.Files;
-import java.nio.file.Paths;
-import java.nio.file.StandardCopyOption;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Set;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ExecutionException;
-import java.util.concurrent.FutureTask;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.TimeoutException;
-
import org.apache.commons.exec.CommandLine;
import org.apache.commons.exec.DefaultExecutor;
import org.apache.commons.exec.PumpStreamHandler;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.SystemUtils;
+import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
import org.apache.tika.config.Param;
@@ -84,6 +51,39 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
+import javax.imageio.ImageIO;
+import java.awt.Image;
+import java.awt.image.BufferedImage;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.nio.file.StandardCopyOption;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.FutureTask;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
/**
* TesseractOCRParser powered by tesseract-ocr engine. To enable this parser,
* create a {@link TesseractOCRConfig} object and pass it through a
@@ -106,13 +106,14 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
private static final long serialVersionUID = -8167538283213097265L;
- private static final TesseractOCRConfig DEFAULT_CONFIG = new TesseractOCRConfig();
private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
new HashSet<>(Arrays.asList(new MediaType[]{
MediaType.image("png"), MediaType.image("jpeg"), MediaType.image("tiff"),
MediaType.image("bmp"), MediaType.image("gif"), MediaType.image("jp2"),
MediaType.image("jpx"), MediaType.image("x-portable-pixmap")
})));
+ private final TesseractOCRConfig defaultConfig = new TesseractOCRConfig();
+
private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<>();
private static Map<String,Boolean> IMAGE_MAGICK_PRESENT = new HashMap<>();
@@ -120,7 +121,7 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
// If Tesseract is installed, offer our supported image types
- TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
+ TesseractOCRConfig config = context.get(TesseractOCRConfig.class, defaultConfig);
if (hasTesseract(config)) {
return SUPPORTED_TYPES;
}
@@ -258,7 +259,7 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext parseContext)
throws IOException, SAXException, TikaException {
- TesseractOCRConfig config = parseContext.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
+ TesseractOCRConfig config = parseContext.get(TesseractOCRConfig.class, defaultConfig);
// If Tesseract is not on the path with the current config, do not try to run OCR
// getSupportedTypes shouldn't have listed us as handling it, so this should only
// occur if someone directly calls this parser, not via DefaultParser or similar
@@ -471,7 +472,7 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
//by sending in a bogus tesseract path via a custom TesseractOCRConfig.
//TODO: figure out how to solve that.
if (! hasWarned()) {
- if (hasTesseract(DEFAULT_CONFIG)) {
+ if (hasTesseract(defaultConfig)) {
problemHandler.handleInitializableProblem(this.getClass().getName(),
"Tesseract OCR is installed and will be automatically applied to image files unless\n" +
"you've excluded the TesseractOCRParser from the default parser.\n"+
@@ -696,5 +697,89 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
protected void warn() {
HAS_WARNED = true;
}
+
+ @Field
+ public void setTesseractPath(String tesseractPath) {
+ defaultConfig.setTesseractPath(tesseractPath);
+ }
+
+ @Field
+ public void setTessdataPath(String tessdataPath) {
+ defaultConfig.setTessdataPath(tessdataPath);
+ }
+
+ @Field
+ public void setLanguage(String language) {
+ defaultConfig.setLanguage(language);
+ }
+
+ @Field
+ public void setPageSegMode(String pageSegMode) {
+ defaultConfig.setPageSegMode(pageSegMode);
+ }
+
+ /**
+ * Sets the minimum file size to submit to OCR on this parser's default config.
+ *
+ * @param minFileSizeToOcr value forwarded to {@link TesseractOCRConfig#setMinFileSizeToOcr(long)}
+ */
+ @Field
+ public void setMinFileSizeToOcr(long minFileSizeToOcr) {
+ defaultConfig.setMinFileSizeToOcr(minFileSizeToOcr);
+ }
+
+ /**
+ * Sets the maximum file size to submit to OCR on this parser's default config.
+ * Counterpart of {@link #setMinFileSizeToOcr(long)}, so that both size bounds
+ * can be supplied as parser parameters.
+ *
+ * @param maxFileSizeToOcr value forwarded to {@link TesseractOCRConfig#setMaxFileSizeToOcr(long)}
+ */
+ @Field
+ public void setMaxFileSizeToOcr(long maxFileSizeToOcr) {
+ defaultConfig.setMaxFileSizeToOcr(maxFileSizeToOcr);
+ }
+
+ @Field
+ public void setTimeout(int timeout) {
+ defaultConfig.setTimeout(timeout);
+ }
+
+ @Field
+ public void setOutputType(String outputType) {
+ defaultConfig.setOutputType(outputType);
+ }
+
+ @Field
+ public void setPreserveInterwordSpacing(boolean preserveInterwordSpacing) {
+ defaultConfig.setPreserveInterwordSpacing(preserveInterwordSpacing);
+ }
+
+ @Field
+ public void setEnableImageProcessing(int enableImageProcessing) {
+ defaultConfig.setEnableImageProcessing(enableImageProcessing);
+ }
+
+ @Field
+ public void setImageMagickPath(String imageMagickPath) {
+ defaultConfig.setImageMagickPath(imageMagickPath);
+ }
+
+ @Field
+ public void setDensity(int density) {
+ defaultConfig.setDensity(density);
+ }
+
+ @Field
+ public void setDepth(int depth) {
+ defaultConfig.setDepth(depth);
+ }
+
+ @Field
+ public void setColorspace(String colorspace) {
+ defaultConfig.setColorspace(colorspace);
+ }
+
+ @Field
+ public void setFilter(String filter) {
+ defaultConfig.setFilter(filter);
+ }
+
+ @Field
+ public void setResize(int resize) {
+ defaultConfig.setResize(resize);
+ }
+
+ @Field
+ public void setApplyRotation(boolean applyRotation) {
+ defaultConfig.setApplyRotation(applyRotation);
+ }
+
+ public TesseractOCRConfig getDefaultConfig() {
+ return defaultConfig;
+ }
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 039adf7..aea2c96 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -18,6 +18,7 @@ package org.apache.tika.parser.ocr;
import static org.apache.tika.parser.ocr.TesseractOCRParser.getTesseractProg;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assume.assumeTrue;
@@ -26,10 +27,13 @@ import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import org.apache.tika.Tika;
import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -303,4 +307,33 @@ public class TesseractOCRParserTest extends TikaTest {
assertContains("Its had resolving otherwise she contented therefore", ocr);
}
}
+
+ @Test
+ public void testConfig() throws Exception {
+ TikaConfig config = new TikaConfig(getResourceAsStream("/org/apache/tika/config/TIKA-2705-tesseract.xml"));
+ Parser p = config.getParser();
+ Parser tesseractOCRParser = findParser(p, org.apache.tika.parser.ocr.TesseractOCRParser.class);
+ assertNotNull(tesseractOCRParser);
+
+ TesseractOCRConfig tesseractOCRConfig = ((TesseractOCRParser)tesseractOCRParser).getDefaultConfig();
+ assertEquals(241, tesseractOCRConfig.getTimeout());
+ assertEquals(TesseractOCRConfig.OUTPUT_TYPE.HOCR, tesseractOCRConfig.getOutputType());
+ assertEquals("ceb", tesseractOCRConfig.getLanguage());
+ assertEquals(false, tesseractOCRConfig.getApplyRotation());
+ assertContains("myspecial", tesseractOCRConfig.getTesseractPath());
+ }
+
+ /**
+ * Searches a parser tree depth-first for the first parser that is an instance of the given class.
+ * Composite parsers are only descended into; a {@link CompositeParser} is never returned itself.
+ *
+ * @param parser root of the parser tree to search
+ * @param clazz class the returned parser must be an instance of
+ * @return the first matching non-composite parser, or {@code null} if none is found
+ */
+ private Parser findParser(Parser parser, Class<?> clazz) {
+ if (parser instanceof CompositeParser) {
+ for (Parser child : ((CompositeParser)parser).getAllComponentParsers()) {
+ Parser found = findParser(child, clazz);
+ if (found != null) {
+ return found;
+ }
+ }
+ } else if (clazz.isInstance(parser)) {
+ return parser;
+ }
+ return null;
+ }
}
diff --git a/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml b/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml
new file mode 100644
index 0000000..c77d7e4
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
+ </parser>
+ <parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
+ <params>
+ <param name="timeout" type="int">241</param>
+ <param name="tesseractPath" type="string">/myspecial/tess</param>
+ <param name="outputType" type="string">hocr</param>
+ <param name="applyRotation" type="bool">false</param>
+ <param name="language" type="string">ceb</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>