You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/02/09 21:02:29 UTC
[tika] branch main updated: TIKA-3297 -- remove .properties as an
option for parsers...starting with tesseract; pdf in a followup commit.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 3f80efd TIKA-3297 -- remove .properties as an option for parsers...starting with tesseract; pdf in a followup commit.
new e8b47f7 Merge remote-tracking branch 'origin/main' into main
3f80efd is described below
commit 3f80efda2890673160492bd4b9d570c88ff89de9
Author: tballison <ta...@apache.org>
AuthorDate: Tue Feb 9 16:02:01 2021 -0500
TIKA-3297 -- remove .properties as an option for parsers...starting with tesseract; pdf in a followup commit.
---
.../java/org/apache/tika/config/TikaConfig.java | 5 +-
.../apache/tika/parser/ocr/ImagePreprocessor.java | 50 +--
.../apache/tika/parser/ocr/TesseractOCRConfig.java | 375 +++++----------------
.../apache/tika/parser/ocr/TesseractOCRParser.java | 314 +++++++++++------
.../tika/parser/ocr/TesseractOCRConfig.properties | 38 ---
.../tika/parser/ocr/TesseractOCRConfigTest.java | 105 +++---
.../tika/parser/ocr/TesseractOCRParserTest.java | 116 ++++---
.../StringsConfig-full.properties | 0
.../StringsConfig-partial.properties | 0
.../TIKA-2705-tesseract.xml | 1 -
.../tika-config-tesseract-arbitrary.xml} | 13 +-
.../tika-config-tesseract-full.xml} | 20 +-
.../tika-config-tesseract-partial.xml} | 17 +-
.../TesseractOCRConfig-full.properties | 29 --
.../TesseractOCRConfig-partial.properties | 24 --
.../tika/parser/ocr/TesseractOCRParserTest.java | 23 +-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 6 +-
17 files changed, 482 insertions(+), 654 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index e29a337..041a6a4 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -654,7 +654,8 @@ public class TikaConfig {
List<T> loaded = new ArrayList<T>();
// Find the children of the parent tag, if any
- for (Element le : getTopLevelElementChildren(element, getParentTagName(), getLoaderTagName())) {T loadedChild = loadOne(le, mimeTypes, loader);
+ for (Element le : getTopLevelElementChildren(element, getParentTagName(), getLoaderTagName())) {
+ T loadedChild = loadOne(le, mimeTypes, loader);
if (loadedChild != null) loaded.add(loadedChild);
}
@@ -742,7 +743,7 @@ public class TikaConfig {
excludeChildren.add(loader.getServiceClass(getLoaderClass(), exclName));
} catch (ClassNotFoundException e) {
//TIKA-3268 -- This should stop the world.
- throw new TikaConfigException("Class now found in -exclude list: " + exclName);
+ throw new TikaConfigException("Class not found in -exclude list: " + exclName);
}
}
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/ImagePreprocessor.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/ImagePreprocessor.java
index 9c84227..764f16e 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/ImagePreprocessor.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/ImagePreprocessor.java
@@ -33,6 +33,7 @@ import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
+import java.io.Serializable;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -47,45 +48,14 @@ import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.stream.Stream;
-class ImagePreprocessor {
+class ImagePreprocessor implements Serializable {
private static final Map<String, Boolean> IMAGE_MAGICK_PRESENT = new HashMap<>();
private static final Logger LOG = LoggerFactory.getLogger(TesseractOCRParser.class);
private static final double MINIMUM_DESKEW_THRESHOLD = 1.0D;
- public static boolean hasImageMagick(TesseractOCRConfig config) {
- // Fetch where the config says to find ImageMagick Program
- String ImageMagick = getImageMagickPath(config);
-
- // Have we already checked for a copy of ImageMagick Program there?
- if (IMAGE_MAGICK_PRESENT.containsKey(ImageMagick)) {
- return IMAGE_MAGICK_PRESENT.get(ImageMagick);
- }
- //prevent memory bloat
- if (IMAGE_MAGICK_PRESENT.size() > 100) {
- IMAGE_MAGICK_PRESENT.clear();
- }
- //check that directory exists
- if (!config.getImageMagickPath().isEmpty() &&
- ! Files.isDirectory(Paths.get(config.getImageMagickPath()))) {
- IMAGE_MAGICK_PRESENT.put(ImageMagick, false);
- return false;
- }
-
- // Try running ImageMagick program from there, and see if it exists + works
- String[] checkCmd = { ImageMagick };
- boolean hasImageMagick = ExternalParser.check(checkCmd);
- if (!hasImageMagick) {
- LOG.warn("ImageMagick does not appear to be installed " +
- "(commandline: "+ImageMagick+")");
- }
- IMAGE_MAGICK_PRESENT.put(ImageMagick, hasImageMagick);
-
- return hasImageMagick;
- }
-
-
- private static String getImageMagickPath(TesseractOCRConfig config) {
- return config.getImageMagickPath() + getImageMagickProg();
+ private final String fullImageMagickPath;
+ ImagePreprocessor(String fullImageMagickPath) {
+ this.fullImageMagickPath = fullImageMagickPath;
}
@@ -100,10 +70,7 @@ class ImagePreprocessor {
if (config.isEnableImageProcessing() || config.isApplyRotation() && angle != 0) {
// process the image - parameter values can be set in TesseractOCRConfig.properties
- CommandLine commandLine = new CommandLine(getImageMagickPath(config));
- if (System.getProperty("os.name").startsWith("Windows")) {
- commandLine.addArgument("convert");
- }
+ CommandLine commandLine = new CommandLine(fullImageMagickPath);
// Arguments for ImageMagick
final List<String> density = Arrays.asList("-density", Integer.toString(config.getDensity()));
@@ -179,8 +146,5 @@ class ImagePreprocessor {
return angle;
}
- public static String getImageMagickProg() {
- return System.getProperty("os.name").startsWith("Windows") ?
- "magick" : "convert";
- }
+
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index 6c78cc1..ad68c4e 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -16,39 +16,36 @@
*/
package org.apache.tika.parser.ocr;
-import org.apache.commons.io.FilenameUtils;
+import org.apache.tika.exception.TikaException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
import java.io.Serializable;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
+import java.lang.reflect.Field;
+import java.lang.reflect.Modifier;
import java.util.ArrayList;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
-import java.util.Properties;
+import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Configuration for TesseractOCRParser.
+ * This class is not thread safe and must be synchronized externally.
* <p>
- * This allows to enable TesseractOCRParser and set its parameters:
- * <p>
- * TesseractOCRConfig config = new TesseractOCRConfig();<br>
- * config.setTesseractPath(tesseractFolder);<br>
- * parseContext.set(TesseractOCRConfig.class, config);<br>
- * </p>
- * <p>
- * Parameters can also be set by either editing the existing TesseractOCRConfig.properties file in,
- * tika-parser/src/main/resources/org/apache/tika/parser/ocr, or overriding it by creating your own
- * and placing it in the package org/apache/tika/parser/ocr on the classpath.
+ * This class permanently remembers every field that has been set through
+ * a set* method, and {@link #cloneAndUpdate(TesseractOCRConfig)} copies
+ * every field that was set on the "update" config into the returned clone.
+ * For example, suppose you change the language from "eng" to "fra" on an
+ * update config for one parse, and then reuse that same update object on
+ * another parse to change depth to 5. If you expect the language to revert
+ * to "eng" on the second parse, you'll be wrong: it is still "fra".
+ * Create a new update config for each parse unless you're only changing the
+ * same field(s) with every parse.
*/
public class TesseractOCRConfig implements Serializable {
@@ -67,14 +64,6 @@ public class TesseractOCRConfig implements Serializable {
HOCR
}
- // Path to tesseract installation folder, if not on system path.
- private String tesseractPath = "";
-
- // Path to the 'tessdata' folder, which contains language files and config files.
- private String tessdataPath = "";
-
- private Path actualTessdataPath;
-
// Language dictionary to be used.
private String language = "eng";
@@ -88,7 +77,7 @@ public class TesseractOCRConfig implements Serializable {
private long maxFileSizeToOcr = Integer.MAX_VALUE;
// Maximum time (seconds) to wait for the ocring process termination
- private int timeout = 120;
+ private int timeoutSeconds = 120;
// The format of the ocr'ed output to be returned, txt or hocr.
private OUTPUT_TYPE outputType = OUTPUT_TYPE.TXT;
@@ -96,9 +85,6 @@ public class TesseractOCRConfig implements Serializable {
// enable image processing (optional)
private boolean enableImageProcessing = false;
- // Path to ImageMagick program, if not on system path.
- private String imageMagickPath = "";
-
// resolution of processed image (in dpi).
private int density = 300;
@@ -125,128 +111,13 @@ public class TesseractOCRConfig implements Serializable {
// whether or not to apply rotation calculated by the rotation.py script
private boolean applyRotation = false;
+ // runtime switch to turn off OCR
+ private boolean skipOcr = false;
+
// See addOtherTesseractConfig.
private Map<String, String> otherTesseractConfig = new HashMap<>();
-
- /**
- * Default constructor.
- */
- public TesseractOCRConfig() {
- init(this.getClass().getResourceAsStream("TesseractOCRConfig.properties"));
- }
-
- /**
- * Loads properties from InputStream and then tries to close InputStream.
- * If there is an IOException, this silently swallows the exception
- * and goes back to the default.
- *
- * @param is
- */
- public TesseractOCRConfig(InputStream is) {
- init(is);
- }
-
- private void init(InputStream is) {
- if (is == null) {
- return;
- }
- Properties props = new Properties();
- try {
- props.load(is);
- } catch (IOException e) {
- } finally {
- if (is != null) {
- try {
- is.close();
- } catch (IOException e) {
- //swallow
- }
- }
- }
-
- // set parameters for Tesseract
- setTesseractPath(
- getProp(props, "tesseractPath", getTesseractPath()));
- setTessdataPath(
- getProp(props, "tessdataPath", getTessdataPath()));
- setLanguage(
- getProp(props, "language", getLanguage()));
- setPageSegMode(
- getProp(props, "pageSegMode", getPageSegMode()));
- setMinFileSizeToOcr(
- getProp(props, "minFileSizeToOcr", getMinFileSizeToOcr()));
- setMaxFileSizeToOcr(
- getProp(props, "maxFileSizeToOcr", getMaxFileSizeToOcr()));
- setTimeout(
- getProp(props, "timeout", getTimeout()));
- setOutputType(getProp(props, "outputType", getOutputType().toString()));
- setPreserveInterwordSpacing(getProp(props, "preserveInterwordSpacing", false));
-
- // set parameters for ImageMagick
- setEnableImageProcessing(
- getProp(props, "enableImageProcessing", isEnableImageProcessing()));
- setImageMagickPath(
- getProp(props, "ImageMagickPath", getImageMagickPath()));
- setDensity(
- getProp(props, "density", getDensity()));
- setDepth(
- getProp(props, "depth", getDepth()));
- setColorspace(
- getProp(props, "colorspace", getColorspace()));
- setFilter(
- getProp(props, "filter", getFilter()));
- setResize(
- getProp(props, "resize", getResize()));
- setApplyRotation(
- getProp(props, "applyRotation", isApplyRotation()));
-
- loadOtherTesseractConfig(props);
- }
-
- /**
- * @see #setTesseractPath(String tesseractPath)
- */
- public String getTesseractPath() {
- return tesseractPath;
- }
-
- /**
- * Set the path to the Tesseract executable's directory, needed if it is not on system path.
- * <p>
- * Note that if you set this value, it is highly recommended that you also
- * set the path to the 'tessdata' folder using {@link #setTessdataPath}.
- * </p>
- */
- public void setTesseractPath(String tesseractPath) {
-
- tesseractPath = FilenameUtils.normalize(tesseractPath);
- if (!tesseractPath.isEmpty() && !tesseractPath.endsWith(File.separator))
- tesseractPath += File.separator;
-
- this.tesseractPath = tesseractPath;
- }
-
- /**
- * @see #setTessdataPath(String tessdataPath)
- */
- public String getTessdataPath() {
- return tessdataPath;
- }
-
- /**
- * Set the path to the 'tessdata' folder, which contains language files and config files. In some cases (such
- * as on Windows), this folder is found in the Tesseract installation, but in other cases
- * (such as when Tesseract is built from source), it may be located elsewhere.
- */
- public void setTessdataPath(String tessdataPath) {
- tessdataPath = FilenameUtils.normalize(tessdataPath);
- if (!tessdataPath.isEmpty() && !tessdataPath.endsWith(File.separator))
- tessdataPath += File.separator;
-
- this.tessdataPath = tessdataPath;
- }
-
+ private Set<String> userConfigured = new HashSet<>();
/**
* @see #setLanguage(String language)
*/
@@ -279,8 +150,6 @@ public class TesseractOCRConfig implements Serializable {
// First, make sure it conforms to the correct syntax
if (!lang.matches("([a-zA-Z]{3}(_[a-zA-Z]{3,4}){0,2})|script(/|\\\\)[A-Z][a-zA-Z_]+")) {
invalidCodes.add(lang + " (invalid syntax)");
- } else if (!langExists(lang)) {
- invalidCodes.add(lang + " (not found)");
}
}
if (!invalidCodes.isEmpty()) {
@@ -288,30 +157,7 @@ public class TesseractOCRConfig implements Serializable {
"Invalid language code(s): " + invalidCodes);
}
this.language = language;
- }
- /**
- * Check if tessdata language model exists
- */
- private boolean langExists(String lang) {
- if (actualTessdataPath == null) {
- // Use the same logic used in TesseractOCRParser.setEnv().
- // If tessdataPath is not specified then use tesseractPath, if specified
- if (!tessdataPath.isEmpty()) {
- actualTessdataPath = Paths.get(tessdataPath);
- } else if (!tesseractPath.isEmpty()) {
- actualTessdataPath = Paths.get(tesseractPath, "tessdata");
- } else {
- // Neither path was specified, so we'll just assume
- // the language is good and rely on Tesseract to tell us if there's a problem
- return true;
- }
- }
-
- if (!Files.isDirectory(actualTessdataPath)) {
- throw new IllegalArgumentException(actualTessdataPath + " is not a directory");
- }
- String trainedDataName = lang + ".traineddata";
- return Files.isRegularFile(actualTessdataPath.resolve(trainedDataName));
+ userConfigured.add("language");
}
/**
@@ -330,6 +176,7 @@ public class TesseractOCRConfig implements Serializable {
throw new IllegalArgumentException("Invalid page segmentation mode");
}
this.pageSegMode = pageSegMode;
+ userConfigured.add("pageSegMode");
}
/**
@@ -354,6 +201,7 @@ public class TesseractOCRConfig implements Serializable {
"If you trust this value, set it with setTrustedPageSeparator");
}
setTrustedPageSeparator(pageSeparator);
+ userConfigured.add("pageSeparator");
}
/**
@@ -373,6 +221,7 @@ public class TesseractOCRConfig implements Serializable {
*/
public void setPreserveInterwordSpacing(boolean preserveInterwordSpacing) {
this.preserveInterwordSpacing = preserveInterwordSpacing;
+ userConfigured.add("preserveInterwordSpacing");
}
/**
@@ -395,6 +244,7 @@ public class TesseractOCRConfig implements Serializable {
*/
public void setMinFileSizeToOcr(long minFileSizeToOcr) {
this.minFileSizeToOcr = minFileSizeToOcr;
+ userConfigured.add("minFileSizeToOcr");
}
/**
@@ -410,22 +260,24 @@ public class TesseractOCRConfig implements Serializable {
*/
public void setMaxFileSizeToOcr(long maxFileSizeToOcr) {
this.maxFileSizeToOcr = maxFileSizeToOcr;
+ userConfigured.add("maxFileSizeToOcr");
}
/**
* Set maximum time (seconds) to wait for the ocring process to terminate.
* Default value is 120s.
*/
- public void setTimeout(int timeout) {
- this.timeout = timeout;
+ public void setTimeoutSeconds(int timeoutSeconds) {
+ this.timeoutSeconds = timeoutSeconds;
+ userConfigured.add("timeoutSeconds");
}
/**
* @return timeout value for Tesseract
- * @see #setTimeout(int timeout)
+ * @see #setTimeoutSeconds(int timeoutSeconds)
*/
- public int getTimeout() {
- return timeout;
+ public int getTimeoutSeconds() {
+ return timeoutSeconds;
}
/**
@@ -434,6 +286,7 @@ public class TesseractOCRConfig implements Serializable {
*/
public void setOutputType(OUTPUT_TYPE outputType) {
this.outputType = outputType;
+ userConfigured.add("outputType");
}
public void setOutputType(String outputType) {
@@ -448,8 +301,6 @@ public class TesseractOCRConfig implements Serializable {
} else {
throw new IllegalArgumentException("outputType must be either 'txt' or 'hocr'");
}
-
-
}
/**
@@ -473,6 +324,7 @@ public class TesseractOCRConfig implements Serializable {
*/
public void setEnableImageProcessing(boolean enableImageProcessing) {
this.enableImageProcessing = enableImageProcessing;
+ userConfigured.add("enableImageProcessing");
}
/**
@@ -491,6 +343,7 @@ public class TesseractOCRConfig implements Serializable {
throw new IllegalArgumentException("Invalid density value. Valid range of values is 150-1200.");
}
this.density = density;
+ userConfigured.add("density");
}
/**
@@ -509,6 +362,7 @@ public class TesseractOCRConfig implements Serializable {
for (int allowedValue : allowedValues) {
if (depth == allowedValue) {
this.depth = depth;
+ userConfigured.add("depth");
return;
}
}
@@ -534,6 +388,7 @@ public class TesseractOCRConfig implements Serializable {
throw new IllegalArgumentException("colorspace must match this pattern: (?i)^[-_A-Z0-9]+$");
}
this.colorspace = colorspace;
+ userConfigured.add("colorspace");
}
/**
@@ -557,6 +412,7 @@ public class TesseractOCRConfig implements Serializable {
for (String allowedFilter : allowedFilters) {
if (filter.equalsIgnoreCase(allowedFilter)) {
this.filter = filter;
+ userConfigured.add("filter");
return;
}
}
@@ -565,6 +421,20 @@ public class TesseractOCRConfig implements Serializable {
}
/**
+ * If you want to turn off OCR at run time for a specific file,
+ * set this to <code>true</code>.
+ * @param skipOcr <code>true</code> to skip OCR
+ */
+ public void setSkipOcr(boolean skipOcr) {
+ this.skipOcr = skipOcr;
+ userConfigured.add("skipOcr");
+ }
+
+ public boolean isSkipOcr() {
+ return skipOcr;
+ }
+
+ /**
* @return the resize
*/
public int getResize() {
@@ -579,6 +449,7 @@ public class TesseractOCRConfig implements Serializable {
for (int i = 1; i < 10; i++) {
if (resize == i * 100) {
this.resize = resize;
+ userConfigured.add("resize");
return;
}
}
@@ -586,29 +457,6 @@ public class TesseractOCRConfig implements Serializable {
}
/**
- * @return path to ImageMagick executable directory.
- * @see #setImageMagickPath(String imageMagickPath)
- */
- public String getImageMagickPath() {
-
- return imageMagickPath;
- }
-
- /**
- * Set the path to the ImageMagick executable directory, needed if it is not on system path.
- *
- * @param imageMagickPath to ImageMagick executable directory.
- */
- public void setImageMagickPath(String imageMagickPath) {
- imageMagickPath = FilenameUtils.normalize(imageMagickPath);
- if (!imageMagickPath.isEmpty() && !imageMagickPath.endsWith(File.separator)) {
- imageMagickPath += File.separator;
- }
- this.imageMagickPath = imageMagickPath;
- }
-
-
- /**
* @return Whether or not a rotation value should be calculated and passed to ImageMagick before performing OCR.
*/
public boolean isApplyRotation() {
@@ -622,6 +470,7 @@ public class TesseractOCRConfig implements Serializable {
*/
public void setApplyRotation(boolean applyRotation) {
this.applyRotation = applyRotation;
+ userConfigured.add("applyRotation");
}
/**
@@ -658,93 +507,43 @@ public class TesseractOCRConfig implements Serializable {
if (!m.find()) {
throw new IllegalArgumentException("Value contains illegal characters: " + value);
}
-
otherTesseractConfig.put(key.trim(), value.trim());
+ userConfigured.add("otherTesseractConfig");
}
- /**
- * Get property from the properties file passed in.
- *
- * @param properties properties file to read from.
- * @param property the property to fetch.
- * @param defaultMissing default parameter to use.
- * @return the value.
- */
- private int getProp(Properties properties, String property, int defaultMissing) {
- String p = properties.getProperty(property);
- if (p == null || p.isEmpty()) {
- return defaultMissing;
- }
- try {
- return Integer.parseInt(p);
- } catch (Throwable ex) {
- throw new RuntimeException(String.format(Locale.ROOT, "Cannot parse TesseractOCRConfig variable %s, invalid integer value",
- property), ex);
- }
- }
-
- /**
- * Get property from the properties file passed in.
- *
- * @param properties properties file to read from.
- * @param property the property to fetch.
- * @param defaultMissing default parameter to use.
- * @return the value.
- */
- private long getProp(Properties properties, String property, long defaultMissing) {
- String p = properties.getProperty(property);
- if (p == null || p.isEmpty()) {
- return defaultMissing;
- }
- try {
- return Integer.parseInt(p);
- } catch (Throwable ex) {
- throw new RuntimeException(String.format(Locale.ROOT, "Cannot parse TesseractOCRConfig variable %s, invalid integer value",
- property), ex);
- }
- }
-
-
- /**
- * Get property from the properties file passed in.
- *
- * @param properties properties file to read from.
- * @param property the property to fetch.
- * @param defaultMissing default parameter to use.
- * @return the value.
- */
- private String getProp(Properties properties, String property, String defaultMissing) {
- return properties.getProperty(property, defaultMissing);
- }
-
- private boolean getProp(Properties properties, String property, boolean defaultMissing) {
- String propVal = properties.getProperty(property);
- if (propVal == null) {
- return defaultMissing;
- }
- if (propVal.equalsIgnoreCase("true")) {
- return true;
- } else if (propVal.equalsIgnoreCase("false")) {
- return false;
- }
-
- throw new RuntimeException(String.format(Locale.ROOT,
- "Cannot parse TesseractOCRConfig variable %s, invalid boolean value: %s",
- property, propVal));
- }
-
- /**
- * Populate otherTesseractConfig from the given properties.
- * This assumes that any key-value pair where the key contains
- * an underscore is an option to be passed opaquely to Tesseract.
- *
- * @param properties properties file to read from.
- */
- private void loadOtherTesseractConfig(Properties properties) {
- for (String k : properties.stringPropertyNames()) {
- if (k.contains("_")) {
- addOtherTesseractConfig(k, properties.getProperty(k));
+ public TesseractOCRConfig cloneAndUpdate(TesseractOCRConfig updates) throws TikaException {
+ TesseractOCRConfig updated = new TesseractOCRConfig();
+ for (Field field : this.getClass().getDeclaredFields()) {
+ if (Modifier.isFinal(field.getModifiers())) {
+ continue;
+ } else if (Modifier.isStatic(field.getModifiers())) {
+ continue;
+ }
+ if ("userConfigured".equals(field.getName())) {
+ continue;
+ }
+ if ("otherTesseractConfig".equals(field.getName())
+ && updates.userConfigured.contains(field.getName())) {
+ //deep copy
+ for (Map.Entry<String, String> e : updates.getOtherTesseractConfig().entrySet()) {
+ updated.addOtherTesseractConfig(e.getKey(), e.getValue());
+ }
+ continue;
+ }
+ if (updates.userConfigured.contains(field.getName())) {
+ try {
+ field.set(updated, field.get(updates));
+ } catch (IllegalAccessException e) {
+ throw new TikaException("can't update " + field.getName(), e);
+ }
+ } else {
+ try {
+ field.set(updated, field.get(this));
+ } catch (IllegalAccessException e) {
+ throw new TikaException("can't update " + field.getName(), e);
+ }
}
}
+ return updated;
}
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 54d9388..6aea516 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -16,6 +16,7 @@
*/
package org.apache.tika.parser.ocr;
+import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
@@ -33,6 +34,7 @@ import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
import org.apache.tika.utils.XMLReaderUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -60,19 +62,12 @@ import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
-import java.util.HashMap;
import java.util.HashSet;
+import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ExecutionException;
-import java.util.concurrent.Future;
-import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit;
-import java.util.concurrent.TimeoutException;
-import java.util.concurrent.atomic.AtomicBoolean;
-import java.util.concurrent.atomic.AtomicInteger;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
@@ -88,13 +83,12 @@ import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
* config.setTesseractPath(tesseractFolder);<br>
* parseContext.set(TesseractOCRConfig.class, config);<br>
* </p>
- *
- *
*/
-public class TesseractOCRParser extends AbstractParser {
+public class TesseractOCRParser extends AbstractParser implements Initializable {
+
public static final String TESS_META = "tess:";
- public static final Property IMAGE_ROTATION = Property.externalRealSeq(TESS_META+"rotation");
- public static final Property IMAGE_MAGICK = Property.externalBooleanSeq(TESS_META+"image_magick_processed");
+ public static final Property IMAGE_ROTATION = Property.externalRealSeq(TESS_META + "rotation");
+ public static final Property IMAGE_MAGICK = Property.externalBooleanSeq(TESS_META + "image_magick_processed");
private static final String OCR = "ocr-";
private static final Logger LOG = LoggerFactory.getLogger(TesseractOCRParser.class);
@@ -105,79 +99,103 @@ public class TesseractOCRParser extends AbstractParser {
private static final long serialVersionUID = -8167538283213097265L;
private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
new HashSet<>(Arrays.asList(new MediaType[]{
- MediaType.image(OCR+"png"),
- MediaType.image(OCR+"jpeg"),
- MediaType.image(OCR+"tiff"),
- MediaType.image(OCR+"bmp"),
- MediaType.image(OCR+"gif"),
+ MediaType.image(OCR + "png"),
+ MediaType.image(OCR + "jpeg"),
+ MediaType.image(OCR + "tiff"),
+ MediaType.image(OCR + "bmp"),
+ MediaType.image(OCR + "gif"),
//these are not currently covered by other parsers
MediaType.image("jp2"),
MediaType.image("jpx"),
MediaType.image("x-portable-pixmap"),
//add the ocr- versions as well
- MediaType.image(OCR+"jp2"),
- MediaType.image(OCR+"jpx"),
- MediaType.image(OCR+"x-portable-pixmap"),
+ MediaType.image(OCR + "jp2"),
+ MediaType.image(OCR + "jpx"),
+ MediaType.image(OCR + "x-portable-pixmap"),
})));
+
+ private String tesseractPath = "";
+ private String tessdataPath = "";
+ private String imageMagickPath = "";
+ //if a user specifies a custom tess path or tessdata path
+ //load the available languages at initialization time
+ private final Set<String> langs = new HashSet<>();
+
private final TesseractOCRConfig defaultConfig = new TesseractOCRConfig();
- private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<>();
- static final ImagePreprocessor IMAGE_PREPROCESSOR = new ImagePreprocessor();
+ private boolean hasTesseract;
+ private boolean hasImageMagick;
+ private ImagePreprocessor imagePreprocessor;
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
// If Tesseract is installed, offer our supported image types
- TesseractOCRConfig config = context.get(TesseractOCRConfig.class, defaultConfig);
- if (hasTesseract(config)) {
- return SUPPORTED_TYPES;
+ TesseractOCRConfig config = context.get(TesseractOCRConfig.class);
+ if (hasTesseract) {
+ if (config == null || !config.isSkipOcr()) {
+ return SUPPORTED_TYPES;
+ }
}
// Otherwise don't advertise anything, so the other image parsers
// can be selected instead
return Collections.emptySet();
}
- private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
+ private void setEnv(ProcessBuilder pb) {
String tessdataPrefix = "TESSDATA_PREFIX";
Map<String, String> env = pb.environment();
- if (!config.getTessdataPath().isEmpty()) {
- env.put(tessdataPrefix, config.getTessdataPath());
- }
- else if(!config.getTesseractPath().isEmpty()) {
- env.put(tessdataPrefix, config.getTesseractPath());
+ if (!StringUtils.isBlank(getTessdataPath())) {
+ env.put(tessdataPrefix, getTessdataPath());
+ } else if (!StringUtils.isBlank(getTesseractPath())) {
+ env.put(tessdataPrefix, getTesseractPath());
}
}
+ /**
+ * Checks whether the Tesseract program can be run from the configured location.
+ *
+ * @return the result of {@link ExternalParser#check} for the tesseract command
+ * @throws TikaConfigException if a non-blank tesseractPath is not an existing directory
+ */
- public boolean hasTesseract(TesseractOCRConfig config) {
+ public boolean hasTesseract() throws TikaConfigException {
// Fetch where the config says to find Tesseract
- String tesseract = config.getTesseractPath() + getTesseractProg();
-
- // Have we already checked for a copy of Tesseract there?
- if (TESSERACT_PRESENT.containsKey(tesseract)) {
- return TESSERACT_PRESENT.get(tesseract);
- }
- //prevent memory bloat
- if (TESSERACT_PRESENT.size() > 100) {
- TESSERACT_PRESENT.clear();
- }
- //check that the parent directory exists
- if (! config.getTesseractPath().isEmpty() &&
- ! Files.isDirectory(Paths.get(config.getTesseractPath()))) {
- TESSERACT_PRESENT.put(tesseract, false);
- LOG.warn("You haven't specified an existing directory in " +
- "which the tesseract binary should be found: " +
- "(path:" + config.getTesseractPath()+")");
- return false;
+ String tesseract = getTesseractPath() + getTesseractProg();
+
+ if (!StringUtils.isBlank(tesseractPath) &&
+ !Files.isDirectory(Paths.get(tesseractPath))) {
+ throw new TikaConfigException("tesseractPath (" + tesseractPath + ") " +
+ "doesn't point to an existing directory");
}
// Try running Tesseract from there, and see if it exists + works
- String[] checkCmd = { tesseract };
+ String[] checkCmd = {tesseract};
boolean hasTesseract = ExternalParser.check(checkCmd);
- LOG.debug("hasTesseract (path: "+checkCmd+"): "+hasTesseract);
- TESSERACT_PRESENT.put(tesseract, hasTesseract);
+ // log the command string itself; concatenating the String[] would only print its identity hash
+ LOG.debug("hasTesseract (path: " + tesseract + "): " + hasTesseract);
return hasTesseract;
-
+ }
+
+ /**
+ * Checks whether the ImageMagick program can be run from the configured location.
+ *
+ * @return the result of {@link ExternalParser#check} for the ImageMagick command
+ * @throws TikaConfigException if a non-blank imageMagickPath is not an existing directory
+ */
+ boolean hasImageMagick() throws TikaConfigException {
+ // a configured path has to be an existing directory
+ boolean pathConfigured = !StringUtils.isBlank(imageMagickPath);
+ if (pathConfigured && !Files.isDirectory(Paths.get(imageMagickPath))) {
+ throw new TikaConfigException("imageMagickPath (" + imageMagickPath + ") " +
+ "doesn't point to an existing directory");
+ }
+
+ // probe the program: directory prefix plus the platform-specific program name
+ String commandLine = imageMagickPath + getImageMagickProg();
+ if (ExternalParser.check(new String[]{commandLine})) {
+ return true;
+ }
+ LOG.debug("ImageMagick does not appear to be installed " +
+ "(commandline: " + commandLine + ")");
+ return false;
+ }
+
+ /**
+ * @return "magick" when the os.name system property starts with "Windows", otherwise "convert"
+ */
+ public static String getImageMagickProg() {
+ boolean onWindows = System.getProperty("os.name").startsWith("Windows");
+ if (onWindows) {
+ return "magick";
+ }
+ return "convert";
}
public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException,
@@ -202,13 +220,18 @@ public class TesseractOCRParser extends AbstractParser {
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext parseContext)
throws IOException, SAXException, TikaException {
- TesseractOCRConfig config = parseContext.get(TesseractOCRConfig.class, defaultConfig);
+ TesseractOCRConfig userConfig = parseContext.get(TesseractOCRConfig.class);
+ TesseractOCRConfig config = defaultConfig;
+ if (userConfig != null) {
+ config = defaultConfig.cloneAndUpdate(userConfig);
+ }
// If Tesseract is not on the path with the current config, do not try to run OCR
// getSupportedTypes shouldn't have listed us as handling it, so this should only
// occur if someone directly calls this parser, not via DefaultParser or similar
- if (! hasTesseract(config))
+ if (!hasTesseract || (config != null && config.isSkipOcr())) {
return;
+ }
TemporaryResources tmp = new TemporaryResources();
try {
@@ -228,7 +251,7 @@ public class TesseractOCRParser extends AbstractParser {
tmp.dispose();
}
}
-
+
private void parse(TikaInputStream tikaInputStream, File tmpOCROutputFile,
ContentHandler xhtml, Metadata metadata, ParseContext parseContext,
TesseractOCRConfig config)
@@ -241,9 +264,9 @@ public class TesseractOCRParser extends AbstractParser {
if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) {
- // Process image
- if (config.isEnableImageProcessing() || config.isApplyRotation()) {
- if (! ImagePreprocessor.hasImageMagick(config)) {
+ // Process image
+ if (config.isEnableImageProcessing() || config.isApplyRotation()) {
+ if (!hasImageMagick) {
LOG.warn("User has selected to preprocess images, but I can't find ImageMagick." +
"Backing off to original file.");
doOCR(input.toFile(), tmpOCROutputFile, config);
@@ -254,11 +277,11 @@ public class TesseractOCRParser extends AbstractParser {
try (TemporaryResources tmp = new TemporaryResources()) {
Path tmpFile = tmp.createTempFile();
Files.copy(input, tmpFile, StandardCopyOption.REPLACE_EXISTING);
- IMAGE_PREPROCESSOR.process(tmpFile, tmpFile, metadata, config);
+ imagePreprocessor.process(tmpFile, tmpFile, metadata, config);
doOCR(tmpFile.toFile(), tmpOCROutputFile, config);
}
}
- } else {
+ } else {
doOCR(input.toFile(), tmpOCROutputFile, config);
}
@@ -292,20 +315,20 @@ public class TesseractOCRParser extends AbstractParser {
/**
* Run external tesseract-ocr process.
*
- * @param input
- * File to be ocred
- * @param output
- * File to collect ocr result
- * @param config
- * Configuration of tesseract-ocr engine
- * @throws TikaException
- * if the extraction timed out
- * @throws IOException
- * if an input error occurred
+ * @param input File to be ocred
+ * @param output File to collect ocr result
+ * @param config Configuration of tesseract-ocr engine
+ * @throws TikaException if the extraction timed out
+ * @throws IOException if an input error occurred
*/
private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
+ if (langs.size() > 0 && ! langs.contains(config.getLanguage())) {
+ throw new IllegalArgumentException("Couldn't find language "+
+ config.getLanguage() +" upon initialization. I did find: "
+ + langs);
+ }
ArrayList<String> cmd = new ArrayList<>(Arrays.asList(
- config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
+ getTesseractPath().toString() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
config.getLanguage(), "--psm", config.getPageSegMode()
));
for (Map.Entry<String, String> entry : config.getOtherTesseractConfig().entrySet()) {
@@ -315,18 +338,18 @@ public class TesseractOCRParser extends AbstractParser {
cmd.addAll(Arrays.asList(
"-c", "page_separator=" + config.getPageSeparator(),
"-c",
- (config.isPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0",
+ (config.isPreserveInterwordSpacing()) ? "preserve_interword_spaces=1" : "preserve_interword_spaces=0",
config.getOutputType().name().toLowerCase(Locale.US)
));
LOG.debug("Tesseract command: " + String.join(" ", cmd));
-
+
ProcessBuilder pb = new ProcessBuilder(cmd);
- setEnv(config, pb);
+ setEnv(pb);
Process process = null;
try {
process = pb.start();
- runOCRProcess(process, config.getTimeout());
+ runOCRProcess(process, config.getTimeoutSeconds());
} finally {
if (process != null) {
process.destroyForcibly();
@@ -346,7 +369,7 @@ public class TesseractOCRParser extends AbstractParser {
int exitValue = Integer.MIN_VALUE;
try {
boolean finished = process.waitFor(timeout, TimeUnit.SECONDS);
- if (! finished) {
+ if (!finished) {
throw new TikaException("TesseractOCRParser timeout");
}
exitValue = process.exitValue();
@@ -359,7 +382,7 @@ public class TesseractOCRParser extends AbstractParser {
}
if (exitValue > 0) {
throw new TikaException("TesseractOCRParser bad exit value " +
- exitValue + " err msg: "+errBuilder.toString());
+ exitValue + " err msg: " + errBuilder.toString());
}
}
@@ -368,14 +391,10 @@ public class TesseractOCRParser extends AbstractParser {
* Reads the contents of the given stream and write it to the given XHTML
* content handler. The stream is closed once fully processed.
*
- * @param stream
- * Stream where is the result of ocr
- * @param xhtml
- * XHTML content handler
- * @throws SAXException
- * if the XHTML SAX events could not be handled
- * @throws IOException
- * if an input error occurred
+ * @param stream Stream where is the result of ocr
+ * @param xhtml XHTML content handler
+ * @throws SAXException if the XHTML SAX events could not be handled
+ * @throws IOException if an input error occurred
*/
private void extractOutput(InputStream stream, ContentHandler xhtml) throws SAXException, IOException {
// <div class="ocr"
@@ -437,7 +456,20 @@ public class TesseractOCRParser extends AbstractParser {
return System.getProperty("os.name").startsWith("Windows") ? "tesseract.exe" : "tesseract";
}
+ /**
+ * Does nothing; the supplied params are not read here.
+ */
+ @Override
+ public void initialize(Map<String, Param> params) throws TikaConfigException {
+ // intentionally empty
+ }
+ /**
+ * Probes for the external programs, loads the available languages and creates the
+ * image preprocessor.
+ *
+ * @param problemHandler not used by this implementation
+ * @throws TikaConfigException propagated from hasTesseract(), hasImageMagick() or loadLangs()
+ */
+ @Override
+ public void checkInitialization(InitializableProblemHandler problemHandler)
+ throws TikaConfigException {
+ // loadLangs() reads the hasTesseract field, so the tesseract probe has to run first
+ hasTesseract = hasTesseract();
+ hasImageMagick = hasImageMagick();
+ loadLangs();
+ String imageMagickCommand = getImageMagickPath() + getImageMagickProg();
+ imagePreprocessor = new ImagePreprocessor(imageMagickCommand);
+ }
private static class HOCRPassThroughHandler extends DefaultHandler {
private final ContentHandler xhtml;
@@ -506,14 +538,77 @@ public class TesseractOCRParser extends AbstractParser {
HAS_WARNED = true;
}
+ /**
+ * Set the path to the Tesseract executable's directory, needed if it is not on system path.
+ * <p>
+ * Note that if you set this value, it is highly recommended that you also
+ * set the path to the 'tessdata' folder using {@link #setTessdataPath}.
+ * </p>
+ * <p>
+ * The path is normalized and, unless empty, stored with a trailing file separator.
+ * </p>
+ *
+ * @param tesseractPath directory containing the Tesseract executable
+ * @throws IllegalArgumentException if the path cannot be normalized
+ */
@Field
public void setTesseractPath(String tesseractPath) {
- defaultConfig.setTesseractPath(tesseractPath);
+ String normalized = FilenameUtils.normalize(tesseractPath);
+ if (normalized == null) {
+ // normalize() reports an invalid path (or a null argument) by returning null
+ throw new IllegalArgumentException("tesseractPath is not a valid path: " + tesseractPath);
+ }
+ if (!normalized.isEmpty() && !normalized.endsWith(File.separator)) {
+ normalized += File.separator;
+ }
+ this.tesseractPath = normalized;
+ }
+
+ public String getTesseractPath() {
+ return tesseractPath; // when set via setTesseractPath: normalized, ending in File.separator unless empty
}
+ /**
+ * Set the path to the 'tessdata' folder, which contains language files and config files. In some cases (such
+ * as on Windows), this folder is found in the Tesseract installation, but in other cases
+ * (such as when Tesseract is built from source), it may be located elsewhere.
+ * <p>
+ * The path is normalized and, unless empty, stored with a trailing file separator.
+ * </p>
+ *
+ * @param tessdataPath path to the 'tessdata' folder
+ * @throws IllegalArgumentException if the path cannot be normalized
+ */
@Field
public void setTessdataPath(String tessdataPath) {
- defaultConfig.setTessdataPath(tessdataPath);
+ String normalized = FilenameUtils.normalize(tessdataPath);
+ if (normalized == null) {
+ // normalize() reports an invalid path (or a null argument) by returning null
+ throw new IllegalArgumentException("tessdataPath is not a valid path: " + tessdataPath);
+ }
+ if (!normalized.isEmpty() && !normalized.endsWith(File.separator)) {
+ normalized += File.separator;
+ }
+
+ this.tessdataPath = normalized;
+ }
+
+ public String getTessdataPath() {
+ return this.tessdataPath; // when set via setTessdataPath: normalized, ending in File.separator unless empty
+ }
+
+ /**
+ * Set the path to the ImageMagick executable directory, needed if it is not on system path.
+ * <p>
+ * The path is normalized and, unless empty, stored with a trailing file separator.
+ * </p>
+ *
+ * @param imageMagickPath to ImageMagick executable directory.
+ * @throws IllegalArgumentException if the path cannot be normalized
+ */
+ @Field
+ public void setImageMagickPath(String imageMagickPath) {
+ String normalized = FilenameUtils.normalize(imageMagickPath);
+ if (normalized == null) {
+ // normalize() reports an invalid path (or a null argument) by returning null
+ throw new IllegalArgumentException("imageMagickPath is not a valid path: " + imageMagickPath);
+ }
+ if (!normalized.isEmpty() && !normalized.endsWith(File.separator)) {
+ normalized += File.separator;
+ }
+ this.imageMagickPath = normalized;
+ }
+
+ public String getImageMagickPath() {
+ return imageMagickPath; // when set via setImageMagickPath: normalized, ending in File.separator unless empty
+ }
+
+ /**
+ * Adds extra Tesseract settings to the default config.
+ *
+ * @param settings entries of the form "key value", separated by whitespace
+ * @throws TikaConfigException if an entry does not split into exactly two parts
+ */
+ @Field
+ public void setOtherTesseractSettings(List<String> settings) throws TikaConfigException {
+ for (String setting : settings) {
+ String[] keyAndValue = setting.trim().split("\\s+");
+ if (keyAndValue.length != 2) {
+ throw new TikaConfigException("Expected space delimited key value pair."+
+ " However, I found "+keyAndValue.length+" bits.");
+ }
+ String key = keyAndValue[0];
+ String value = keyAndValue[1];
+ defaultConfig.addOtherTesseractConfig(key, value);
+ }
+ }
+
+ @Field
+ public void setSkipOCR(boolean skipOCR) {
+ defaultConfig.setSkipOcr(skipOCR); // stored on defaultConfig, the base that parse() merges a per-parse config into
}
@Field
@@ -538,7 +633,7 @@ public class TesseractOCRParser extends AbstractParser {
@Field
public void setTimeout(int timeout) {
- defaultConfig.setTimeout(timeout);
+ defaultConfig.setTimeoutSeconds(timeout); // seconds: doOCR hands getTimeoutSeconds() to a wait measured in TimeUnit.SECONDS
}
@Field
@@ -557,11 +652,6 @@ public class TesseractOCRParser extends AbstractParser {
}
@Field
- public void setImageMagickPath(String imageMagickPath) {
- defaultConfig.setImageMagickPath(imageMagickPath);
- }
-
- @Field
public void setDensity(int density) {
defaultConfig.setDensity(density);
}
@@ -594,5 +684,37 @@ public class TesseractOCRParser extends AbstractParser {
public TesseractOCRConfig getDefaultConfig() {
return defaultConfig;
}
+
+ /**
+ * Collects the languages available to Tesseract by listing the "*.traineddata" files
+ * in the tessdata directory and adding their base names to {@code langs}.
+ * <p>
+ * Does nothing if Tesseract was not found, or if neither tessdataPath nor tesseractPath
+ * is set. With only tesseractPath set, its "tessdata" subdirectory is used.
+ * </p>
+ *
+ * @throws TikaConfigException if the tessdata location is not a directory, cannot be
+ * listed, or contains no .traineddata files
+ */
+ private void loadLangs() throws TikaConfigException {
+
+ if (!hasTesseract) {
+ return;
+ }
+
+ // isBlank is null-safe and matches the checks in setEnv() and hasTesseract()
+ Path actualTessdataPath;
+ if (!StringUtils.isBlank(tessdataPath)) {
+ actualTessdataPath = Paths.get(tessdataPath);
+ } else if (!StringUtils.isBlank(tesseractPath)) {
+ actualTessdataPath = Paths.get(tesseractPath, "tessdata");
+ } else {
+ return;
+ }
+ if (!Files.isDirectory(actualTessdataPath)) {
+ throw new TikaConfigException(actualTessdataPath + " is not a directory");
+ }
+ // listFiles() returns null on an I/O error, even for an existing directory
+ File[] candidates = actualTessdataPath.toFile().listFiles();
+ if (candidates == null) {
+ throw new TikaConfigException("Could not list the contents of: " +
+ actualTessdataPath.toAbsolutePath());
+ }
+ String suffix = ".traineddata";
+ for (File f : candidates) {
+ String name = f.getName();
+ if (f.isFile() && name.endsWith(suffix)) {
+ // strip only the trailing suffix, not every occurrence inside the name
+ langs.add(name.substring(0, name.length() - suffix.length()));
+ }
+ }
+ if (langs.size() == 0) {
+ throw new TikaConfigException("Could not identify any languages (files ending in .traineddata) "+
+ " in: "+actualTessdataPath.toAbsolutePath());
+ } else if (LOG.isDebugEnabled()) {
+ LOG.debug("found langs: "+langs);
+ }
+ }
+
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
deleted file mode 100644
index 7eb4792..0000000
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
+++ /dev/null
@@ -1,38 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Tesseract properties
-tesseractPath=
-language=eng
-pageSegMode=1
-maxFileSizeToOcr=2147483647
-minFileSizeToOcr=0
-timeout=120
-#txt or hocr
-outputType=txt
-preserveInterwordSpacing=false
-
-# If true, correct image rotation
-applyRotation=false
-
-# properties for image pre-processing
-# to enable pre-processing, set enableImageProcessing to true. Requires ImageMagick
-enableImageProcessing=false
-ImageMagickPath=
-density=300
-depth=4
-colorspace=gray
-filter=triangle
-resize=200
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
index 59009aa..02fc149 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
@@ -16,11 +16,12 @@
*/
package org.apache.tika.parser.ocr;
-import org.apache.commons.lang3.SystemUtils;
import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.CompositeDetector;
+import org.apache.tika.parser.CompositeParser;
import org.junit.Test;
-import java.io.File;
import java.io.InputStream;
import java.util.Arrays;
import java.util.List;
@@ -34,14 +35,11 @@ public class TesseractOCRConfigTest extends TikaTest {
@Test
public void testNoConfig() throws Exception {
TesseractOCRConfig config = new TesseractOCRConfig();
- assertEquals("Invalid default tesseractPath value", "", config.getTesseractPath());
- assertEquals("Invalid default tessdataPath value", "", config.getTessdataPath());
assertEquals("Invalid default language value", "eng", config.getLanguage());
assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode());
assertEquals("Invalid default minFileSizeToOcr value", 0, config.getMinFileSizeToOcr());
assertEquals("Invalid default maxFileSizeToOcr value", Integer.MAX_VALUE, config.getMaxFileSizeToOcr());
- assertEquals("Invalid default timeout value", 120, config.getTimeout());
- assertEquals("Invalid default ImageMagickPath value", "", config.getImageMagickPath());
+ assertEquals("Invalid default timeout value", 120, config.getTimeoutSeconds());
assertEquals("Invalid default density value", 300 , config.getDensity());
assertEquals("Invalid default depth value", 4 , config.getDepth());
assertEquals("Invalid default colorpsace value", "gray" , config.getColorspace());
@@ -53,17 +51,16 @@ public class TesseractOCRConfigTest extends TikaTest {
@Test
public void testPartialConfig() throws Exception {
- InputStream stream = getResourceAsStream("/test-properties/TesseractOCRConfig-partial.properties");
+ InputStream stream = getResourceAsStream("/test-configs/tika-config-tesseract-partial.xml");
- TesseractOCRConfig config = new TesseractOCRConfig(stream);
- assertEquals("Invalid default tesseractPath value", "", config.getTesseractPath());
- assertEquals("Invalid default tessdataPath value", "", config.getTessdataPath());
+ TesseractOCRParser parser = (TesseractOCRParser)
+ ((CompositeParser)new TikaConfig(stream).getParser()).getAllComponentParsers().get(0);
+ TesseractOCRConfig config = parser.getDefaultConfig();
assertEquals("Invalid overridden language value", "fra+deu", config.getLanguage());
assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode());
assertEquals("Invalid overridden minFileSizeToOcr value", 1, config.getMinFileSizeToOcr());
assertEquals("Invalid default maxFileSizeToOcr value", Integer.MAX_VALUE, config.getMaxFileSizeToOcr());
- assertEquals("Invalid overridden timeout value", 240, config.getTimeout());
- assertEquals("Invalid default ImageMagickPath value", "", config.getImageMagickPath());
+ assertEquals("Invalid overridden timeout value", 240, config.getTimeoutSeconds());
assertEquals("Invalid overridden density value", 200 , config.getDensity());
assertEquals("Invalid overridden depth value", 8 , config.getDepth());
assertEquals("Invalid overridden filter value", "box" , config.getFilter());
@@ -74,19 +71,16 @@ public class TesseractOCRConfigTest extends TikaTest {
@Test
public void testFullConfig() throws Exception {
- InputStream stream = getResourceAsStream("/test-properties/TesseractOCRConfig-full.properties");
+ InputStream stream = getResourceAsStream("/test-configs/tika-config-tesseract-full.xml");
- TesseractOCRConfig config = new TesseractOCRConfig(stream);
- if(SystemUtils.IS_OS_UNIX) {
- //assertEquals("Invalid overridden tesseractPath value", "/opt/tesseract" + File.separator, config.getTesseractPath());
- //assertEquals("Invalid overridden tesseractPath value", "/usr/local/share" + File.separator, config.getTessdataPath());
- assertEquals("Invalid overridden ImageMagickPath value", "/usr/local/bin/", config.getImageMagickPath());
- }
- assertEquals("Invalid overridden language value", "eng", config.getLanguage());
+ TesseractOCRParser parser = (TesseractOCRParser)
+ ((CompositeParser)new TikaConfig(stream).getParser()).getAllComponentParsers().get(0);
+ TesseractOCRConfig config = parser.getDefaultConfig();
+ assertEquals("Invalid overridden language value", "ceb", config.getLanguage());
assertEquals("Invalid overridden pageSegMode value", "2", config.getPageSegMode());
assertEquals("Invalid overridden minFileSizeToOcr value", 1, config.getMinFileSizeToOcr());
assertEquals("Invalid overridden maxFileSizeToOcr value", 2000000, config.getMaxFileSizeToOcr());
- assertEquals("Invalid overridden timeout value", 240, config.getTimeout());
+ assertEquals("Invalid overridden timeout value", 240, config.getTimeoutSeconds());
assertEquals("Invalid overridden density value", 200 , config.getDensity());
assertEquals("Invalid overridden depth value", 8 , config.getDepth());
assertEquals("Invalid overridden filter value", "box" , config.getFilter());
@@ -172,14 +166,14 @@ public class TesseractOCRConfigTest extends TikaTest {
@Test(expected=IllegalArgumentException.class)
public void testDataPathCheck() {
- TesseractOCRConfig config = new TesseractOCRConfig();
- config.setTessdataPath("blah\u0000deblah");
+ TesseractOCRParser parser = new TesseractOCRParser();
+ parser.setTessdataPath("blah\u0000deblah"); // path with an embedded NUL character; the annotation expects IllegalArgumentException
}
@Test(expected=IllegalArgumentException.class)
public void testPathCheck() {
- TesseractOCRConfig config = new TesseractOCRConfig();
- config.setTesseractPath("blah\u0000deblah");
+ TesseractOCRParser parser = new TesseractOCRParser();
+ parser.setTesseractPath("blah\u0000deblah"); // path with an embedded NUL character; the annotation expects IllegalArgumentException
}
@Test(expected=IllegalArgumentException.class)
@@ -213,38 +207,10 @@ public class TesseractOCRConfigTest extends TikaTest {
config.addOtherTesseractConfig("good", "good");
}
- @Test
- public void testBogusPathCheck() {
- //allow path that doesn't actually exist
- TesseractOCRConfig config = new TesseractOCRConfig();
- config.setTesseractPath("blahdeblahblah");
- assertEquals("blahdeblahblah"+File.separator, config.getTesseractPath());
- }
-
- @Test
- public void testTrailingSlashInPathBehavior() {
-
- TesseractOCRConfig config = new TesseractOCRConfig();
- config.setTesseractPath("blah");
- assertEquals("blah"+File.separator, config.getTesseractPath());
- config.setTesseractPath("blah"+File.separator);
- assertEquals("blah"+File.separator, config.getTesseractPath());
- config.setTesseractPath("");
- assertEquals("", config.getTesseractPath());
-
- config.setTessdataPath("blahdata");
- assertEquals("blahdata"+File.separator, config.getTessdataPath());
- config.setTessdataPath("blahdata"+File.separator);
- assertEquals("blahdata"+File.separator, config.getTessdataPath());
- config.setTessdataPath("");
- assertEquals("", config.getTessdataPath());
-
- config.setImageMagickPath("imagemagickpath");
- assertEquals("imagemagickpath"+File.separator, config.getImageMagickPath());
- config.setImageMagickPath("imagemagickpath"+File.separator);
- assertEquals("imagemagickpath"+File.separator, config.getImageMagickPath());
- config.setImageMagickPath("");
- assertEquals("", config.getImageMagickPath());
+ @Test (expected = IllegalArgumentException.class)
+ public void testBadLanguageCode() throws Exception {
+ TesseractOCRConfig tesseractOCRConfig = new TesseractOCRConfig();
+ tesseractOCRConfig.setLanguage("kerplekistani"); // the setter itself is expected to throw; no parse is run
}
@Test(expected=IllegalArgumentException.class)
@@ -252,4 +218,29 @@ public class TesseractOCRConfigTest extends TikaTest {
TesseractOCRConfig config = new TesseractOCRConfig();
config.setColorspace("someth!ng");
}
+
+ @Test
+ public void testUpdatingConfigs() throws Exception {
+ // base config: the values that should be replaced
+ TesseractOCRConfig base = new TesseractOCRConfig();
+ base.setLanguage("eng");
+ base.setMinFileSizeToOcr(100);
+ base.setOutputType(TesseractOCRConfig.OUTPUT_TYPE.TXT);
+ base.addOtherTesseractConfig("k1", "a1");
+ base.addOtherTesseractConfig("k2", "a2");
+
+ // overriding config: a different value for every setting above
+ TesseractOCRConfig override = new TesseractOCRConfig();
+ override.setLanguage("fra");
+ override.setMinFileSizeToOcr(1000);
+ override.setOutputType(TesseractOCRConfig.OUTPUT_TYPE.HOCR);
+ override.addOtherTesseractConfig("k1", "b1");
+ override.addOtherTesseractConfig("k2", "b2");
+
+ // the merged copy must carry the overriding values
+ TesseractOCRConfig merged = base.cloneAndUpdate(override);
+ assertEquals("fra", merged.getLanguage());
+ assertEquals(1000, merged.getMinFileSizeToOcr());
+ assertEquals(TesseractOCRConfig.OUTPUT_TYPE.HOCR, merged.getOutputType());
+ assertEquals("b1", merged.getOtherTesseractConfig().get("k1"));
+ assertEquals("b2", merged.getOtherTesseractConfig().get("k2"));
+ }
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 0124109..4c6ca30 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -18,13 +18,12 @@ package org.apache.tika.parser.ocr;
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
-import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.parser.image.BPGParser;
import org.apache.tika.parser.image.HeifParser;
import org.apache.tika.parser.image.ICNSParser;
@@ -36,6 +35,7 @@ import org.apache.tika.parser.image.WebPParser;
import org.junit.Assert;
import org.junit.Test;
+import java.io.File;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Set;
@@ -49,26 +49,11 @@ import static org.junit.Assume.assumeTrue;
public class TesseractOCRParserTest extends TikaTest {
- public static boolean canRun() {
- TesseractOCRConfig config = new TesseractOCRConfig();
- TesseractOCRParserTest tesseractOCRTest = new TesseractOCRParserTest();
- return tesseractOCRTest.canRun(config);
- }
-
- private boolean canRun(TesseractOCRConfig config) {
- String[] checkCmd = {config.getTesseractPath() + TesseractOCRParser.getTesseractProg()};
- // If Tesseract is not on the path, do not run the test.
- return ExternalParser.check(checkCmd);
+ public static boolean canRun() throws TikaConfigException {
+ TesseractOCRParser p = new TesseractOCRParser();
+ return p.hasTesseract(); // probes with a freshly constructed parser; this test sets no paths on it
}
- @Test
- public void testImageMagick() throws Exception {
- //TODO -- figure out what the original intention was for this test or remove it.
- TesseractOCRConfig config = new TesseractOCRConfig();
- assumeTrue(TesseractOCRParser.IMAGE_PREPROCESSOR.hasImageMagick(config));
- String[] CheckCmd = {config.getImageMagickPath() + TesseractOCRParser.IMAGE_PREPROCESSOR.getImageMagickProg()};
- assertTrue(ExternalParser.check(CheckCmd));
- }
@Test
public void testInterwordSpacing() throws Exception {
@@ -93,22 +78,7 @@ public class TesseractOCRParserTest extends TikaTest {
assertTrue(m.find());
}
- @Test (expected = TikaException.class)
- public void testBadLanguageCode() throws Exception {
- assumeTrue("can run OCR", canRun());
- TesseractOCRConfig tesseractOCRConfigconfig = new TesseractOCRConfig();
- tesseractOCRConfigconfig.setLanguage("zzz");
- ParseContext parseContext = new ParseContext();
- parseContext.set(TesseractOCRConfig.class, tesseractOCRConfigconfig);
-
- //with preserve interwordspacing "on"
- //allow some flexibility in case Tesseract is computing spaces
- //somewhat differently in different versions/OS's, etc.
- String xml = getXML("testOCR_spacing.png",
- getMetadata(MediaType.image("png")),
- parseContext).xml;
- }
private Metadata getMetadata(MediaType mediaType) {
Metadata metadata = new Metadata();
@@ -138,14 +108,27 @@ public class TesseractOCRParserTest extends TikaTest {
}
@Test
+ public void confirmRuntimeSkipOCR() throws Exception {
+ assumeTrue("can run OCR", canRun());
+ // request that OCR be skipped through the per-parse config
+ TesseractOCRConfig skipConfig = new TesseractOCRConfig();
+ skipConfig.setSkipOcr(true);
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(TesseractOCRConfig.class, skipConfig);
+ Metadata tiffMetadata = getMetadata(MediaType.image("tiff"));
+ String xml = getXML("testTIFF_multipage.tif", tiffMetadata, parseContext).xml;
+ // the text "Page 2" must not appear in the output
+ assertNotContained("Page 2", xml);
+ }
+
+ @Test
public void testPositiveRotateOCR() throws Exception {
+ TesseractOCRParser p = new TesseractOCRParser();
+ assumeTrue(canRun());
+ assumeTrue(p.hasImageMagick());
TesseractOCRConfig config = new TesseractOCRConfig();
- assumeTrue(TesseractOCRParser.IMAGE_PREPROCESSOR.hasImageMagick(config));
config.setApplyRotation(true);
config.setResize(100);
ParseContext parseContext = new ParseContext();
parseContext.set(TesseractOCRConfig.class, config);
- assumeTrue(canRun(config));
Metadata metadata = getMetadata(MediaType.image("png"));
String ocr = getText("testRotated+10.png", metadata, parseContext);
assertEquals("true", metadata.get(TesseractOCRParser.IMAGE_MAGICK));
@@ -156,13 +139,14 @@ public class TesseractOCRParserTest extends TikaTest {
@Test
public void testNegativeRotateOCR() throws Exception {
+ TesseractOCRParser p = new TesseractOCRParser();
+ assumeTrue(p.hasImageMagick());
TesseractOCRConfig config = new TesseractOCRConfig();
- assumeTrue(TesseractOCRParser.IMAGE_PREPROCESSOR.hasImageMagick(config));
config.setApplyRotation(true);
config.setResize(100);
ParseContext parseContext = new ParseContext();
parseContext.set(TesseractOCRConfig.class, config);
- assumeTrue(canRun(config));
+ assumeTrue(canRun());
Metadata metadata = getMetadata(MediaType.image("png"));
String ocr = getText("testRotated-10.png", metadata, parseContext);
assertEquals("true", metadata.get(TesseractOCRParser.IMAGE_MAGICK));
@@ -173,14 +157,14 @@ public class TesseractOCRParserTest extends TikaTest {
@Test
public void testConfig() throws Exception {
- try (InputStream is = getResourceAsStream("/org/apache/tika/config/TIKA-2705-tesseract.xml")) {
+ try (InputStream is = getResourceAsStream("/test-configs/TIKA-2705-tesseract.xml")) {
TikaConfig config = new TikaConfig(is);
Parser p = config.getParser();
Parser tesseractOCRParser = findParser(p, org.apache.tika.parser.ocr.TesseractOCRParser.class);
assertNotNull(tesseractOCRParser);
TesseractOCRConfig tesseractOCRConfig = ((TesseractOCRParser)tesseractOCRParser).getDefaultConfig();
- Assert.assertEquals(241, tesseractOCRConfig.getTimeout());
+ Assert.assertEquals(241, tesseractOCRConfig.getTimeoutSeconds());
Assert.assertEquals(TesseractOCRConfig.OUTPUT_TYPE.HOCR, tesseractOCRConfig.getOutputType());
Assert.assertEquals("ceb", tesseractOCRConfig.getLanguage());
Assert.assertEquals(false, tesseractOCRConfig.isApplyRotation());
@@ -188,6 +172,23 @@ public class TesseractOCRParserTest extends TikaTest {
}
}
+ @Test
+ public void testArbitraryParams() throws Exception {
+ try (InputStream is = getResourceAsStream("/test-configs/tika-config-tesseract-arbitrary.xml")) {
+ Parser root = new TikaConfig(is).getParser();
+ Parser found = findParser(root, TesseractOCRParser.class);
+ assertNotNull(found);
+ // the otherTesseractSettings entries should be readable from the parser's default config
+ TesseractOCRConfig ocrConfig = ((TesseractOCRParser) found).getDefaultConfig();
+ Assert.assertEquals("0.75",
+ ocrConfig.getOtherTesseractConfig().get("textord_initialx_ile"));
+ Assert.assertEquals("0.15625",
+ ocrConfig.getOtherTesseractConfig().get("textord_noise_hfract"));
+ }
+ }
+
+
//to be used to figure out a) what image media types don't have ocr coverage and
// b) what ocr media types don't have dedicated image parsers
//this obv requires that tesseract be installed
@@ -236,4 +237,37 @@ public class TesseractOCRParserTest extends TikaTest {
}
}
+ @Test
+ public void testTrailingSlashInPathBehavior() {
+ // each setter should add exactly one trailing separator and leave "" untouched
+ String sep = File.separator;
+ TesseractOCRParser parser = new TesseractOCRParser();
+
+ parser.setTesseractPath("blah");
+ assertEquals("blah" + sep, parser.getTesseractPath());
+ parser.setTesseractPath("blah" + sep);
+ assertEquals("blah" + sep, parser.getTesseractPath());
+ parser.setTesseractPath("");
+ assertEquals("", parser.getTesseractPath());
+
+ parser.setTessdataPath("blahdata");
+ assertEquals("blahdata" + sep, parser.getTessdataPath());
+ parser.setTessdataPath("blahdata" + sep);
+ assertEquals("blahdata" + sep, parser.getTessdataPath());
+ parser.setTessdataPath("");
+ assertEquals("", parser.getTessdataPath());
+
+ parser.setImageMagickPath("imagemagickpath");
+ assertEquals("imagemagickpath" + sep, parser.getImageMagickPath());
+ parser.setImageMagickPath("imagemagickpath" + sep);
+ assertEquals("imagemagickpath" + sep, parser.getImageMagickPath());
+ parser.setImageMagickPath("");
+ assertEquals("", parser.getImageMagickPath());
+ }
+
+ @Test
+ public void testBogusPathCheck() {
+ // the setter must accept a directory that does not exist
+ TesseractOCRParser parser = new TesseractOCRParser();
+ String bogusDirectory = "blahdeblahblah";
+ parser.setTesseractPath(bogusDirectory);
+ assertEquals(bogusDirectory + File.separator, parser.getTesseractPath());
+ }
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-properties/StringsConfig-full.properties b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/StringsConfig-full.properties
similarity index 100%
rename from tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-properties/StringsConfig-full.properties
rename to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/StringsConfig-full.properties
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-properties/StringsConfig-partial.properties b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/StringsConfig-partial.properties
similarity index 100%
rename from tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-properties/StringsConfig-partial.properties
rename to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/StringsConfig-partial.properties
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/TIKA-2705-tesseract.xml
similarity index 94%
copy from tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml
copy to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/TIKA-2705-tesseract.xml
index b5543e4..c357408 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/TIKA-2705-tesseract.xml
@@ -23,7 +23,6 @@
<parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
<params>
<param name="timeout" type="int">241</param>
-<!-- <param name="tesseractPath" type="string">/myspecial/tess</param> -->
<param name="outputType" type="string">hocr</param>
<param name="applyRotation" type="bool">false</param>
<param name="language" type="string">ceb</param>
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/tika-config-tesseract-arbitrary.xml
similarity index 68%
copy from tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml
copy to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/tika-config-tesseract-arbitrary.xml
index b5543e4..78a935b 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/tika-config-tesseract-arbitrary.xml
@@ -17,16 +17,13 @@
-->
<properties>
<parsers>
- <parser class="org.apache.tika.parser.DefaultParser">
- <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
- </parser>
<parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
<params>
- <param name="timeout" type="int">241</param>
-<!-- <param name="tesseractPath" type="string">/myspecial/tess</param> -->
- <param name="outputType" type="string">hocr</param>
- <param name="applyRotation" type="bool">false</param>
- <param name="language" type="string">ceb</param>
+ <!-- space delimited key-value pairs -->
+ <param name="otherTesseractSettings" type="list">
+ <string>textord_initialx_ile 0.75</string>
+ <string>textord_noise_hfract 0.15625</string>
+ </param>
</params>
</parser>
</parsers>
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/tika-config-tesseract-full.xml
similarity index 63%
copy from tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml
copy to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/tika-config-tesseract-full.xml
index b5543e4..374427b 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/tika-config-tesseract-full.xml
@@ -17,16 +17,24 @@
-->
<properties>
<parsers>
- <parser class="org.apache.tika.parser.DefaultParser">
- <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
- </parser>
<parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
<params>
- <param name="timeout" type="int">241</param>
-<!-- <param name="tesseractPath" type="string">/myspecial/tess</param> -->
+ <param name="timeout" type="int">240</param>
+ <param name="density" type="int">200</param>
+ <param name="depth" type="int">8</param>
+ <param name="pageSegMode" type="string">2</param>
+ <param name="resize" type="int">300</param>
+ <param name="minFileSizeToOcr" type="long">1</param>
+ <param name="maxFileSizeToOcr" type="long">2000000</param>
+ <param name="timeoutSeconds" type="int">240</param>
+
<param name="outputType" type="string">hocr</param>
- <param name="applyRotation" type="bool">false</param>
+ <param name="filter" type="string">box</param>
+ <param name="applyRotation" type="bool">true</param>
+ <param name="enableImageProcessing" type="bool">false</param>
+
<param name="language" type="string">ceb</param>
+
</params>
</parser>
</parsers>
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/tika-config-tesseract-partial.xml
similarity index 70%
rename from tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml
rename to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/tika-config-tesseract-partial.xml
index b5543e4..33ca5c1 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/tika-config-tesseract-partial.xml
@@ -17,16 +17,21 @@
-->
<properties>
<parsers>
- <parser class="org.apache.tika.parser.DefaultParser">
- <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
- </parser>
<parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
<params>
- <param name="timeout" type="int">241</param>
-<!-- <param name="tesseractPath" type="string">/myspecial/tess</param> -->
+ <param name="timeout" type="int">240</param>
+ <param name="density" type="int">200</param>
+ <param name="depth" type="int">8</param>
+ <param name="resize" type="int">300</param>
+
+ <param name="minFileSizeToOcr" type="long">1</param>
<param name="outputType" type="string">hocr</param>
+ <param name="filter" type="string">box</param>
<param name="applyRotation" type="bool">false</param>
- <param name="language" type="string">ceb</param>
+ <param name="enableImageProcessing" type="bool">false</param>
+
+ <param name="language" type="string">fra+deu</param>
+
</params>
</parser>
</parsers>
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-properties/TesseractOCRConfig-full.properties b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-properties/TesseractOCRConfig-full.properties
deleted file mode 100644
index 8161abf..0000000
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-properties/TesseractOCRConfig-full.properties
+++ /dev/null
@@ -1,29 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#tesseractPath=/opt/tesseract
-#tessdataPath=/usr/local/share
-language=eng
-pageSegMode=2
-maxFileSizeToOcr=2000000
-timeout=240
-minFileSizeToOcr=1
-
-ImageMagickPath=/usr/local/bin
-density=200
-depth=8
-filter=box
-resize=300
-applyRotation=true
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-properties/TesseractOCRConfig-partial.properties b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-properties/TesseractOCRConfig-partial.properties
deleted file mode 100644
index 31a800d..0000000
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-properties/TesseractOCRConfig-partial.properties
+++ /dev/null
@@ -1,24 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-language=fra+deu
-timeout=240
-minFileSizeToOcr=1
-
-enableImageProcessing=false
-density=200
-depth=8
-filter=box
-resize=300
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index c53cb86..e719e7a 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -23,6 +23,7 @@ import static org.junit.Assume.assumeTrue;
import java.util.List;
import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
@@ -38,20 +39,16 @@ import org.junit.Test;
public class TesseractOCRParserTest extends TikaTest {
- public static boolean canRun() {
- TesseractOCRConfig config = new TesseractOCRConfig();
- TesseractOCRParserTest tesseractOCRTest = new TesseractOCRParserTest();
- return tesseractOCRTest.canRun(config);
+ public static boolean canRun() throws TikaConfigException {
+ TesseractOCRParser p = new TesseractOCRParser();
+ return p.hasTesseract();
}
- private boolean canRun(TesseractOCRConfig config) {
- String[] checkCmd = {config.getTesseractPath() + TesseractOCRParser.getTesseractProg()};
- // If Tesseract is not on the path, do not run the test.
- return ExternalParser.check(checkCmd);
- }
+
/*
- Check that if Tesseract is not found, the TesseractOCRParser claims to not support
+ Check that if Tesseract is told to skip OCR,
+ the TesseractOCRParser claims to not support
any file types. So, the standard image parser is called instead.
*/
@Test
@@ -61,11 +58,11 @@ public class TesseractOCRParserTest extends TikaTest {
MediaType png = MediaType.image("png");
// With an invalid path, will offer no types
- TesseractOCRConfig invalidConfig = new TesseractOCRConfig();
- invalidConfig.setTesseractPath("/made/up/path");
+ TesseractOCRConfig skipOcrConfig = new TesseractOCRConfig();
+ skipOcrConfig.setSkipOcr(true);
ParseContext parseContext = new ParseContext();
- parseContext.set(TesseractOCRConfig.class, invalidConfig);
+ parseContext.set(TesseractOCRConfig.class, skipOcrConfig);
// No types offered
Assert.assertEquals(0, parser.getSupportedTypes(parseContext).size());
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 56721eb..ee133d0 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -20,6 +20,7 @@ import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.ParserContainerExtractor;
import org.apache.tika.io.TikaInputStream;
@@ -54,11 +55,11 @@ public class PDFParserTest extends TikaTest {
private static Boolean hasTesseract = null;
- public static boolean canRunOCR() {
+ public static boolean canRunOCR() throws TikaConfigException {
if (hasTesseract != null) {
return hasTesseract;
}
- hasTesseract = new TesseractOCRParser().hasTesseract(new TesseractOCRConfig());
+ hasTesseract = new TesseractOCRParser().hasTesseract();
return hasTesseract;
}
@@ -386,6 +387,7 @@ public class PDFParserTest extends TikaTest {
//now override the max file size to ocr, and you should get text
ParseContext pc = new ParseContext();
TesseractOCRConfig tesseractOCRConfig = new TesseractOCRConfig();
+ tesseractOCRConfig.setMaxFileSizeToOcr(10000000);
pc.set(TesseractOCRConfig.class, tesseractOCRConfig);
text = getText(getResourceAsStream("/test-documents/testOCR.pdf"), p, pc);
assertContains("Happy", text);