You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/02/09 21:02:29 UTC
[tika] branch main updated: TIKA-3297 -- remove .properties as an option for parsers...starting with tesseract; pdf in a followup commit.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 3f80efd  TIKA-3297 -- remove .properties as an option for parsers...starting with tesseract; pdf in a followup commit.
     new e8b47f7  Merge remote-tracking branch 'origin/main' into main
3f80efd is described below

commit 3f80efda2890673160492bd4b9d570c88ff89de9
Author: tballison <ta...@apache.org>
AuthorDate: Tue Feb 9 16:02:01 2021 -0500

    TIKA-3297 -- remove .properties as an option for parsers...starting with tesseract; pdf in a followup commit.
---
 .../java/org/apache/tika/config/TikaConfig.java    |   5 +-
 .../apache/tika/parser/ocr/ImagePreprocessor.java  |  50 +--
 .../apache/tika/parser/ocr/TesseractOCRConfig.java | 375 +++++----------------
 .../apache/tika/parser/ocr/TesseractOCRParser.java | 314 +++++++++++------
 .../tika/parser/ocr/TesseractOCRConfig.properties  |  38 ---
 .../tika/parser/ocr/TesseractOCRConfigTest.java    | 105 +++---
 .../tika/parser/ocr/TesseractOCRParserTest.java    | 116 ++++---
 .../StringsConfig-full.properties                  |   0
 .../StringsConfig-partial.properties               |   0
 .../TIKA-2705-tesseract.xml                        |   1 -
 .../tika-config-tesseract-arbitrary.xml}           |  13 +-
 .../tika-config-tesseract-full.xml}                |  20 +-
 .../tika-config-tesseract-partial.xml}             |  17 +-
 .../TesseractOCRConfig-full.properties             |  29 --
 .../TesseractOCRConfig-partial.properties          |  24 --
 .../tika/parser/ocr/TesseractOCRParserTest.java    |  23 +-
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |   6 +-
 17 files changed, 482 insertions(+), 654 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index e29a337..041a6a4 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -654,7 +654,8 @@ public class TikaConfig {
             List<T> loaded = new ArrayList<T>();
 
             // Find the children of the parent tag, if any
-            for (Element le : getTopLevelElementChildren(element, getParentTagName(), getLoaderTagName())) {T loadedChild = loadOne(le, mimeTypes, loader);
+            for (Element le : getTopLevelElementChildren(element, getParentTagName(), getLoaderTagName())) {
+                T loadedChild = loadOne(le, mimeTypes, loader);
                 if (loadedChild != null) loaded.add(loadedChild);
             }
 
@@ -742,7 +743,7 @@ public class TikaConfig {
                                 excludeChildren.add(loader.getServiceClass(getLoaderClass(), exclName));
                             } catch (ClassNotFoundException e) {
                                 //TIKA-3268 -- This should stop the world.
-                                throw new TikaConfigException("Class now found in -exclude list: " + exclName);
+                                throw new TikaConfigException("Class not found in -exclude list: " + exclName);
                             }
                         }
                     }
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/ImagePreprocessor.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/ImagePreprocessor.java
index 9c84227..764f16e 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/ImagePreprocessor.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/ImagePreprocessor.java
@@ -33,6 +33,7 @@ import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
+import java.io.Serializable;
 import java.nio.charset.Charset;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -47,45 +48,14 @@ import java.util.Map;
 import java.util.concurrent.TimeUnit;
 import java.util.stream.Stream;
 
-class ImagePreprocessor {
+class ImagePreprocessor implements Serializable {
     private static final Map<String, Boolean> IMAGE_MAGICK_PRESENT = new HashMap<>();
     private static final Logger LOG = LoggerFactory.getLogger(TesseractOCRParser.class);
     private static final double MINIMUM_DESKEW_THRESHOLD = 1.0D;
 
-    public static boolean hasImageMagick(TesseractOCRConfig config) {
-        // Fetch where the config says to find ImageMagick Program
-        String ImageMagick = getImageMagickPath(config);
-
-        // Have we already checked for a copy of ImageMagick Program there?
-        if (IMAGE_MAGICK_PRESENT.containsKey(ImageMagick)) {
-            return IMAGE_MAGICK_PRESENT.get(ImageMagick);
-        }
-        //prevent memory bloat
-        if (IMAGE_MAGICK_PRESENT.size() > 100) {
-            IMAGE_MAGICK_PRESENT.clear();
-        }
-        //check that directory exists
-        if (!config.getImageMagickPath().isEmpty() &&
-                ! Files.isDirectory(Paths.get(config.getImageMagickPath()))) {
-            IMAGE_MAGICK_PRESENT.put(ImageMagick, false);
-            return false;
-        }
-
-        // Try running ImageMagick program from there, and see if it exists + works
-        String[] checkCmd = { ImageMagick };
-        boolean hasImageMagick = ExternalParser.check(checkCmd);
-        if (!hasImageMagick) {
-            LOG.warn("ImageMagick does not appear to be installed " +
-                    "(commandline: "+ImageMagick+")");
-        }
-        IMAGE_MAGICK_PRESENT.put(ImageMagick, hasImageMagick);
-
-        return hasImageMagick;
-    }
-
-
-    private static String getImageMagickPath(TesseractOCRConfig config) {
-        return config.getImageMagickPath() + getImageMagickProg();
+    private final String fullImageMagickPath;
+    ImagePreprocessor(String fullImageMagickPath) {
+        this.fullImageMagickPath = fullImageMagickPath;
     }
 
 
@@ -100,10 +70,7 @@ class ImagePreprocessor {
 
         if (config.isEnableImageProcessing() || config.isApplyRotation() && angle != 0) {
             // process the image - parameter values can be set in TesseractOCRConfig.properties
-            CommandLine commandLine = new CommandLine(getImageMagickPath(config));
-            if (System.getProperty("os.name").startsWith("Windows")) {
-                commandLine.addArgument("convert");
-            }
+            CommandLine commandLine = new CommandLine(fullImageMagickPath);
 
             // Arguments for ImageMagick
             final List<String> density = Arrays.asList("-density", Integer.toString(config.getDensity()));
@@ -179,8 +146,5 @@ class ImagePreprocessor {
         return angle;
     }
 
-    public static String getImageMagickProg() {
-        return System.getProperty("os.name").startsWith("Windows") ?
-                "magick" : "convert";
-    }
+
 }
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index 6c78cc1..ad68c4e 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -16,39 +16,36 @@
  */
 package org.apache.tika.parser.ocr;
 
-import org.apache.commons.io.FilenameUtils;
+import org.apache.tika.exception.TikaException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
 import java.io.Serializable;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
+import java.lang.reflect.Field;
+import java.lang.reflect.Modifier;
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
-import java.util.Properties;
+import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 /**
  * Configuration for TesseractOCRParser.
+ * This class is not thread safe and must be synchronized externally.
  * <p>
- * This allows to enable TesseractOCRParser and set its parameters:
- * <p>
- * TesseractOCRConfig config = new TesseractOCRConfig();<br>
- * config.setTesseractPath(tesseractFolder);<br>
- * parseContext.set(TesseractOCRConfig.class, config);<br>
- * </p>
- * <p>
- * Parameters can also be set by either editing the existing TesseractOCRConfig.properties file in,
- * tika-parser/src/main/resources/org/apache/tika/parser/ocr, or overriding it by creating your own
- * and placing it in the package org/apache/tika/parser/ocr on the classpath.
+ * This class remembers every field that has been set through a set* method,
+ * and on {@link #cloneAndUpdate(TesseractOCRConfig)},
+ * it will update all the fields that have been set on the "update" config.
+ * So, for example, if you change language to "fra"
+ * from "eng" and then, on another parse,
+ * you change depth to 5 on the same update object,
+ * the language will not revert to "eng"; it will still be "fra".
+ * Create a new update config for each parse unless you're only changing the
+ * same field(s) with every parse.
  */
 public class TesseractOCRConfig implements Serializable {
 
@@ -67,14 +64,6 @@ public class TesseractOCRConfig implements Serializable {
         HOCR
     }
 
-    // Path to tesseract installation folder, if not on system path.
-    private String tesseractPath = "";
-
-    // Path to the 'tessdata' folder, which contains language files and config files.
-    private String tessdataPath = "";
-
-    private Path actualTessdataPath;
-
     // Language dictionary to be used.
     private String language = "eng";
 
@@ -88,7 +77,7 @@ public class TesseractOCRConfig implements Serializable {
     private long maxFileSizeToOcr = Integer.MAX_VALUE;
 
     // Maximum time (seconds) to wait for the ocring process termination
-    private int timeout = 120;
+    private int timeoutSeconds = 120;
 
     // The format of the ocr'ed output to be returned, txt or hocr.
     private OUTPUT_TYPE outputType = OUTPUT_TYPE.TXT;
@@ -96,9 +85,6 @@ public class TesseractOCRConfig implements Serializable {
     // enable image processing (optional)
     private boolean enableImageProcessing = false;
 
-    // Path to ImageMagick program, if not on system path.
-    private String imageMagickPath = "";
-
     // resolution of processed image (in dpi).
     private int density = 300;
 
@@ -125,128 +111,13 @@ public class TesseractOCRConfig implements Serializable {
     // whether or not to apply rotation calculated by the rotation.py script
     private boolean applyRotation = false;
 
+    // runtime switch to turn off OCR
+    private boolean skipOcr = false;
+
     // See addOtherTesseractConfig.
     private Map<String, String> otherTesseractConfig = new HashMap<>();
 
-
-    /**
-     * Default constructor.
-     */
-    public TesseractOCRConfig() {
-        init(this.getClass().getResourceAsStream("TesseractOCRConfig.properties"));
-    }
-
-    /**
-     * Loads properties from InputStream and then tries to close InputStream.
-     * If there is an IOException, this silently swallows the exception
-     * and goes back to the default.
-     *
-     * @param is
-     */
-    public TesseractOCRConfig(InputStream is) {
-        init(is);
-    }
-
-    private void init(InputStream is) {
-        if (is == null) {
-            return;
-        }
-        Properties props = new Properties();
-        try {
-            props.load(is);
-        } catch (IOException e) {
-        } finally {
-            if (is != null) {
-                try {
-                    is.close();
-                } catch (IOException e) {
-                    //swallow
-                }
-            }
-        }
-
-        // set parameters for Tesseract
-        setTesseractPath(
-                getProp(props, "tesseractPath", getTesseractPath()));
-        setTessdataPath(
-                getProp(props, "tessdataPath", getTessdataPath()));
-        setLanguage(
-                getProp(props, "language", getLanguage()));
-        setPageSegMode(
-                getProp(props, "pageSegMode", getPageSegMode()));
-        setMinFileSizeToOcr(
-                getProp(props, "minFileSizeToOcr", getMinFileSizeToOcr()));
-        setMaxFileSizeToOcr(
-                getProp(props, "maxFileSizeToOcr", getMaxFileSizeToOcr()));
-        setTimeout(
-                getProp(props, "timeout", getTimeout()));
-        setOutputType(getProp(props, "outputType", getOutputType().toString()));
-        setPreserveInterwordSpacing(getProp(props, "preserveInterwordSpacing", false));
-
-        // set parameters for ImageMagick
-        setEnableImageProcessing(
-                getProp(props, "enableImageProcessing", isEnableImageProcessing()));
-        setImageMagickPath(
-                getProp(props, "ImageMagickPath", getImageMagickPath()));
-        setDensity(
-                getProp(props, "density", getDensity()));
-        setDepth(
-                getProp(props, "depth", getDepth()));
-        setColorspace(
-                getProp(props, "colorspace", getColorspace()));
-        setFilter(
-                getProp(props, "filter", getFilter()));
-        setResize(
-                getProp(props, "resize", getResize()));
-        setApplyRotation(
-                getProp(props, "applyRotation", isApplyRotation()));
-
-        loadOtherTesseractConfig(props);
-    }
-
-    /**
-     * @see #setTesseractPath(String tesseractPath)
-     */
-    public String getTesseractPath() {
-        return tesseractPath;
-    }
-
-    /**
-     * Set the path to the Tesseract executable's directory, needed if it is not on system path.
-     * <p>
-     * Note that if you set this value, it is highly recommended that you also
-     * set the path to the 'tessdata' folder using {@link #setTessdataPath}.
-     * </p>
-     */
-    public void setTesseractPath(String tesseractPath) {
-
-        tesseractPath = FilenameUtils.normalize(tesseractPath);
-        if (!tesseractPath.isEmpty() && !tesseractPath.endsWith(File.separator))
-            tesseractPath += File.separator;
-
-        this.tesseractPath = tesseractPath;
-    }
-
-    /**
-     * @see #setTessdataPath(String tessdataPath)
-     */
-    public String getTessdataPath() {
-        return tessdataPath;
-    }
-
-    /**
-     * Set the path to the 'tessdata' folder, which contains language files and config files. In some cases (such
-     * as on Windows), this folder is found in the Tesseract installation, but in other cases
-     * (such as when Tesseract is built from source), it may be located elsewhere.
-     */
-    public void setTessdataPath(String tessdataPath) {
-        tessdataPath = FilenameUtils.normalize(tessdataPath);
-        if (!tessdataPath.isEmpty() && !tessdataPath.endsWith(File.separator))
-            tessdataPath += File.separator;
-
-        this.tessdataPath = tessdataPath;
-    }
-
+    private Set<String> userConfigured = new HashSet<>();
     /**
      * @see #setLanguage(String language)
      */
@@ -279,8 +150,6 @@ public class TesseractOCRConfig implements Serializable {
             // First, make sure it conforms to the correct syntax
             if (!lang.matches("([a-zA-Z]{3}(_[a-zA-Z]{3,4}){0,2})|script(/|\\\\)[A-Z][a-zA-Z_]+")) {
                 invalidCodes.add(lang + " (invalid syntax)");
-            } else if (!langExists(lang)) {
-                invalidCodes.add(lang + " (not found)");
             }
         }
         if (!invalidCodes.isEmpty()) {
@@ -288,30 +157,7 @@ public class TesseractOCRConfig implements Serializable {
                     "Invalid language code(s): " + invalidCodes);
         }
         this.language = language;
-    }
-    /**
-     * Check if tessdata language model exists
-     */
-    private boolean langExists(String lang) {
-        if (actualTessdataPath == null) {
-            // Use the same logic used in TesseractOCRParser.setEnv().
-            // If tessdataPath is not specified then use tesseractPath, if specified
-            if (!tessdataPath.isEmpty()) {
-                actualTessdataPath = Paths.get(tessdataPath);
-            } else if (!tesseractPath.isEmpty()) {
-                actualTessdataPath = Paths.get(tesseractPath, "tessdata");
-            } else {
-                // Neither path was specified, so we'll just assume
-                // the language is good and rely on Tesseract to tell us if there's a problem
-                return true;
-            }
-        }
-
-        if (!Files.isDirectory(actualTessdataPath)) {
-            throw new IllegalArgumentException(actualTessdataPath + " is not a directory");
-        }
-        String trainedDataName = lang + ".traineddata";
-        return Files.isRegularFile(actualTessdataPath.resolve(trainedDataName));
+        userConfigured.add("language");
     }
 
     /**
@@ -330,6 +176,7 @@ public class TesseractOCRConfig implements Serializable {
             throw new IllegalArgumentException("Invalid page segmentation mode");
         }
         this.pageSegMode = pageSegMode;
+        userConfigured.add("pageSegMode");
     }
 
     /**
@@ -354,6 +201,7 @@ public class TesseractOCRConfig implements Serializable {
                     "If you trust this value, set it with setTrustedPageSeparator");
         }
         setTrustedPageSeparator(pageSeparator);
+        userConfigured.add("pageSeparator");
     }
 
     /**
@@ -373,6 +221,7 @@ public class TesseractOCRConfig implements Serializable {
      */
     public void setPreserveInterwordSpacing(boolean preserveInterwordSpacing) {
         this.preserveInterwordSpacing = preserveInterwordSpacing;
+        userConfigured.add("preserveInterwordSpacing");
     }
 
     /**
@@ -395,6 +244,7 @@ public class TesseractOCRConfig implements Serializable {
      */
     public void setMinFileSizeToOcr(long minFileSizeToOcr) {
         this.minFileSizeToOcr = minFileSizeToOcr;
+        userConfigured.add("minFileSizeToOcr");
     }
 
     /**
@@ -410,22 +260,24 @@ public class TesseractOCRConfig implements Serializable {
      */
     public void setMaxFileSizeToOcr(long maxFileSizeToOcr) {
         this.maxFileSizeToOcr = maxFileSizeToOcr;
+        userConfigured.add("maxFileSizeToOcr");
     }
 
     /**
      * Set maximum time (seconds) to wait for the ocring process to terminate.
      * Default value is 120s.
      */
-    public void setTimeout(int timeout) {
-        this.timeout = timeout;
+    public void setTimeoutSeconds(int timeoutSeconds) {
+        this.timeoutSeconds = timeoutSeconds;
+        userConfigured.add("timeoutSeconds");
     }
 
     /**
      * @return timeout value for Tesseract
-     * @see #setTimeout(int timeout)
+     * @see #setTimeoutSeconds(int timeoutSeconds)
      */
-    public int getTimeout() {
-        return timeout;
+    public int getTimeoutSeconds() {
+        return timeoutSeconds;
     }
 
     /**
@@ -434,6 +286,7 @@ public class TesseractOCRConfig implements Serializable {
      */
     public void setOutputType(OUTPUT_TYPE outputType) {
         this.outputType = outputType;
+        userConfigured.add("outputType");
     }
 
     public void setOutputType(String outputType) {
@@ -448,8 +301,6 @@ public class TesseractOCRConfig implements Serializable {
         } else {
             throw new IllegalArgumentException("outputType must be either 'txt' or 'hocr'");
         }
-
-
     }
 
     /**
@@ -473,6 +324,7 @@ public class TesseractOCRConfig implements Serializable {
      */
     public void setEnableImageProcessing(boolean enableImageProcessing) {
         this.enableImageProcessing = enableImageProcessing;
+        userConfigured.add("enableImageProcessing");
     }
 
     /**
@@ -491,6 +343,7 @@ public class TesseractOCRConfig implements Serializable {
             throw new IllegalArgumentException("Invalid density value. Valid range of values is 150-1200.");
         }
         this.density = density;
+        userConfigured.add("density");
     }
 
     /**
@@ -509,6 +362,7 @@ public class TesseractOCRConfig implements Serializable {
         for (int allowedValue : allowedValues) {
             if (depth == allowedValue) {
                 this.depth = depth;
+                userConfigured.add("depth");
                 return;
             }
         }
@@ -534,6 +388,7 @@ public class TesseractOCRConfig implements Serializable {
             throw new IllegalArgumentException("colorspace must match this pattern: (?i)^[-_A-Z0-9]+$");
         }
         this.colorspace = colorspace;
+        userConfigured.add("colorspace");
     }
 
     /**
@@ -557,6 +412,7 @@ public class TesseractOCRConfig implements Serializable {
         for (String allowedFilter : allowedFilters) {
             if (filter.equalsIgnoreCase(allowedFilter)) {
                 this.filter = filter;
+                userConfigured.add("filter");
                 return;
             }
         }
@@ -565,6 +421,20 @@ public class TesseractOCRConfig implements Serializable {
     }
 
     /**
+     * If you want to turn off OCR at run time for a specific file,
+     * set this to <code>true</code>
+     * @param skipOcr <code>true</code> to skip OCR; <code>false</code> (the default) to run it
+     */
+    public void setSkipOcr(boolean skipOcr) {
+        this.skipOcr = skipOcr;
+        userConfigured.add("skipOcr");
+    }
+
+    public boolean isSkipOcr() {
+        return skipOcr;
+    }
+
+    /**
      * @return the resize
      */
     public int getResize() {
@@ -579,6 +449,7 @@ public class TesseractOCRConfig implements Serializable {
         for (int i = 1; i < 10; i++) {
             if (resize == i * 100) {
                 this.resize = resize;
+                userConfigured.add("resize");
                 return;
             }
         }
@@ -586,29 +457,6 @@ public class TesseractOCRConfig implements Serializable {
     }
 
     /**
-     * @return path to ImageMagick executable directory.
-     * @see #setImageMagickPath(String imageMagickPath)
-     */
-    public String getImageMagickPath() {
-
-        return imageMagickPath;
-    }
-
-    /**
-     * Set the path to the ImageMagick executable directory, needed if it is not on system path.
-     *
-     * @param imageMagickPath to ImageMagick executable directory.
-     */
-    public void setImageMagickPath(String imageMagickPath) {
-        imageMagickPath = FilenameUtils.normalize(imageMagickPath);
-        if (!imageMagickPath.isEmpty() && !imageMagickPath.endsWith(File.separator)) {
-            imageMagickPath += File.separator;
-        }
-        this.imageMagickPath = imageMagickPath;
-    }
-
-
-    /**
      * @return Whether or not a rotation value should be calculated and passed to ImageMagick before performing OCR.
      */
     public boolean isApplyRotation() {
@@ -622,6 +470,7 @@ public class TesseractOCRConfig implements Serializable {
      */
     public void setApplyRotation(boolean applyRotation) {
         this.applyRotation = applyRotation;
+        userConfigured.add("applyRotation");
     }
 
     /**
@@ -658,93 +507,43 @@ public class TesseractOCRConfig implements Serializable {
         if (!m.find()) {
             throw new IllegalArgumentException("Value contains illegal characters: " + value);
         }
-
         otherTesseractConfig.put(key.trim(), value.trim());
+        userConfigured.add("otherTesseractConfig");
     }
 
-    /**
-     * Get property from the properties file passed in.
-     *
-     * @param properties     properties file to read from.
-     * @param property       the property to fetch.
-     * @param defaultMissing default parameter to use.
-     * @return the value.
-     */
-    private int getProp(Properties properties, String property, int defaultMissing) {
-        String p = properties.getProperty(property);
-        if (p == null || p.isEmpty()) {
-            return defaultMissing;
-        }
-        try {
-            return Integer.parseInt(p);
-        } catch (Throwable ex) {
-            throw new RuntimeException(String.format(Locale.ROOT, "Cannot parse TesseractOCRConfig variable %s, invalid integer value",
-                    property), ex);
-        }
-    }
-
-    /**
-     * Get property from the properties file passed in.
-     *
-     * @param properties     properties file to read from.
-     * @param property       the property to fetch.
-     * @param defaultMissing default parameter to use.
-     * @return the value.
-     */
-    private long getProp(Properties properties, String property, long defaultMissing) {
-        String p = properties.getProperty(property);
-        if (p == null || p.isEmpty()) {
-            return defaultMissing;
-        }
-        try {
-            return Integer.parseInt(p);
-        } catch (Throwable ex) {
-            throw new RuntimeException(String.format(Locale.ROOT, "Cannot parse TesseractOCRConfig variable %s, invalid integer value",
-                    property), ex);
-        }
-    }
-
-
-    /**
-     * Get property from the properties file passed in.
-     *
-     * @param properties     properties file to read from.
-     * @param property       the property to fetch.
-     * @param defaultMissing default parameter to use.
-     * @return the value.
-     */
-    private String getProp(Properties properties, String property, String defaultMissing) {
-        return properties.getProperty(property, defaultMissing);
-    }
-
-    private boolean getProp(Properties properties, String property, boolean defaultMissing) {
-        String propVal = properties.getProperty(property);
-        if (propVal == null) {
-            return defaultMissing;
-        }
-        if (propVal.equalsIgnoreCase("true")) {
-            return true;
-        } else if (propVal.equalsIgnoreCase("false")) {
-            return false;
-        }
-
-        throw new RuntimeException(String.format(Locale.ROOT,
-                "Cannot parse TesseractOCRConfig variable %s, invalid boolean value: %s",
-                property, propVal));
-    }
-
-    /**
-     * Populate otherTesseractConfig from the given properties.
-     * This assumes that any key-value pair where the key contains
-     * an underscore is an option to be passed opaquely to Tesseract.
-     *
-     * @param properties properties file to read from.
-     */
-    private void loadOtherTesseractConfig(Properties properties) {
-        for (String k : properties.stringPropertyNames()) {
-            if (k.contains("_")) {
-                addOtherTesseractConfig(k, properties.getProperty(k));
+    public TesseractOCRConfig cloneAndUpdate(TesseractOCRConfig updates) throws TikaException {
+        TesseractOCRConfig updated = new TesseractOCRConfig();
+        for (Field field : this.getClass().getDeclaredFields()) {
+            if (Modifier.isFinal(field.getModifiers())) {
+                continue;
+            } else if (Modifier.isStatic(field.getModifiers())) {
+                continue;
+            }
+            if ("userConfigured".equals(field.getName())) {
+                continue;
+            }
+            if ("otherTesseractConfig".equals(field.getName())
+                    && updates.userConfigured.contains(field.getName())) {
+                //deep copy
+                for (Map.Entry<String, String> e : updates.getOtherTesseractConfig().entrySet()) {
+                    updated.addOtherTesseractConfig(e.getKey(), e.getValue());
+                }
+                continue;
+            }
+            if (updates.userConfigured.contains(field.getName())) {
+                try {
+                    field.set(updated, field.get(updates));
+                } catch (IllegalAccessException e) {
+                    throw new TikaException("can't update " + field.getName(), e);
+                }
+            } else {
+                try {
+                    field.set(updated, field.get(this));
+                } catch (IllegalAccessException e) {
+                    throw new TikaException("can't update " + field.getName(), e);
+                }
             }
         }
+        return updated;
     }
 }
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 54d9388..6aea516 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.parser.ocr;
 
+import org.apache.commons.io.FilenameUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.tika.config.Field;
 import org.apache.tika.config.Initializable;
@@ -33,6 +34,7 @@ import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.external.ExternalParser;
 import org.apache.tika.sax.OfflineContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
 import org.apache.tika.utils.XMLReaderUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -60,19 +62,12 @@ import java.nio.file.StandardCopyOption;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
-import java.util.HashMap;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ExecutionException;
-import java.util.concurrent.Future;
-import java.util.concurrent.FutureTask;
 import java.util.concurrent.TimeUnit;
-import java.util.concurrent.TimeoutException;
-import java.util.concurrent.atomic.AtomicBoolean;
-import java.util.concurrent.atomic.AtomicInteger;
 
 import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
@@ -88,13 +83,12 @@ import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
  * config.setTesseractPath(tesseractFolder);<br>
  * parseContext.set(TesseractOCRConfig.class, config);<br>
  * </p>
- *
- *
  */
-public class TesseractOCRParser extends AbstractParser {
+public class TesseractOCRParser extends AbstractParser implements Initializable {
+
     public static final String TESS_META = "tess:";
-    public static final Property IMAGE_ROTATION = Property.externalRealSeq(TESS_META+"rotation");
-    public static final Property IMAGE_MAGICK = Property.externalBooleanSeq(TESS_META+"image_magick_processed");
+    public static final Property IMAGE_ROTATION = Property.externalRealSeq(TESS_META + "rotation");
+    public static final Property IMAGE_MAGICK = Property.externalBooleanSeq(TESS_META + "image_magick_processed");
     private static final String OCR = "ocr-";
     private static final Logger LOG = LoggerFactory.getLogger(TesseractOCRParser.class);
 
@@ -105,79 +99,103 @@ public class TesseractOCRParser extends AbstractParser {
     private static final long serialVersionUID = -8167538283213097265L;
     private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
             new HashSet<>(Arrays.asList(new MediaType[]{
-                    MediaType.image(OCR+"png"),
-                    MediaType.image(OCR+"jpeg"),
-                    MediaType.image(OCR+"tiff"),
-                    MediaType.image(OCR+"bmp"),
-                    MediaType.image(OCR+"gif"),
+                    MediaType.image(OCR + "png"),
+                    MediaType.image(OCR + "jpeg"),
+                    MediaType.image(OCR + "tiff"),
+                    MediaType.image(OCR + "bmp"),
+                    MediaType.image(OCR + "gif"),
                     //these are not currently covered by other parsers
                     MediaType.image("jp2"),
                     MediaType.image("jpx"),
                     MediaType.image("x-portable-pixmap"),
                     //add the ocr- versions as well
-                    MediaType.image(OCR+"jp2"),
-                    MediaType.image(OCR+"jpx"),
-                    MediaType.image(OCR+"x-portable-pixmap"),
+                    MediaType.image(OCR + "jp2"),
+                    MediaType.image(OCR + "jpx"),
+                    MediaType.image(OCR + "x-portable-pixmap"),
 
             })));
+
+    private String tesseractPath = "";
+    private String tessdataPath = "";
+    private String imageMagickPath = "";
+    //if a user specifies a custom tess path or tessdata path
+    //then the available languages are loaded at initialization time
+    private final Set<String> langs = new HashSet<>();
+
     private final TesseractOCRConfig defaultConfig = new TesseractOCRConfig();
 
-    private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<>();
-    static final ImagePreprocessor IMAGE_PREPROCESSOR = new ImagePreprocessor();
+    private boolean hasTesseract;
+    private boolean hasImageMagick;
+    private ImagePreprocessor imagePreprocessor;
 
     @Override
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         // If Tesseract is installed, offer our supported image types
-        TesseractOCRConfig config = context.get(TesseractOCRConfig.class, defaultConfig);
-        if (hasTesseract(config)) {
-            return SUPPORTED_TYPES;
+        TesseractOCRConfig config = context.get(TesseractOCRConfig.class);
+        if (hasTesseract) {
+            if (config == null || !config.isSkipOcr()) {
+                return SUPPORTED_TYPES;
+            }
         }
         // Otherwise don't advertise anything, so the other image parsers
         //  can be selected instead
         return Collections.emptySet();
     }
 
-    private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
+    private void setEnv(ProcessBuilder pb) {
         String tessdataPrefix = "TESSDATA_PREFIX";
         Map<String, String> env = pb.environment();
 
-        if (!config.getTessdataPath().isEmpty()) {
-            env.put(tessdataPrefix, config.getTessdataPath());
-        }
-        else if(!config.getTesseractPath().isEmpty()) {
-            env.put(tessdataPrefix, config.getTesseractPath());
+        if (!StringUtils.isBlank(getTessdataPath())) {
+            env.put(tessdataPrefix, getTessdataPath());
+        } else if (!StringUtils.isBlank(getTesseractPath())) {
+            env.put(tessdataPrefix, getTesseractPath());
         }
     }
 
-    public boolean hasTesseract(TesseractOCRConfig config) {
+    public boolean hasTesseract() throws TikaConfigException {
         // Fetch where the config says to find Tesseract
-        String tesseract = config.getTesseractPath() + getTesseractProg();
-
-        // Have we already checked for a copy of Tesseract there?
-        if (TESSERACT_PRESENT.containsKey(tesseract)) {
-            return TESSERACT_PRESENT.get(tesseract);
-        }
-        //prevent memory bloat
-        if (TESSERACT_PRESENT.size() > 100) {
-            TESSERACT_PRESENT.clear();
-        }
-        //check that the parent directory exists
-        if (! config.getTesseractPath().isEmpty() &&
-                ! Files.isDirectory(Paths.get(config.getTesseractPath()))) {
-            TESSERACT_PRESENT.put(tesseract, false);
-            LOG.warn("You haven't specified an existing directory in " +
-                    "which the tesseract binary should be found: " +
-                    "(path:" + config.getTesseractPath()+")");
-            return false;
+        String tesseract = getTesseractPath() + getTesseractProg();
+
+        if (!StringUtils.isBlank(tesseractPath) &&
+                !Files.isDirectory(Paths.get(tesseractPath))) {
+            throw new TikaConfigException("tesseractPath (" + tesseractPath + ") " +
+                    "doesn't point to an existing directory");
         }
 
         // Try running Tesseract from there, and see if it exists + works
-        String[] checkCmd = { tesseract };
+        String[] checkCmd = {tesseract};
         boolean hasTesseract = ExternalParser.check(checkCmd);
-        LOG.debug("hasTesseract (path: "+checkCmd+"): "+hasTesseract);
-        TESSERACT_PRESENT.put(tesseract, hasTesseract);
+        LOG.debug("hasTesseract (path: " + checkCmd + "): " + hasTesseract);
         return hasTesseract;
-     
+    }
+
+    boolean hasImageMagick() throws TikaConfigException {
+        // Fetch where the config says to find ImageMagick Program
+        String fullImageMagickPath = imageMagickPath + getImageMagickProg();
+
+        //check that directory exists
+        if (!StringUtils.isBlank(imageMagickPath) &&
+                !Files.isDirectory(Paths.get(imageMagickPath))) {
+            throw new TikaConfigException("imageMagickPath (" + imageMagickPath + ") " +
+                    "doesn't point to an existing directory");
+        }
+
+        // Try running ImageMagick program from there, and see if it exists + works
+        String[] checkCmd = {fullImageMagickPath};
+        boolean hasImageMagick = ExternalParser.check(checkCmd);
+        if (!hasImageMagick) {
+            LOG.debug("ImageMagick does not appear to be installed " +
+                    "(commandline: " + fullImageMagickPath + ")");
+        }
+
+        return hasImageMagick;
+
+    }
+
+    public static String getImageMagickProg() {
+        return System.getProperty("os.name").startsWith("Windows") ?
+                "magick" : "convert";
     }
 
     public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException,
@@ -202,13 +220,18 @@ public class TesseractOCRParser extends AbstractParser {
     @Override
     public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext parseContext)
             throws IOException, SAXException, TikaException {
-        TesseractOCRConfig config = parseContext.get(TesseractOCRConfig.class, defaultConfig);
 
+        TesseractOCRConfig userConfig = parseContext.get(TesseractOCRConfig.class);
+        TesseractOCRConfig config = defaultConfig;
+        if (userConfig != null) {
+            config = defaultConfig.cloneAndUpdate(userConfig);
+        }
         // If Tesseract is not on the path with the current config, do not try to run OCR
         // getSupportedTypes shouldn't have listed us as handling it, so this should only
         //  occur if someone directly calls this parser, not via DefaultParser or similar
-        if (! hasTesseract(config))
+        if (!hasTesseract || (config != null && config.isSkipOcr())) {
             return;
+        }
 
         TemporaryResources tmp = new TemporaryResources();
         try {
@@ -228,7 +251,7 @@ public class TesseractOCRParser extends AbstractParser {
             tmp.dispose();
         }
     }
-    
+
     private void parse(TikaInputStream tikaInputStream, File tmpOCROutputFile,
                        ContentHandler xhtml, Metadata metadata, ParseContext parseContext,
                        TesseractOCRConfig config)
@@ -241,9 +264,9 @@ public class TesseractOCRParser extends AbstractParser {
 
             if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) {
 
-            	// Process image
-            	if (config.isEnableImageProcessing() || config.isApplyRotation()) {
-                    if (! ImagePreprocessor.hasImageMagick(config)) {
+                // Process image
+                if (config.isEnableImageProcessing() || config.isApplyRotation()) {
+                    if (!hasImageMagick) {
                         LOG.warn("User has selected to preprocess images, but I can't find ImageMagick." +
                                 "Backing off to original file.");
                         doOCR(input.toFile(), tmpOCROutputFile, config);
@@ -254,11 +277,11 @@ public class TesseractOCRParser extends AbstractParser {
                         try (TemporaryResources tmp = new TemporaryResources()) {
                             Path tmpFile = tmp.createTempFile();
                             Files.copy(input, tmpFile, StandardCopyOption.REPLACE_EXISTING);
-                            IMAGE_PREPROCESSOR.process(tmpFile, tmpFile, metadata, config);
+                            imagePreprocessor.process(tmpFile, tmpFile, metadata, config);
                             doOCR(tmpFile.toFile(), tmpOCROutputFile, config);
                         }
                     }
-            	} else {
+                } else {
                     doOCR(input.toFile(), tmpOCROutputFile, config);
                 }
 
@@ -292,20 +315,20 @@ public class TesseractOCRParser extends AbstractParser {
     /**
      * Run external tesseract-ocr process.
      *
-     * @param input
-     *          File to be ocred
-     * @param output
-     *          File to collect ocr result
-     * @param config
-     *          Configuration of tesseract-ocr engine
-     * @throws TikaException
-     *           if the extraction timed out
-     * @throws IOException
-     *           if an input error occurred
+     * @param input  File to be ocred
+     * @param output File to collect ocr result
+     * @param config Configuration of tesseract-ocr engine
+     * @throws TikaException if the extraction timed out
+     * @throws IOException   if an input error occurred
      */
     private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
+        if (langs.size() > 0 && ! langs.contains(config.getLanguage())) {
+            throw new IllegalArgumentException("Couldn't find language "+
+                    config.getLanguage() +" upon initialization. I did find: "
+                    + langs);
+        }
         ArrayList<String> cmd = new ArrayList<>(Arrays.asList(
-                config.getTesseractPath() + getTesseractProg(), input.getPath(),  output.getPath(), "-l",
+                getTesseractPath().toString() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
                 config.getLanguage(), "--psm", config.getPageSegMode()
         ));
         for (Map.Entry<String, String> entry : config.getOtherTesseractConfig().entrySet()) {
@@ -315,18 +338,18 @@ public class TesseractOCRParser extends AbstractParser {
         cmd.addAll(Arrays.asList(
                 "-c", "page_separator=" + config.getPageSeparator(),
                 "-c",
-                (config.isPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0",
+                (config.isPreserveInterwordSpacing()) ? "preserve_interword_spaces=1" : "preserve_interword_spaces=0",
                 config.getOutputType().name().toLowerCase(Locale.US)
         ));
         LOG.debug("Tesseract command: " + String.join(" ", cmd));
-        
+
         ProcessBuilder pb = new ProcessBuilder(cmd);
-        setEnv(config, pb);
+        setEnv(pb);
 
         Process process = null;
         try {
             process = pb.start();
-            runOCRProcess(process, config.getTimeout());
+            runOCRProcess(process, config.getTimeoutSeconds());
         } finally {
             if (process != null) {
                 process.destroyForcibly();
@@ -346,7 +369,7 @@ public class TesseractOCRParser extends AbstractParser {
         int exitValue = Integer.MIN_VALUE;
         try {
             boolean finished = process.waitFor(timeout, TimeUnit.SECONDS);
-            if (! finished) {
+            if (!finished) {
                 throw new TikaException("TesseractOCRParser timeout");
             }
             exitValue = process.exitValue();
@@ -359,7 +382,7 @@ public class TesseractOCRParser extends AbstractParser {
         }
         if (exitValue > 0) {
             throw new TikaException("TesseractOCRParser bad exit value " +
-                    exitValue + " err msg: "+errBuilder.toString());
+                    exitValue + " err msg: " + errBuilder.toString());
         }
 
     }
@@ -368,14 +391,10 @@ public class TesseractOCRParser extends AbstractParser {
      * Reads the contents of the given stream and write it to the given XHTML
      * content handler. The stream is closed once fully processed.
      *
-     * @param stream
-     *          Stream where is the result of ocr
-     * @param xhtml
-     *          XHTML content handler
-     * @throws SAXException
-     *           if the XHTML SAX events could not be handled
-     * @throws IOException
-     *           if an input error occurred
+     * @param stream Stream containing the OCR result
+     * @param xhtml  XHTML content handler
+     * @throws SAXException if the XHTML SAX events could not be handled
+     * @throws IOException  if an input error occurred
      */
     private void extractOutput(InputStream stream, ContentHandler xhtml) throws SAXException, IOException {
         //        <div class="ocr"
@@ -437,7 +456,20 @@ public class TesseractOCRParser extends AbstractParser {
         return System.getProperty("os.name").startsWith("Windows") ? "tesseract.exe" : "tesseract";
     }
 
+    @Override
+    public void initialize(Map<String, Param> params) throws TikaConfigException {
+        //no-op
+    }
 
+    @Override
+    public void checkInitialization(InitializableProblemHandler problemHandler)
+            throws TikaConfigException {
+        hasTesseract = hasTesseract();
+        hasImageMagick = hasImageMagick();
+        loadLangs();
+        imagePreprocessor = new ImagePreprocessor(
+                getImageMagickPath()+getImageMagickProg());
+    }
 
     private static class HOCRPassThroughHandler extends DefaultHandler {
         private final ContentHandler xhtml;
@@ -506,14 +538,77 @@ public class TesseractOCRParser extends AbstractParser {
         HAS_WARNED = true;
     }
 
+    /**
+     * Set the path to the Tesseract executable's directory, needed if it is not on system path.
+     * <p>
+     * Note that if you set this value, it is highly recommended that you also
+     * set the path to the 'tessdata' folder using {@link #setTessdataPath}.
+     * </p>
+     */
     @Field
     public void setTesseractPath(String tesseractPath) {
-        defaultConfig.setTesseractPath(tesseractPath);
+        tesseractPath = FilenameUtils.normalize(tesseractPath);
+        if (!tesseractPath.isEmpty() && !tesseractPath.endsWith(File.separator)) {
+            tesseractPath += File.separator;
+        }
+        this.tesseractPath = tesseractPath;
+    }
+
+    public String getTesseractPath() {
+        return tesseractPath;
     }
 
+    /**
+     * Set the path to the 'tessdata' folder, which contains language files and config files. In some cases (such
+     * as on Windows), this folder is found in the Tesseract installation, but in other cases
+     * (such as when Tesseract is built from source), it may be located elsewhere.
+     */
     @Field
     public void setTessdataPath(String tessdataPath) {
-        defaultConfig.setTessdataPath(tessdataPath);
+        tessdataPath = FilenameUtils.normalize(tessdataPath);
+        if (!tessdataPath.isEmpty() && !tessdataPath.endsWith(File.separator))
+            tessdataPath += File.separator;
+
+        this.tessdataPath = tessdataPath;
+    }
+
+    public String getTessdataPath() {
+        return this.tessdataPath;
+    }
+
+    /**
+     * Set the path to the ImageMagick executable directory, needed if it is not on system path.
+     *
+     * @param imageMagickPath path to the ImageMagick executable directory.
+     */
+    @Field
+    public void setImageMagickPath(String imageMagickPath) {
+        imageMagickPath = FilenameUtils.normalize(imageMagickPath);
+        if (!imageMagickPath.isEmpty() && !imageMagickPath.endsWith(File.separator)) {
+            imageMagickPath += File.separator;
+        }
+        this.imageMagickPath = imageMagickPath;
+    }
+
+    public String getImageMagickPath() {
+        return imageMagickPath;
+    }
+
+    @Field
+    public void setOtherTesseractSettings(List<String> settings) throws TikaConfigException {
+        for (String s : settings) {
+            String[] bits = s.trim().split("\\s+");
+            if (bits.length != 2) {
+                throw new TikaConfigException("Expected space delimited key value pair."+
+                        " However, I found "+bits.length+" bits.");
+            }
+            defaultConfig.addOtherTesseractConfig(bits[0], bits[1]);
+        }
+    }
+
+    @Field
+    public void setSkipOCR(boolean skipOCR) {
+        defaultConfig.setSkipOcr(skipOCR);
     }
 
     @Field
@@ -538,7 +633,7 @@ public class TesseractOCRParser extends AbstractParser {
 
     @Field
     public void setTimeout(int timeout) {
-        defaultConfig.setTimeout(timeout);
+        defaultConfig.setTimeoutSeconds(timeout);
     }
 
     @Field
@@ -557,11 +652,6 @@ public class TesseractOCRParser extends AbstractParser {
     }
 
     @Field
-    public void setImageMagickPath(String imageMagickPath) {
-        defaultConfig.setImageMagickPath(imageMagickPath);
-    }
-
-    @Field
     public void setDensity(int density) {
         defaultConfig.setDensity(density);
     }
@@ -594,5 +684,37 @@ public class TesseractOCRParser extends AbstractParser {
     public TesseractOCRConfig getDefaultConfig() {
         return defaultConfig;
     }
+
+    private void loadLangs() throws TikaConfigException {
+
+        if (! hasTesseract) {
+            return;
+        }
+
+        Path actualTessdataPath = null;
+        if (!tessdataPath.isEmpty()) {
+            actualTessdataPath = Paths.get(tessdataPath);
+        } else if (!tesseractPath.isEmpty()) {
+            actualTessdataPath = Paths.get(tesseractPath, "tessdata");
+        } else {
+            return;
+        }
+        if (! Files.isDirectory(actualTessdataPath)) {
+            throw new TikaConfigException(actualTessdataPath + " is not a directory");
+        }
+        for (File f : actualTessdataPath.toFile().listFiles()) {
+            if (f.isFile() && f.getName().endsWith(".traineddata")) {
+                String lang = f.getName().replace(".traineddata", "");
+                langs.add(lang);
+            }
+        }
+        if (langs.size() == 0) {
+            throw new TikaConfigException("Could not identify any languages (files ending in .traineddata) "+
+                    " in: "+actualTessdataPath.toAbsolutePath());
+        } else if (LOG.isDebugEnabled()) {
+            LOG.debug("found langs: "+langs);
+        }
+    }
+
 }
 
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
deleted file mode 100644
index 7eb4792..0000000
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
+++ /dev/null
@@ -1,38 +0,0 @@
-#  Licensed to the Apache Software Foundation (ASF) under one or more
-#  contributor license agreements.  See the NOTICE file distributed with
-#  this work for additional information regarding copyright ownership.
-#  The ASF licenses this file to You under the Apache License, Version 2.0
-#  (the "License"); you may not use this file except in compliance with
-#  the License.  You may obtain a copy of the License at
-#
-#       http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-# Tesseract properties
-tesseractPath=
-language=eng
-pageSegMode=1
-maxFileSizeToOcr=2147483647
-minFileSizeToOcr=0
-timeout=120
-#txt or hocr
-outputType=txt
-preserveInterwordSpacing=false
-
-# If true, correct image rotation
-applyRotation=false
-
-# properties for image pre-processing
-# to enable pre-processing, set enableImageProcessing to true.  Requires ImageMagick
-enableImageProcessing=false
-ImageMagickPath=
-density=300
-depth=4
-colorspace=gray
-filter=triangle
-resize=200
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
index 59009aa..02fc149 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
@@ -16,11 +16,12 @@
  */
 package org.apache.tika.parser.ocr;
 
-import org.apache.commons.lang3.SystemUtils;
 import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.CompositeDetector;
+import org.apache.tika.parser.CompositeParser;
 import org.junit.Test;
 
-import java.io.File;
 import java.io.InputStream;
 import java.util.Arrays;
 import java.util.List;
@@ -34,14 +35,11 @@ public class TesseractOCRConfigTest extends TikaTest {
     @Test
     public void testNoConfig() throws Exception {
         TesseractOCRConfig config = new TesseractOCRConfig();
-        assertEquals("Invalid default tesseractPath value", "", config.getTesseractPath());
-        assertEquals("Invalid default tessdataPath value", "", config.getTessdataPath());
         assertEquals("Invalid default language value", "eng", config.getLanguage());
         assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode());
         assertEquals("Invalid default minFileSizeToOcr value", 0, config.getMinFileSizeToOcr());
         assertEquals("Invalid default maxFileSizeToOcr value", Integer.MAX_VALUE, config.getMaxFileSizeToOcr());
-        assertEquals("Invalid default timeout value", 120, config.getTimeout());  
-        assertEquals("Invalid default ImageMagickPath value", "", config.getImageMagickPath());
+        assertEquals("Invalid default timeout value", 120, config.getTimeoutSeconds());
         assertEquals("Invalid default density value", 300 , config.getDensity());
         assertEquals("Invalid default depth value", 4 , config.getDepth());
         assertEquals("Invalid default colorpsace value", "gray" , config.getColorspace());
@@ -53,17 +51,16 @@ public class TesseractOCRConfigTest extends TikaTest {
     @Test
     public void testPartialConfig() throws Exception {
 
-        InputStream stream = getResourceAsStream("/test-properties/TesseractOCRConfig-partial.properties");
+        InputStream stream = getResourceAsStream("/test-configs/tika-config-tesseract-partial.xml");
 
-        TesseractOCRConfig config = new TesseractOCRConfig(stream);
-        assertEquals("Invalid default tesseractPath value", "", config.getTesseractPath());
-        assertEquals("Invalid default tessdataPath value", "", config.getTessdataPath());
+        TesseractOCRParser parser = (TesseractOCRParser)
+                ((CompositeParser)new TikaConfig(stream).getParser()).getAllComponentParsers().get(0);
+        TesseractOCRConfig config = parser.getDefaultConfig();
         assertEquals("Invalid overridden language value", "fra+deu", config.getLanguage());
         assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode());
         assertEquals("Invalid overridden minFileSizeToOcr value", 1, config.getMinFileSizeToOcr());
         assertEquals("Invalid default maxFileSizeToOcr value", Integer.MAX_VALUE, config.getMaxFileSizeToOcr());
-        assertEquals("Invalid overridden timeout value", 240, config.getTimeout());
-        assertEquals("Invalid default ImageMagickPath value", "", config.getImageMagickPath());
+        assertEquals("Invalid overridden timeout value", 240, config.getTimeoutSeconds());
         assertEquals("Invalid overridden density value", 200 , config.getDensity());
         assertEquals("Invalid overridden depth value", 8 , config.getDepth());
         assertEquals("Invalid overridden filter value", "box" , config.getFilter());	
@@ -74,19 +71,16 @@ public class TesseractOCRConfigTest extends TikaTest {
     @Test
     public void testFullConfig() throws Exception {
 
-        InputStream stream = getResourceAsStream("/test-properties/TesseractOCRConfig-full.properties");
+        InputStream stream = getResourceAsStream("/test-configs/tika-config-tesseract-full.xml");
 
-        TesseractOCRConfig config = new TesseractOCRConfig(stream);
-        if(SystemUtils.IS_OS_UNIX) {
-        	//assertEquals("Invalid overridden tesseractPath value", "/opt/tesseract" + File.separator, config.getTesseractPath());
-            //assertEquals("Invalid overridden tesseractPath value", "/usr/local/share" + File.separator, config.getTessdataPath());
-        	assertEquals("Invalid overridden ImageMagickPath value", "/usr/local/bin/", config.getImageMagickPath());
-        }
-        assertEquals("Invalid overridden language value", "eng", config.getLanguage());
+        TesseractOCRParser parser = (TesseractOCRParser)
+                ((CompositeParser)new TikaConfig(stream).getParser()).getAllComponentParsers().get(0);
+        TesseractOCRConfig config = parser.getDefaultConfig();
+        assertEquals("Invalid overridden language value", "ceb", config.getLanguage());
         assertEquals("Invalid overridden pageSegMode value", "2", config.getPageSegMode());
         assertEquals("Invalid overridden minFileSizeToOcr value", 1, config.getMinFileSizeToOcr());
         assertEquals("Invalid overridden maxFileSizeToOcr value", 2000000, config.getMaxFileSizeToOcr());
-        assertEquals("Invalid overridden timeout value", 240, config.getTimeout());
+        assertEquals("Invalid overridden timeout value", 240, config.getTimeoutSeconds());
         assertEquals("Invalid overridden density value", 200 , config.getDensity());
         assertEquals("Invalid overridden depth value", 8 , config.getDepth());
         assertEquals("Invalid overridden filter value", "box" , config.getFilter());
@@ -172,14 +166,14 @@ public class TesseractOCRConfigTest extends TikaTest {
 
     @Test(expected=IllegalArgumentException.class)
     public void testDataPathCheck() {
-        TesseractOCRConfig config = new TesseractOCRConfig();
-        config.setTessdataPath("blah\u0000deblah");
+        TesseractOCRParser parser = new TesseractOCRParser();
+        parser.setTessdataPath("blah\u0000deblah");
     }
 
     @Test(expected=IllegalArgumentException.class)
     public void testPathCheck() {
-        TesseractOCRConfig config = new TesseractOCRConfig();
-        config.setTesseractPath("blah\u0000deblah");
+        TesseractOCRParser parser = new TesseractOCRParser();
+        parser.setTesseractPath("blah\u0000deblah");
     }
 
     @Test(expected=IllegalArgumentException.class)
@@ -213,38 +207,10 @@ public class TesseractOCRConfigTest extends TikaTest {
         config.addOtherTesseractConfig("good", "good");
     }
 
-    @Test
-    public void testBogusPathCheck() {
-        //allow path that doesn't actually exist
-        TesseractOCRConfig config = new TesseractOCRConfig();
-        config.setTesseractPath("blahdeblahblah");
-        assertEquals("blahdeblahblah"+File.separator, config.getTesseractPath());
-    }
-
-    @Test
-    public void testTrailingSlashInPathBehavior() {
-
-        TesseractOCRConfig config = new TesseractOCRConfig();
-        config.setTesseractPath("blah");
-        assertEquals("blah"+File.separator, config.getTesseractPath());
-        config.setTesseractPath("blah"+File.separator);
-        assertEquals("blah"+File.separator, config.getTesseractPath());
-        config.setTesseractPath("");
-        assertEquals("", config.getTesseractPath());
-
-        config.setTessdataPath("blahdata");
-        assertEquals("blahdata"+File.separator, config.getTessdataPath());
-        config.setTessdataPath("blahdata"+File.separator);
-        assertEquals("blahdata"+File.separator, config.getTessdataPath());
-        config.setTessdataPath("");
-        assertEquals("", config.getTessdataPath());
-
-        config.setImageMagickPath("imagemagickpath");
-        assertEquals("imagemagickpath"+File.separator, config.getImageMagickPath());
-        config.setImageMagickPath("imagemagickpath"+File.separator);
-        assertEquals("imagemagickpath"+File.separator, config.getImageMagickPath());
-        config.setImageMagickPath("");
-        assertEquals("", config.getImageMagickPath());
+    @Test (expected = IllegalArgumentException.class)
+    public void testBadLanguageCode() throws Exception {
+        TesseractOCRConfig tesseractOCRConfig = new TesseractOCRConfig();
+        tesseractOCRConfig.setLanguage("kerplekistani");
     }
 
     @Test(expected=IllegalArgumentException.class)
@@ -252,4 +218,29 @@ public class TesseractOCRConfigTest extends TikaTest {
         TesseractOCRConfig config = new TesseractOCRConfig();
         config.setColorspace("someth!ng");
     }
+
+    @Test
+    public void testUpdatingConfigs() throws Exception {
+        TesseractOCRConfig configA = new TesseractOCRConfig();
+        configA.setLanguage("eng");
+        configA.setMinFileSizeToOcr(100);
+        configA.setOutputType(TesseractOCRConfig.OUTPUT_TYPE.TXT);
+        configA.addOtherTesseractConfig("k1", "a1");
+        configA.addOtherTesseractConfig("k2", "a2");
+
+        TesseractOCRConfig configB = new TesseractOCRConfig();
+        configB.setLanguage("fra");
+        configB.setMinFileSizeToOcr(1000);
+        configB.setOutputType(TesseractOCRConfig.OUTPUT_TYPE.HOCR);
+        configB.addOtherTesseractConfig("k1", "b1");
+        configB.addOtherTesseractConfig("k2", "b2");
+
+        TesseractOCRConfig clone = configA.cloneAndUpdate(configB);
+        assertEquals("fra", clone.getLanguage());
+        assertEquals(1000, clone.getMinFileSizeToOcr());
+        assertEquals(TesseractOCRConfig.OUTPUT_TYPE.HOCR,
+                clone.getOutputType());
+        assertEquals("b1", clone.getOtherTesseractConfig().get("k1"));
+        assertEquals("b2", clone.getOtherTesseractConfig().get("k2"));
+    }
 }
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 0124109..4c6ca30 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -18,13 +18,12 @@ package org.apache.tika.parser.ocr;
 
 import org.apache.tika.TikaTest;
 import org.apache.tika.config.TikaConfig;
-import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.external.ExternalParser;
 import org.apache.tika.parser.image.BPGParser;
 import org.apache.tika.parser.image.HeifParser;
 import org.apache.tika.parser.image.ICNSParser;
@@ -36,6 +35,7 @@ import org.apache.tika.parser.image.WebPParser;
 import org.junit.Assert;
 import org.junit.Test;
 
+import java.io.File;
 import java.io.InputStream;
 import java.util.HashSet;
 import java.util.Set;
@@ -49,26 +49,11 @@ import static org.junit.Assume.assumeTrue;
 
 public class TesseractOCRParserTest extends TikaTest {
 
-    public static boolean canRun() {
-        TesseractOCRConfig config = new TesseractOCRConfig();
-        TesseractOCRParserTest tesseractOCRTest = new TesseractOCRParserTest();
-        return tesseractOCRTest.canRun(config);
-    }
-
-    private boolean canRun(TesseractOCRConfig config) {
-        String[] checkCmd = {config.getTesseractPath() + TesseractOCRParser.getTesseractProg()};
-        // If Tesseract is not on the path, do not run the test.
-        return ExternalParser.check(checkCmd);
+    public static boolean canRun() throws TikaConfigException {
+        TesseractOCRParser p = new TesseractOCRParser();
+        return p.hasTesseract();
     }
 
-    @Test
-    public void testImageMagick() throws Exception {
-        //TODO -- figure out what the original intention was for this test or remove it.
-        TesseractOCRConfig config = new TesseractOCRConfig();
-        assumeTrue(TesseractOCRParser.IMAGE_PREPROCESSOR.hasImageMagick(config));
-        String[] CheckCmd = {config.getImageMagickPath() + TesseractOCRParser.IMAGE_PREPROCESSOR.getImageMagickProg()};
-        assertTrue(ExternalParser.check(CheckCmd));
-    }
 
     @Test
     public void testInterwordSpacing() throws Exception {
@@ -93,22 +78,7 @@ public class TesseractOCRParserTest extends TikaTest {
         assertTrue(m.find());
     }
 
-    @Test (expected = TikaException.class)
-    public void testBadLanguageCode() throws Exception {
-        assumeTrue("can run OCR", canRun());
 
-        TesseractOCRConfig tesseractOCRConfigconfig = new TesseractOCRConfig();
-        tesseractOCRConfigconfig.setLanguage("zzz");
-        ParseContext parseContext = new ParseContext();
-        parseContext.set(TesseractOCRConfig.class, tesseractOCRConfigconfig);
-
-        //with preserve interwordspacing "on"
-        //allow some flexibility in case Tesseract is computing spaces
-        //somewhat differently in different versions/OS's, etc.
-        String xml = getXML("testOCR_spacing.png",
-                getMetadata(MediaType.image("png")),
-                parseContext).xml;
-    }
 
     private Metadata getMetadata(MediaType mediaType) {
         Metadata metadata = new Metadata();
@@ -138,14 +108,27 @@ public class TesseractOCRParserTest extends TikaTest {
     }
 
     @Test
+    public void confirmRuntimeSkipOCR() throws Exception {
+        assumeTrue("can run OCR", canRun()); // test is skipped, not failed, when canRun() is false
+        TesseractOCRConfig config = new TesseractOCRConfig();
+        config.setSkipOcr(true); // skipOcr is set on a per-parse config, not on the parser itself
+        ParseContext context = new ParseContext();
+        context.set(TesseractOCRConfig.class, config); // the config reaches the parse via the ParseContext
+        String xml = getXML("testTIFF_multipage.tif",
+                getMetadata(MediaType.image("tiff")), context).xml;
+        assertNotContained("Page 2", xml); // presumably "Page 2" is text only OCR would extract -- TODO confirm
+    }
+
+    @Test
     public void testPositiveRotateOCR() throws Exception {
+        TesseractOCRParser p = new TesseractOCRParser();
+        assumeTrue(canRun());
+        assumeTrue(p.hasImageMagick());
         TesseractOCRConfig config = new TesseractOCRConfig();
-        assumeTrue(TesseractOCRParser.IMAGE_PREPROCESSOR.hasImageMagick(config));
         config.setApplyRotation(true);
         config.setResize(100);
         ParseContext parseContext = new ParseContext();
         parseContext.set(TesseractOCRConfig.class, config);
-        assumeTrue(canRun(config));
         Metadata metadata = getMetadata(MediaType.image("png"));
         String ocr = getText("testRotated+10.png", metadata, parseContext);
         assertEquals("true", metadata.get(TesseractOCRParser.IMAGE_MAGICK));
@@ -156,13 +139,14 @@ public class TesseractOCRParserTest extends TikaTest {
 
     @Test
     public void testNegativeRotateOCR() throws Exception {
+        TesseractOCRParser p = new TesseractOCRParser();
+        assumeTrue(p.hasImageMagick());
         TesseractOCRConfig config = new TesseractOCRConfig();
-        assumeTrue(TesseractOCRParser.IMAGE_PREPROCESSOR.hasImageMagick(config));
         config.setApplyRotation(true);
         config.setResize(100);
         ParseContext parseContext = new ParseContext();
         parseContext.set(TesseractOCRConfig.class, config);
-        assumeTrue(canRun(config));
+        assumeTrue(canRun());
         Metadata metadata = getMetadata(MediaType.image("png"));
         String ocr = getText("testRotated-10.png", metadata, parseContext);
         assertEquals("true", metadata.get(TesseractOCRParser.IMAGE_MAGICK));
@@ -173,14 +157,14 @@ public class TesseractOCRParserTest extends TikaTest {
 
     @Test
     public void testConfig() throws Exception {
-        try (InputStream is = getResourceAsStream("/org/apache/tika/config/TIKA-2705-tesseract.xml")) {
+        try (InputStream is = getResourceAsStream("/test-configs/TIKA-2705-tesseract.xml")) {
             TikaConfig config = new TikaConfig(is);
             Parser p = config.getParser();
             Parser tesseractOCRParser = findParser(p, org.apache.tika.parser.ocr.TesseractOCRParser.class);
             assertNotNull(tesseractOCRParser);
 
             TesseractOCRConfig tesseractOCRConfig = ((TesseractOCRParser)tesseractOCRParser).getDefaultConfig();
-            Assert.assertEquals(241, tesseractOCRConfig.getTimeout());
+            Assert.assertEquals(241, tesseractOCRConfig.getTimeoutSeconds());
             Assert.assertEquals(TesseractOCRConfig.OUTPUT_TYPE.HOCR, tesseractOCRConfig.getOutputType());
             Assert.assertEquals("ceb", tesseractOCRConfig.getLanguage());
             Assert.assertEquals(false, tesseractOCRConfig.isApplyRotation());
@@ -188,6 +172,23 @@ public class TesseractOCRParserTest extends TikaTest {
         }
     }
 
+    @Test // expects the config file's otherTesseractSettings to be exposed via getOtherTesseractConfig()
+    public void testArbitraryParams() throws Exception {
+        try (InputStream is = getResourceAsStream("/test-configs/tika-config-tesseract-arbitrary.xml")) {
+            TikaConfig config = new TikaConfig(is);
+            Parser p = config.getParser();
+            Parser tesseractOCRParser = findParser(p, org.apache.tika.parser.ocr.TesseractOCRParser.class);
+            assertNotNull(tesseractOCRParser);
+            TesseractOCRConfig tesseractOCRConfig = ((TesseractOCRParser)tesseractOCRParser).getDefaultConfig();
+            Assert.assertEquals("0.75",
+                    tesseractOCRConfig.getOtherTesseractConfig().get("textord_initialx_ile"));
+            // values are compared as strings, exactly as written in the config file
+            Assert.assertEquals("0.15625",
+                    tesseractOCRConfig.getOtherTesseractConfig().get("textord_noise_hfract"));
+        }
+    }
+
+
     //to be used to figure out a) what image media types don't have ocr coverage and
     // b) what ocr media types don't have dedicated image parsers
     //this obv requires that tesseract be installed
@@ -236,4 +237,37 @@ public class TesseractOCRParserTest extends TikaTest {
         }
     }
 
+    @Test
+    public void testTrailingSlashInPathBehavior() {
+        // expected: File.separator is appended if missing, never doubled, and "" stays ""
+        TesseractOCRParser parser = new TesseractOCRParser();
+        parser.setTesseractPath("blah");
+        assertEquals("blah"+ File.separator, parser.getTesseractPath());
+        parser.setTesseractPath("blah"+File.separator);
+        assertEquals("blah"+File.separator, parser.getTesseractPath());
+        parser.setTesseractPath("");
+        assertEquals("", parser.getTesseractPath());
+        // same three expectations for the tessdata path
+        parser.setTessdataPath("blahdata");
+        assertEquals("blahdata"+File.separator, parser.getTessdataPath());
+        parser.setTessdataPath("blahdata"+File.separator);
+        assertEquals("blahdata"+File.separator, parser.getTessdataPath());
+        parser.setTessdataPath("");
+        assertEquals("", parser.getTessdataPath());
+        // same three expectations for the ImageMagick path
+        parser.setImageMagickPath("imagemagickpath");
+        assertEquals("imagemagickpath"+File.separator, parser.getImageMagickPath());
+        parser.setImageMagickPath("imagemagickpath"+File.separator);
+        assertEquals("imagemagickpath"+File.separator, parser.getImageMagickPath());
+        parser.setImageMagickPath("");
+        assertEquals("", parser.getImageMagickPath());
+    }
+
+    @Test
+    public void testBogusPathCheck() {
+        //allow path that doesn't actually exist: the setter must not throw and still appends the separator
+        TesseractOCRParser parser = new TesseractOCRParser();
+        parser.setTesseractPath("blahdeblahblah");
+        assertEquals("blahdeblahblah"+File.separator, parser.getTesseractPath());
+    }
 }
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-properties/StringsConfig-full.properties b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/StringsConfig-full.properties
similarity index 100%
rename from tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-properties/StringsConfig-full.properties
rename to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/StringsConfig-full.properties
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-properties/StringsConfig-partial.properties b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/StringsConfig-partial.properties
similarity index 100%
rename from tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-properties/StringsConfig-partial.properties
rename to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/StringsConfig-partial.properties
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/TIKA-2705-tesseract.xml
similarity index 94%
copy from tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml
copy to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/TIKA-2705-tesseract.xml
index b5543e4..c357408 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/TIKA-2705-tesseract.xml
@@ -23,7 +23,6 @@
     <parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
       <params>
         <param name="timeout" type="int">241</param>
-<!--        <param name="tesseractPath" type="string">/myspecial/tess</param> -->
         <param name="outputType" type="string">hocr</param>
         <param name="applyRotation" type="bool">false</param>
         <param name="language" type="string">ceb</param>
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/tika-config-tesseract-arbitrary.xml
similarity index 68%
copy from tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml
copy to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/tika-config-tesseract-arbitrary.xml
index b5543e4..78a935b 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/tika-config-tesseract-arbitrary.xml
@@ -17,16 +17,13 @@
 -->
 <properties>
   <parsers>
-    <parser class="org.apache.tika.parser.DefaultParser">
-      <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
-    </parser>
     <parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
       <params>
-        <param name="timeout" type="int">241</param>
-<!--        <param name="tesseractPath" type="string">/myspecial/tess</param> -->
-        <param name="outputType" type="string">hocr</param>
-        <param name="applyRotation" type="bool">false</param>
-        <param name="language" type="string">ceb</param>
+        <!-- space delimited key-value pairs -->
+        <param name="otherTesseractSettings" type="list">
+          <string>textord_initialx_ile 0.75</string>
+          <string>textord_noise_hfract 0.15625</string>
+        </param>
       </params>
     </parser>
   </parsers>
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/tika-config-tesseract-full.xml
similarity index 63%
copy from tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml
copy to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/tika-config-tesseract-full.xml
index b5543e4..374427b 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/tika-config-tesseract-full.xml
@@ -17,16 +17,24 @@
 -->
 <properties>
   <parsers>
-    <parser class="org.apache.tika.parser.DefaultParser">
-      <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
-    </parser>
     <parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
       <params>
-        <param name="timeout" type="int">241</param>
-<!--        <param name="tesseractPath" type="string">/myspecial/tess</param> -->
+        <param name="timeout" type="int">240</param>
+        <param name="density" type="int">200</param>
+        <param name="depth" type="int">8</param>
+        <param name="pageSegMode" type="string">2</param>
+        <param name="resize" type="int">300</param>
+        <param name="minFileSizeToOcr" type="long">1</param>
+        <param name="maxFileSizeToOcr" type="long">2000000</param>
+        <param name="timeoutSeconds" type="int">240</param>
+
         <param name="outputType" type="string">hocr</param>
-        <param name="applyRotation" type="bool">false</param>
+        <param name="filter" type="string">box</param>
+        <param name="applyRotation" type="bool">true</param>
+        <param name="enableImageProcessing" type="bool">false</param>
+
         <param name="language" type="string">ceb</param>
+
       </params>
     </parser>
   </parsers>
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/tika-config-tesseract-partial.xml
similarity index 70%
rename from tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml
rename to tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/tika-config-tesseract-partial.xml
index b5543e4..33ca5c1 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-configs/tika-config-tesseract-partial.xml
@@ -17,16 +17,21 @@
 -->
 <properties>
   <parsers>
-    <parser class="org.apache.tika.parser.DefaultParser">
-      <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
-    </parser>
     <parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
       <params>
-        <param name="timeout" type="int">241</param>
-<!--        <param name="tesseractPath" type="string">/myspecial/tess</param> -->
+        <param name="timeout" type="int">240</param>
+        <param name="density" type="int">200</param>
+        <param name="depth" type="int">8</param>
+        <param name="resize" type="int">300</param>
+
+        <param name="minFileSizeToOcr" type="long">1</param>
         <param name="outputType" type="string">hocr</param>
+        <param name="filter" type="string">box</param>
         <param name="applyRotation" type="bool">false</param>
-        <param name="language" type="string">ceb</param>
+        <param name="enableImageProcessing" type="bool">false</param>
+
+        <param name="language" type="string">fra+deu</param>
+
       </params>
     </parser>
   </parsers>
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-properties/TesseractOCRConfig-full.properties b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-properties/TesseractOCRConfig-full.properties
deleted file mode 100644
index 8161abf..0000000
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-properties/TesseractOCRConfig-full.properties
+++ /dev/null
@@ -1,29 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#tesseractPath=/opt/tesseract
-#tessdataPath=/usr/local/share
-language=eng
-pageSegMode=2
-maxFileSizeToOcr=2000000
-timeout=240
-minFileSizeToOcr=1
-
-ImageMagickPath=/usr/local/bin
-density=200
-depth=8
-filter=box
-resize=300
-applyRotation=true
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-properties/TesseractOCRConfig-partial.properties b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-properties/TesseractOCRConfig-partial.properties
deleted file mode 100644
index 31a800d..0000000
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/resources/test-properties/TesseractOCRConfig-partial.properties
+++ /dev/null
@@ -1,24 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-language=fra+deu
-timeout=240
-minFileSizeToOcr=1
-
-enableImageProcessing=false
-density=200
-depth=8
-filter=box
-resize=300
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index c53cb86..e719e7a 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -23,6 +23,7 @@ import static org.junit.Assume.assumeTrue;
 import java.util.List;
 
 import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
@@ -38,20 +39,16 @@ import org.junit.Test;
 
 public class TesseractOCRParserTest extends TikaTest {
 
-    public static boolean canRun() {
-        TesseractOCRConfig config = new TesseractOCRConfig();
-        TesseractOCRParserTest tesseractOCRTest = new TesseractOCRParserTest();
-        return tesseractOCRTest.canRun(config);
+    public static boolean canRun() throws TikaConfigException {
+        TesseractOCRParser p = new TesseractOCRParser();
+        return p.hasTesseract();
     }
 
-    private boolean canRun(TesseractOCRConfig config) {
-        String[] checkCmd = {config.getTesseractPath() + TesseractOCRParser.getTesseractProg()};
-        // If Tesseract is not on the path, do not run the test.
-        return ExternalParser.check(checkCmd);
-    }
+
 
     /*
-    Check that if Tesseract is not found, the TesseractOCRParser claims to not support
+    Check that if Tesseract is told to skip OCR,
+    the TesseractOCRParser claims to not support
     any file types. So, the standard image parser is called instead.
      */
     @Test
@@ -61,11 +58,11 @@ public class TesseractOCRParserTest extends TikaTest {
         MediaType png = MediaType.image("png");
 
-        // With an invalid path, will offer no types
+        // With skipOcr set to true, will offer no types
-        TesseractOCRConfig invalidConfig = new TesseractOCRConfig();
-        invalidConfig.setTesseractPath("/made/up/path");
+        TesseractOCRConfig skipOcrConfig = new TesseractOCRConfig();
+        skipOcrConfig.setSkipOcr(true);
 
         ParseContext parseContext = new ParseContext();
-        parseContext.set(TesseractOCRConfig.class, invalidConfig);
+        parseContext.set(TesseractOCRConfig.class, skipOcrConfig);
 
         // No types offered
         Assert.assertEquals(0, parser.getSupportedTypes(parseContext).size());
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 56721eb..ee133d0 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -20,6 +20,7 @@ import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
 import org.apache.tika.TikaTest;
 import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.extractor.ContainerExtractor;
 import org.apache.tika.extractor.ParserContainerExtractor;
 import org.apache.tika.io.TikaInputStream;
@@ -54,11 +55,11 @@ public class PDFParserTest extends TikaTest {
 
     private static Boolean hasTesseract = null;
 
-    public static boolean canRunOCR() {
+    public static boolean canRunOCR() throws TikaConfigException {
         if (hasTesseract != null) {
             return hasTesseract;
         }
-        hasTesseract = new TesseractOCRParser().hasTesseract(new TesseractOCRConfig());
+        hasTesseract = new TesseractOCRParser().hasTesseract();
         return hasTesseract;
     }
 
@@ -386,6 +387,7 @@ public class PDFParserTest extends TikaTest {
             //now override the max file size to ocr, and you should get text
             ParseContext pc = new ParseContext();
             TesseractOCRConfig tesseractOCRConfig = new TesseractOCRConfig();
+            tesseractOCRConfig.setMaxFileSizeToOcr(10000000);
             pc.set(TesseractOCRConfig.class, tesseractOCRConfig);
             text = getText(getResourceAsStream("/test-documents/testOCR.pdf"), p, pc);
             assertContains("Happy", text);