You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/08/07 17:54:36 UTC

[tika] branch branch_1x updated: TIKA-2705 -- allow parameter configuration for tesseract via tika-config.xml

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new 375e3d7  TIKA-2705 -- allow parameter configuration for tesseract via tika-config.xml
375e3d7 is described below

commit 375e3d76d896656cbb5dd8c6eff6176f729336fb
Author: TALLISON <ta...@apache.org>
AuthorDate: Tue Aug 7 13:53:43 2018 -0400

    TIKA-2705 -- allow parameter configuration for tesseract via tika-config.xml
---
 .../apache/tika/parser/ocr/TesseractOCRConfig.java |  38 ++++-
 .../apache/tika/parser/ocr/TesseractOCRParser.java | 161 ++++++++++++++++-----
 .../tika/parser/ocr/TesseractOCRParserTest.java    |  33 +++++
 .../org/apache/tika/config/TIKA-2705-tesseract.xml |  33 +++++
 4 files changed, 219 insertions(+), 46 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index c6ff21c..1c65ece 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -71,10 +71,10 @@ public class TesseractOCRConfig implements Serializable {
     private String pageSegMode = "1";
 
     // Minimum file size to submit file to ocr.
-    private int minFileSizeToOcr = 0;
+    private long minFileSizeToOcr = 0;
 
     // Maximum file size to submit file to ocr.
-    private int maxFileSizeToOcr = Integer.MAX_VALUE;
+    private long maxFileSizeToOcr = Integer.MAX_VALUE;
 
     // Maximum time (seconds) to wait for the ocring process termination
     private int timeout = 120;
@@ -322,9 +322,9 @@ public class TesseractOCRConfig implements Serializable {
         return preserveInterwordSpacing;
     }
     /**
-     * @see #setMinFileSizeToOcr(int minFileSizeToOcr)
+     * @see #setMinFileSizeToOcr(long minFileSizeToOcr)
      */
-    public int getMinFileSizeToOcr() {
+    public long getMinFileSizeToOcr() {
         return minFileSizeToOcr;
     }
 
@@ -332,14 +332,14 @@ public class TesseractOCRConfig implements Serializable {
      * Set minimum file size to submit file to ocr.
      * Default is 0.
      */
-    public void setMinFileSizeToOcr(int minFileSizeToOcr) {
+    public void setMinFileSizeToOcr(long minFileSizeToOcr) {
         this.minFileSizeToOcr = minFileSizeToOcr;
     }
 
     /**
-     * @see #setMaxFileSizeToOcr(int maxFileSizeToOcr)
+     * @see #setMaxFileSizeToOcr(long maxFileSizeToOcr)
      */
-    public int getMaxFileSizeToOcr() {
+    public long getMaxFileSizeToOcr() {
         return maxFileSizeToOcr;
     }
 
@@ -347,7 +347,7 @@ public class TesseractOCRConfig implements Serializable {
      * Set maximum file size to submit file to ocr.
      * Default is Integer.MAX_VALUE.
      */
-    public void setMaxFileSizeToOcr(int maxFileSizeToOcr) {
+    public void setMaxFileSizeToOcr(long maxFileSizeToOcr) {
         this.maxFileSizeToOcr = maxFileSizeToOcr;
     }
 
@@ -630,6 +630,28 @@ public class TesseractOCRConfig implements Serializable {
      * @param defaultMissing default parameter to use.
      * @return the value.
      */
+    private long getProp(Properties properties, String property, long defaultMissing) {
+        String p = properties.getProperty(property);
+        if (p == null || p.isEmpty()) { // absent or blank: fall back to the caller's default
+            return defaultMissing;
+        }
+        try {
+            return Long.parseLong(p); // long-valued settings (e.g. file sizes) may exceed Integer.MAX_VALUE
+        } catch (NumberFormatException ex) {
+            throw new RuntimeException(String.format(Locale.ROOT, "Cannot parse TesseractOCRConfig variable %s, invalid integer value",
+                    property), ex);
+        }
+    }
+
+
+    /**
+     * Get property from the properties file passed in.
+     *
+     * @param properties     properties file to read from.
+     * @param property       the property to fetch.
+     * @param defaultMissing default parameter to use.
+     * @return the value.
+     */
     private String getProp(Properties properties, String property, String defaultMissing) {
         return properties.getProperty(property, defaultMissing);
     }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index d3238c2..df46f00 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -16,46 +16,13 @@
  */
 package org.apache.tika.parser.ocr;
 
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-import javax.imageio.ImageIO;
-import javax.xml.parsers.SAXParser;
-import java.awt.Image;
-import java.awt.image.BufferedImage;
-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.OutputStreamWriter;
-import java.io.Reader;
-import java.nio.charset.Charset;
-import java.nio.file.Files;
-import java.nio.file.Paths;
-import java.nio.file.StandardCopyOption;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Set;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ExecutionException;
-import java.util.concurrent.FutureTask;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.TimeoutException;
-
 import org.apache.commons.exec.CommandLine;
 import org.apache.commons.exec.DefaultExecutor;
 import org.apache.commons.exec.PumpStreamHandler;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang.SystemUtils;
+import org.apache.tika.config.Field;
 import org.apache.tika.config.Initializable;
 import org.apache.tika.config.InitializableProblemHandler;
 import org.apache.tika.config.Param;
@@ -84,6 +51,39 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
+import javax.imageio.ImageIO;
+import java.awt.Image;
+import java.awt.image.BufferedImage;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.nio.file.StandardCopyOption;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.FutureTask;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 /**
  * TesseractOCRParser powered by tesseract-ocr engine. To enable this parser,
  * create a {@link TesseractOCRConfig} object and pass it through a
@@ -106,13 +106,14 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
 
 
     private static final long serialVersionUID = -8167538283213097265L;
-    private static final TesseractOCRConfig DEFAULT_CONFIG = new TesseractOCRConfig();
     private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
             new HashSet<>(Arrays.asList(new MediaType[]{
                     MediaType.image("png"), MediaType.image("jpeg"), MediaType.image("tiff"),
                     MediaType.image("bmp"), MediaType.image("gif"), MediaType.image("jp2"),
                     MediaType.image("jpx"), MediaType.image("x-portable-pixmap")
             })));
+    private final TesseractOCRConfig defaultConfig = new TesseractOCRConfig();
+
     private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<>();
     private static Map<String,Boolean> IMAGE_MAGICK_PRESENT = new HashMap<>();
 
@@ -120,7 +121,7 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
     @Override
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         // If Tesseract is installed, offer our supported image types
-        TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
+        TesseractOCRConfig config = context.get(TesseractOCRConfig.class, defaultConfig);
         if (hasTesseract(config)) {
             return SUPPORTED_TYPES;
         }
@@ -258,7 +259,7 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
     @Override
     public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext parseContext)
             throws IOException, SAXException, TikaException {
-        TesseractOCRConfig config = parseContext.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
+        TesseractOCRConfig config = parseContext.get(TesseractOCRConfig.class, defaultConfig);
         // If Tesseract is not on the path with the current config, do not try to run OCR
         // getSupportedTypes shouldn't have listed us as handling it, so this should only
         //  occur if someone directly calls this parser, not via DefaultParser or similar
@@ -471,7 +472,7 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
         //by sending in a bogus tesseract path via a custom TesseractOCRConfig.
         //TODO: figure out how to solve that.
         if (! hasWarned()) {
-            if (hasTesseract(DEFAULT_CONFIG)) {
+            if (hasTesseract(defaultConfig)) {
                 problemHandler.handleInitializableProblem(this.getClass().getName(),
                         "Tesseract OCR is installed and will be automatically applied to image files unless\n" +
                                 "you've excluded the TesseractOCRParser from the default parser.\n"+
@@ -696,5 +697,89 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
     protected void warn() {
         HAS_WARNED = true;
     }
+    // TIKA-2705: each @Field setter below forwards its value onto this parser instance's defaultConfig.
+    @Field
+    public void setTesseractPath(String tesseractPath) {
+        defaultConfig.setTesseractPath(tesseractPath);
+    }
+
+    @Field
+    public void setTessdataPath(String tessdataPath) {
+        defaultConfig.setTessdataPath(tessdataPath);
+    }
+
+    @Field
+    public void setLanguage(String language) {
+        defaultConfig.setLanguage(language);
+    }
+
+    @Field
+    public void setPageSegMode(String pageSegMode) {
+        defaultConfig.setPageSegMode(pageSegMode);
+    }
+
+    @Field
+    public void setMinFileSizeToOcr(long minFileSizeToOcr) {
+        defaultConfig.setMinFileSizeToOcr(minFileSizeToOcr);
+    }
+    // NOTE(review): maxFileSizeToOcr has no @Field setter alongside setMinFileSizeToOcr above -- confirm intended.
+    @Field
+    public void setTimeout(int timeout) {
+        defaultConfig.setTimeout(timeout);
+    }
+
+    @Field
+    public void setOutputType(String outputType) {
+        defaultConfig.setOutputType(outputType);
+    }
+
+    @Field
+    public void setPreserveInterwordSpacing(boolean preserveInterwordSpacing) {
+        defaultConfig.setPreserveInterwordSpacing(preserveInterwordSpacing);
+    }
+
+    @Field
+    public void setEnableImageProcessing(int enableImageProcessing) {
+        defaultConfig.setEnableImageProcessing(enableImageProcessing);
+    }
+
+    @Field
+    public void setImageMagickPath(String imageMagickPath) {
+        defaultConfig.setImageMagickPath(imageMagickPath);
+    }
+
+    @Field
+    public void setDensity(int density) {
+        defaultConfig.setDensity(density);
+    }
+
+    @Field
+    public void setDepth(int depth) {
+        defaultConfig.setDepth(depth);
+    }
+
+    @Field
+    public void setColorspace(String colorspace) {
+        defaultConfig.setColorspace(colorspace);
+    }
+
+    @Field
+    public void setFilter(String filter) {
+        defaultConfig.setFilter(filter);
+    }
+
+    @Field
+    public void setResize(int resize) {
+        defaultConfig.setResize(resize);
+    }
+
+    @Field
+    public void setApplyRotation(boolean applyRotation) {
+        defaultConfig.setApplyRotation(applyRotation);
+    }
+
+    public TesseractOCRConfig getDefaultConfig() {
+        return defaultConfig; // the parser's own config object is returned directly, not a copy
+    }
 }
 
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 039adf7..aea2c96 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -18,6 +18,7 @@ package org.apache.tika.parser.ocr;
 
 import static org.apache.tika.parser.ocr.TesseractOCRParser.getTesseractProg;
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assume.assumeTrue;
 
@@ -26,10 +27,13 @@ import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import org.apache.tika.Tika;
 import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.CompositeParser;
 import org.apache.tika.parser.DefaultParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
@@ -303,4 +307,33 @@ public class TesseractOCRParserTest extends TikaTest {
             assertContains("Its had resolving otherwise she contented therefore", ocr);
         }
     }
+
+    @Test
+    public void testConfig() throws Exception { // TIKA-2705: <param> values must reach the parser's default config
+        TikaConfig config = new TikaConfig(getResourceAsStream("/org/apache/tika/config/TIKA-2705-tesseract.xml"));
+        Parser p = config.getParser();
+        Parser tesseractOCRParser = findParser(p, org.apache.tika.parser.ocr.TesseractOCRParser.class);
+        assertNotNull(tesseractOCRParser);
+
+        TesseractOCRConfig tesseractOCRConfig = ((TesseractOCRParser)tesseractOCRParser).getDefaultConfig();
+        assertEquals(241, tesseractOCRConfig.getTimeout()); // expected values mirror the <param> entries in the XML
+        assertEquals(TesseractOCRConfig.OUTPUT_TYPE.HOCR, tesseractOCRConfig.getOutputType());
+        assertEquals("ceb", tesseractOCRConfig.getLanguage());
+        assertEquals(false, tesseractOCRConfig.getApplyRotation());
+        assertContains("myspecial", tesseractOCRConfig.getTesseractPath());
+    }
+
+    private Parser findParser(Parser parser, Class<?> clazz) { // depth-first search; null when nothing matches
+        if (parser instanceof CompositeParser) { // composites are only descended into, never returned themselves
+            for (Parser child : ((CompositeParser)parser).getAllComponentParsers()) {
+                Parser found = findParser(child, clazz);
+                if (found != null) {
+                    return found; // first match wins
+                }
+            }
+        } else if (clazz.isInstance(parser)) {
+            return parser;
+        }
+        return null;
+    }
 }
diff --git a/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml b/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml
new file mode 100644
index 0000000..c77d7e4
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-2705-tesseract.xml
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.DefaultParser">
+      <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
+    </parser>
+    <parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
+      <params>
+        <param name="timeout" type="int">241</param>
+        <param name="tesseractPath" type="string">/myspecial/tess</param>
+        <param name="outputType" type="string">hocr</param>
+        <param name="applyRotation" type="bool">false</param>
+        <param name="language" type="string">ceb</param>
+      </params>
+    </parser>
+  </parsers>
+</properties>