You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/05/18 10:54:51 UTC

[tika] branch master updated: TIKA-2359: Alert user that tesseract is available and will be used.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

The following commit(s) were added to refs/heads/master by this push:
       new  ebc87ae   TIKA-2359: Alert user that tesseract is available and will be used.
ebc87ae is described below

commit ebc87aec539eef752072e95315daee65f7f42ebb
Author: tballison <ta...@mitre.org>
AuthorDate: Thu May 18 06:54:41 2017 -0400

    TIKA-2359: Alert user that tesseract is available and will be used.
---
 .../org/apache/tika/parser/ocr/TesseractOCRParser.java     | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 9728b38..121e096 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -101,16 +101,24 @@ public class TesseractOCRParser extends AbstractParser {
                     MediaType.image("jpx"), MediaType.image("x-portable-pixmap")
             })));
     private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<>();
-
+    private static volatile boolean HAS_ALERTED = false;
 
 
     @Override
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         // If Tesseract is installed, offer our supported image types
         TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
-        if (hasTesseract(config))
+        if (hasTesseract(config)) {
+            if (! HAS_ALERTED) {
+                LOG.info("Tesseract OCR is installed and will be automatically applied to image files.\n"+
+                        "This may dramatically slow down content extraction (TIKA-2359).\n"+
+                        "As of Tika 1.15 (and prior versions), Tesseract is automatically called.\n"+
+                        "In future versions of Tika, users may need to turn the TesseractOCRParser on via TikaConfig."
+                );
+                HAS_ALERTED = true;
+            }
             return SUPPORTED_TYPES;
-
+        }
         // Otherwise don't advertise anything, so the other image parsers
         //  can be selected instead
         return Collections.emptySet();

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.