You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/01/07 12:59:49 UTC

svn commit: r1650044 - /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java

Author: nick
Date: Wed Jan  7 11:59:48 2015
New Revision: 1650044

URL: http://svn.apache.org/r1650044
Log:
TIKA-1445 If Tesseract isn't available, don't offer any supported MIME types, so that the parser is not picked by DefaultParser or similar

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java?rev=1650044&r1=1650043&r2=1650044&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java Wed Jan  7 11:59:48 2015
@@ -26,6 +26,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
@@ -86,8 +87,15 @@ public class TesseractOCRParser extends
   }
 
   @Override
-  public Set<MediaType> getSupportedTypes(ParseContext arg0) {
-    return SUPPORTED_TYPES;
+  public Set<MediaType> getSupportedTypes(ParseContext context) {
+      // If Tesseract is installed, offer our supported image types
+      TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
+      if (hasTesseract(config))
+          return SUPPORTED_TYPES;
+      
+      // Otherwise don't advertise anything, so the other image parsers
+      //  can be selected instead
+      return Collections.emptySet();
   }
 
   private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
@@ -96,6 +104,11 @@ public class TesseractOCRParser extends
       env.put("TESSDATA_PREFIX", config.getTesseractPath());
     }
   }
+  
+  private boolean hasTesseract(TesseractOCRConfig config) {
+      String[] checkCmd = { config.getTesseractPath() + getTesseractProg() };
+      return ExternalParser.check(checkCmd);
+  }
 
   public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException,
       SAXException, TikaException {
@@ -130,12 +143,12 @@ public class TesseractOCRParser extends
   @Override
   public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
       throws IOException, SAXException, TikaException {
-
     TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
 
-    String[] checkCmd = { config.getTesseractPath() + getTesseractProg() };
-    // If Tesseract is not on the path, do not try to run OCR.
-    if (!ExternalParser.check(checkCmd))
+    // If Tesseract is not on the path with the current config, do not try to run OCR
+    // getSupportedTypes shouldn't have listed us as handling it, so this should only
+    //  occur if someone directly calls this parser, not via DefaultParser or similar
+    if (! hasTesseract(config))
       return;
 
     XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);