You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/01/07 12:59:49 UTC
svn commit: r1650044 -
/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
Author: nick
Date: Wed Jan 7 11:59:48 2015
New Revision: 1650044
URL: http://svn.apache.org/r1650044
Log:
TIKA-1445 If Tesseract isn't available, don't offer any supported MIME types, so that the parser is not picked by DefaultParser or similar
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java?rev=1650044&r1=1650043&r2=1650044&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java Wed Jan 7 11:59:48 2015
@@ -26,6 +26,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
+import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
@@ -86,8 +87,15 @@ public class TesseractOCRParser extends
}
@Override
- public Set<MediaType> getSupportedTypes(ParseContext arg0) {
- return SUPPORTED_TYPES;
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ // If Tesseract is installed, offer our supported image types
+ TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
+ if (hasTesseract(config))
+ return SUPPORTED_TYPES;
+
+ // Otherwise don't advertise anything, so the other image parsers
+ // can be selected instead
+ return Collections.emptySet();
}
private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
@@ -96,6 +104,11 @@ public class TesseractOCRParser extends
env.put("TESSDATA_PREFIX", config.getTesseractPath());
}
}
+
+ private boolean hasTesseract(TesseractOCRConfig config) {
+ String[] checkCmd = { config.getTesseractPath() + getTesseractProg() };
+ return ExternalParser.check(checkCmd);
+ }
public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException,
SAXException, TikaException {
@@ -130,12 +143,12 @@ public class TesseractOCRParser extends
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
-
TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
- String[] checkCmd = { config.getTesseractPath() + getTesseractProg() };
- // If Tesseract is not on the path, do not try to run OCR.
- if (!ExternalParser.check(checkCmd))
+ // If Tesseract is not on the path with the current config, do not try to run OCR
+ // getSupportedTypes shouldn't have listed us as handling it, so this should only
+ // occur if someone directly calls this parser, not via DefaultParser or similar
+ if (! hasTesseract(config))
return;
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);