You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by tp...@apache.org on 2015/01/07 18:29:01 UTC
svn commit: r1650121 -
/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
Author: tpalsulich
Date: Wed Jan 7 17:29:01 2015
New Revision: 1650121
URL: http://svn.apache.org/r1650121
Log:
Fix indenting in TesseractOCRParser.
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java?rev=1650121&r1=1650120&r2=1650121&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java Wed Jan 7 17:29:01 2015
@@ -71,267 +71,267 @@ import org.xml.sax.SAXException;
* config.setTesseractPath(tesseractFolder);<br>
* parseContext.set(TesseractOCRConfig.class, config);<br>
* </p>
- *
- *
+ *
+ *
*/
public class TesseractOCRParser extends AbstractParser {
- private static final long serialVersionUID = -8167538283213097265L;
- private static final TesseractOCRConfig DEFAULT_CONFIG = new TesseractOCRConfig();
- private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
- new HashSet<MediaType>(Arrays.asList(new MediaType[] {
- MediaType.image("png"), MediaType.image("jpeg"), MediaType.image("tiff"),
- MediaType.image("x-ms-bmp"), MediaType.image("gif")
- })));
- private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<String, Boolean>();
-
- @Override
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- // If Tesseract is installed, offer our supported image types
- TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
- if (hasTesseract(config))
- return SUPPORTED_TYPES;
-
- // Otherwise don't advertise anything, so the other image parsers
- // can be selected instead
- return Collections.emptySet();
- }
-
- private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
- if (!config.getTesseractPath().isEmpty()) {
- Map<String, String> env = pb.environment();
- env.put("TESSDATA_PREFIX", config.getTesseractPath());
+ private static final long serialVersionUID = -8167538283213097265L;
+ private static final TesseractOCRConfig DEFAULT_CONFIG = new TesseractOCRConfig();
+ private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
+ new HashSet<MediaType>(Arrays.asList(new MediaType[] {
+ MediaType.image("png"), MediaType.image("jpeg"), MediaType.image("tiff"),
+ MediaType.image("x-ms-bmp"), MediaType.image("gif")
+ })));
+ private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<String, Boolean>();
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ // If Tesseract is installed, offer our supported image types
+ TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
+ if (hasTesseract(config))
+ return SUPPORTED_TYPES;
+
+ // Otherwise don't advertise anything, so the other image parsers
+ // can be selected instead
+ return Collections.emptySet();
}
- }
-
- private boolean hasTesseract(TesseractOCRConfig config) {
- // Fetch where the config says to find Tesseract
- String tesseract = config.getTesseractPath() + getTesseractProg();
-
- // Have we already checked for a copy of Tesseract there?
- if (TESSERACT_PRESENT.containsKey(tesseract)) {
- return TESSERACT_PRESENT.get(tesseract);
- }
-
- // Try running Tesseract from there, and see if it exists + works
- String[] checkCmd = { tesseract };
- try {
- boolean hasTesseract = ExternalParser.check(checkCmd);
- TESSERACT_PRESENT.put(tesseract, hasTesseract);
- return hasTesseract;
- } catch (NoClassDefFoundError e) {
- // This happens under OSGi + Fork Parser - see TIKA-1507
- // As a workaround for now, just say we can't use OCR
- // TODO Resolve it so we don't need this try/catch block
- TESSERACT_PRESENT.put(tesseract, false);
- return false;
- }
- }
-
- public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException,
- SAXException, TikaException {
-
- TemporaryResources tmp = new TemporaryResources();
- FileOutputStream fos = null;
- TikaInputStream tis = null;
- try {
- int w = image.getWidth(null);
- int h = image.getHeight(null);
- BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);
- Graphics2D g2 = bImage.createGraphics();
- g2.drawImage(image, 0, 0, null);
- g2.dispose();
- File file = tmp.createTemporaryFile();
- fos = new FileOutputStream(file);
- ImageIO.write(bImage, "png", fos);
- bImage = null;
- tis = TikaInputStream.get(file);
- parse(tis, handler, metadata, context);
-
- } finally {
- tmp.dispose();
- if (tis != null)
- tis.close();
- if (fos != null)
- fos.close();
+
+ private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
+ if (!config.getTesseractPath().isEmpty()) {
+ Map<String, String> env = pb.environment();
+ env.put("TESSDATA_PREFIX", config.getTesseractPath());
+ }
}
- }
+ private boolean hasTesseract(TesseractOCRConfig config) {
+ // Fetch where the config says to find Tesseract
+ String tesseract = config.getTesseractPath() + getTesseractProg();
+
+ // Have we already checked for a copy of Tesseract there?
+ if (TESSERACT_PRESENT.containsKey(tesseract)) {
+ return TESSERACT_PRESENT.get(tesseract);
+ }
- @Override
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
-
- // If Tesseract is not on the path with the current config, do not try to run OCR
- // getSupportedTypes shouldn't have listed us as handling it, so this should only
- // occur if someone directly calls this parser, not via DefaultParser or similar
- if (! hasTesseract(config))
- return;
-
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-
- TemporaryResources tmp = new TemporaryResources();
- File output = null;
- try {
- TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
- File input = tikaStream.getFile();
- long size = tikaStream.getLength();
-
- if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) {
-
- output = tmp.createTemporaryFile();
- doOCR(input, output, config);
-
- // Tesseract appends .txt to output file name
- output = new File(output.getAbsolutePath() + ".txt");
-
- if (output.exists())
- extractOutput(new FileInputStream(output), xhtml);
-
- }
-
- // Temporary workaround for TIKA-1445 - until we can specify
- // composite parsers with strategies (eg Composite, Try In Turn),
- // always send the image onwards to the regular parser to have
- // the metadata for them extracted as well
- _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, handler, metadata, context);
- } finally {
- tmp.dispose();
- if (output != null) {
- output.delete();
- }
+ // Try running Tesseract from there, and see if it exists + works
+ String[] checkCmd = { tesseract };
+ try {
+ boolean hasTesseract = ExternalParser.check(checkCmd);
+ TESSERACT_PRESENT.put(tesseract, hasTesseract);
+ return hasTesseract;
+ } catch (NoClassDefFoundError e) {
+ // This happens under OSGi + Fork Parser - see TIKA-1507
+ // As a workaround for now, just say we can't use OCR
+ // TODO Resolve it so we don't need this try/catch block
+ TESSERACT_PRESENT.put(tesseract, false);
+ return false;
+ }
}
- }
- // TIKA-1445 workaround parser
- private static Parser _TMP_IMAGE_METADATA_PARSER = new CompositeImageParser();
- private static class CompositeImageParser extends CompositeParser {
- private static final long serialVersionUID = -2398203346206381382L;
- private static List<Parser> imageParsers = Arrays.asList(new Parser[]{
- new ImageParser(), new JpegParser(), new TiffParser()
- });
- CompositeImageParser() {
- super(new MediaTypeRegistry(), imageParsers);
- }
- }
-
- /**
- * Run external tesseract-ocr process.
- *
- * @param input
- * File to be ocred
- * @param output
- * File to collect ocr result
- * @param config
- * Configuration of tesseract-ocr engine
- * @throws TikaException
- * if the extraction timed out
- * @throws IOException
- * if an input error occurred
- */
- private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
- String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
- config.getLanguage(), "-psm", config.getPageSegMode() };
-
- ProcessBuilder pb = new ProcessBuilder(cmd);
- setEnv(config, pb);
- final Process process = pb.start();
-
- process.getOutputStream().close();
- InputStream out = process.getInputStream();
- InputStream err = process.getErrorStream();
-
- logStream("OCR MSG", out, input);
- logStream("OCR ERROR", err, input);
-
- FutureTask<Integer> waitTask = new FutureTask<Integer>(new Callable<Integer>() {
- public Integer call() throws Exception {
- return process.waitFor();
- }
- });
-
- Thread waitThread = new Thread(waitTask);
- waitThread.start();
-
- try {
- waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
-
- } catch (InterruptedException e) {
- waitThread.interrupt();
- process.destroy();
- Thread.currentThread().interrupt();
- throw new TikaException("TesseractOCRParser interrupted", e);
-
- } catch (ExecutionException e) {
- // should not be thrown
-
- } catch (TimeoutException e) {
- waitThread.interrupt();
- process.destroy();
- throw new TikaException("TesseractOCRParser timeout", e);
+
+ public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException,
+ SAXException, TikaException {
+
+ TemporaryResources tmp = new TemporaryResources();
+ FileOutputStream fos = null;
+ TikaInputStream tis = null;
+ try {
+ int w = image.getWidth(null);
+ int h = image.getHeight(null);
+ BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);
+ Graphics2D g2 = bImage.createGraphics();
+ g2.drawImage(image, 0, 0, null);
+ g2.dispose();
+ File file = tmp.createTemporaryFile();
+ fos = new FileOutputStream(file);
+ ImageIO.write(bImage, "png", fos);
+ bImage = null;
+ tis = TikaInputStream.get(file);
+ parse(tis, handler, metadata, context);
+
+ } finally {
+ tmp.dispose();
+ if (tis != null)
+ tis.close();
+ if (fos != null)
+ fos.close();
+ }
+
}
- }
+ @Override
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
+
+ // If Tesseract is not on the path with the current config, do not try to run OCR
+ // getSupportedTypes shouldn't have listed us as handling it, so this should only
+ // occur if someone directly calls this parser, not via DefaultParser or similar
+ if (! hasTesseract(config))
+ return;
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
+ TemporaryResources tmp = new TemporaryResources();
+ File output = null;
+ try {
+ TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
+ File input = tikaStream.getFile();
+ long size = tikaStream.getLength();
+
+ if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) {
+
+ output = tmp.createTemporaryFile();
+ doOCR(input, output, config);
+
+ // Tesseract appends .txt to output file name
+ output = new File(output.getAbsolutePath() + ".txt");
+
+ if (output.exists())
+ extractOutput(new FileInputStream(output), xhtml);
- /**
- * Reads the contents of the given stream and write it to the given XHTML
- * content handler. The stream is closed once fully processed.
- *
- * @param stream
- * Stream where is the result of ocr
- * @param xhtml
- * XHTML content handler
- * @throws SAXException
- * if the XHTML SAX events could not be handled
- * @throws IOException
- * if an input error occurred
- */
- private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {
-
- Reader reader = new InputStreamReader(stream, "UTF-8");
- xhtml.startDocument();
- xhtml.startElement("div");
- try {
- char[] buffer = new char[1024];
- for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
- if (n > 0)
- xhtml.characters(buffer, 0, n);
- }
- } finally {
- reader.close();
+ }
+
+ // Temporary workaround for TIKA-1445 - until we can specify
+ // composite parsers with strategies (eg Composite, Try In Turn),
+ // always send the image onwards to the regular parser to have
+ // the metadata for them extracted as well
+ _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, handler, metadata, context);
+ } finally {
+ tmp.dispose();
+ if (output != null) {
+ output.delete();
+ }
+ }
+ }
+ // TIKA-1445 workaround parser
+ private static Parser _TMP_IMAGE_METADATA_PARSER = new CompositeImageParser();
+ private static class CompositeImageParser extends CompositeParser {
+ private static final long serialVersionUID = -2398203346206381382L;
+ private static List<Parser> imageParsers = Arrays.asList(new Parser[]{
+ new ImageParser(), new JpegParser(), new TiffParser()
+ });
+ CompositeImageParser() {
+ super(new MediaTypeRegistry(), imageParsers);
+ }
}
- xhtml.endElement("div");
- xhtml.endDocument();
- }
-
- /**
- * Starts a thread that reads the contents of the standard output or error
- * stream of the given process to not block the process. The stream is closed
- * once fully processed.
- */
- private void logStream(final String logType, final InputStream stream, final File file) {
- new Thread() {
- public void run() {
- Reader reader = new InputStreamReader(stream);
- StringBuilder out = new StringBuilder();
- char[] buffer = new char[1024];
+
+ /**
+ * Run external tesseract-ocr process.
+ *
+ * @param input
+ * File to be ocred
+ * @param output
+ * File to collect ocr result
+ * @param config
+ * Configuration of tesseract-ocr engine
+ * @throws TikaException
+ * if the extraction timed out
+ * @throws IOException
+ * if an input error occurred
+ */
+ private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
+ String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
+ config.getLanguage(), "-psm", config.getPageSegMode() };
+
+ ProcessBuilder pb = new ProcessBuilder(cmd);
+ setEnv(config, pb);
+ final Process process = pb.start();
+
+ process.getOutputStream().close();
+ InputStream out = process.getInputStream();
+ InputStream err = process.getErrorStream();
+
+ logStream("OCR MSG", out, input);
+ logStream("OCR ERROR", err, input);
+
+ FutureTask<Integer> waitTask = new FutureTask<Integer>(new Callable<Integer>() {
+ public Integer call() throws Exception {
+ return process.waitFor();
+ }
+ });
+
+ Thread waitThread = new Thread(waitTask);
+ waitThread.start();
+
try {
- for (int n = reader.read(buffer); n != -1; n = reader.read(buffer))
- out.append(buffer, 0, n);
- } catch (IOException e) {
+ waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
+
+ } catch (InterruptedException e) {
+ waitThread.interrupt();
+ process.destroy();
+ Thread.currentThread().interrupt();
+ throw new TikaException("TesseractOCRParser interrupted", e);
+
+ } catch (ExecutionException e) {
+ // should not be thrown
+
+ } catch (TimeoutException e) {
+ waitThread.interrupt();
+ process.destroy();
+ throw new TikaException("TesseractOCRParser timeout", e);
+ }
+
+ }
+ /**
+ * Reads the contents of the given stream and write it to the given XHTML
+ * content handler. The stream is closed once fully processed.
+ *
+ * @param stream
+ * Stream where is the result of ocr
+ * @param xhtml
+ * XHTML content handler
+ * @throws SAXException
+ * if the XHTML SAX events could not be handled
+ * @throws IOException
+ * if an input error occurred
+ */
+ private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {
+
+ Reader reader = new InputStreamReader(stream, "UTF-8");
+ xhtml.startDocument();
+ xhtml.startElement("div");
+ try {
+ char[] buffer = new char[1024];
+ for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
+ if (n > 0)
+ xhtml.characters(buffer, 0, n);
+ }
} finally {
- IOUtils.closeQuietly(stream);
+ reader.close();
}
+ xhtml.endElement("div");
+ xhtml.endDocument();
+ }
- String msg = out.toString();
- // log or discard message?
+ /**
+ * Starts a thread that reads the contents of the standard output or error
+ * stream of the given process to not block the process. The stream is closed
+ * once fully processed.
+ */
+ private void logStream(final String logType, final InputStream stream, final File file) {
+ new Thread() {
+ public void run() {
+ Reader reader = new InputStreamReader(stream);
+ StringBuilder out = new StringBuilder();
+ char[] buffer = new char[1024];
+ try {
+ for (int n = reader.read(buffer); n != -1; n = reader.read(buffer))
+ out.append(buffer, 0, n);
+ } catch (IOException e) {
+
+ } finally {
+ IOUtils.closeQuietly(stream);
+ }
- }
- }.start();
- }
-
- static String getTesseractProg() {
- return System.getProperty("os.name").startsWith("Windows") ? "tesseract.exe" : "tesseract";
- }
+ String msg = out.toString();
+ // log or discard message?
+
+ }
+ }.start();
+ }
+
+ static String getTesseractProg() {
+ return System.getProperty("os.name").startsWith("Windows") ? "tesseract.exe" : "tesseract";
+ }
}