You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by tp...@apache.org on 2015/01/07 18:29:01 UTC

svn commit: r1650121 - /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java

Author: tpalsulich
Date: Wed Jan  7 17:29:01 2015
New Revision: 1650121

URL: http://svn.apache.org/r1650121
Log:
Fix indenting in TesseractOCRParser.

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java?rev=1650121&r1=1650120&r2=1650121&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java Wed Jan  7 17:29:01 2015
@@ -71,267 +71,267 @@ import org.xml.sax.SAXException;
  * config.setTesseractPath(tesseractFolder);<br>
  * parseContext.set(TesseractOCRConfig.class, config);<br>
  * </p>
- * 
- * 
+ *
+ *
  */
 public class TesseractOCRParser extends AbstractParser {
-  private static final long serialVersionUID = -8167538283213097265L;
-  private static final TesseractOCRConfig DEFAULT_CONFIG = new TesseractOCRConfig();
-  private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
-          new HashSet<MediaType>(Arrays.asList(new MediaType[] {
-              MediaType.image("png"), MediaType.image("jpeg"), MediaType.image("tiff"),
-              MediaType.image("x-ms-bmp"), MediaType.image("gif")
-  })));
-  private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<String, Boolean>();
-
-  @Override
-  public Set<MediaType> getSupportedTypes(ParseContext context) {
-      // If Tesseract is installed, offer our supported image types
-      TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
-      if (hasTesseract(config))
-          return SUPPORTED_TYPES;
-      
-      // Otherwise don't advertise anything, so the other image parsers
-      //  can be selected instead
-      return Collections.emptySet();
-  }
-
-  private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
-    if (!config.getTesseractPath().isEmpty()) {
-      Map<String, String> env = pb.environment();
-      env.put("TESSDATA_PREFIX", config.getTesseractPath());
+    private static final long serialVersionUID = -8167538283213097265L;
+    private static final TesseractOCRConfig DEFAULT_CONFIG = new TesseractOCRConfig();
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
+            new HashSet<MediaType>(Arrays.asList(new MediaType[] {
+                    MediaType.image("png"), MediaType.image("jpeg"), MediaType.image("tiff"),
+                    MediaType.image("x-ms-bmp"), MediaType.image("gif")
+            })));
+    private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<String, Boolean>();
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        // If Tesseract is installed, offer our supported image types
+        TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
+        if (hasTesseract(config))
+            return SUPPORTED_TYPES;
+
+        // Otherwise don't advertise anything, so the other image parsers
+        //  can be selected instead
+        return Collections.emptySet();
     }
-  }
-  
-  private boolean hasTesseract(TesseractOCRConfig config) {
-      // Fetch where the config says to find Tesseract
-      String tesseract = config.getTesseractPath() + getTesseractProg();
-      
-      // Have we already checked for a copy of Tesseract there?
-      if (TESSERACT_PRESENT.containsKey(tesseract)) {
-          return TESSERACT_PRESENT.get(tesseract);
-      }
-      
-      // Try running Tesseract from there, and see if it exists + works
-      String[] checkCmd = { tesseract };
-      try {
-          boolean hasTesseract = ExternalParser.check(checkCmd);
-          TESSERACT_PRESENT.put(tesseract, hasTesseract);
-          return hasTesseract;
-      } catch (NoClassDefFoundError e) {
-          // This happens under OSGi + Fork Parser - see TIKA-1507
-          // As a workaround for now, just say we can't use OCR
-          // TODO Resolve it so we don't need this try/catch block
-          TESSERACT_PRESENT.put(tesseract, false);
-          return false;
-      }
-  }
-
-  public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException,
-      SAXException, TikaException {
-
-    TemporaryResources tmp = new TemporaryResources();
-    FileOutputStream fos = null;
-    TikaInputStream tis = null;
-    try {
-      int w = image.getWidth(null);
-      int h = image.getHeight(null);
-      BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);
-      Graphics2D g2 = bImage.createGraphics();
-      g2.drawImage(image, 0, 0, null);
-      g2.dispose();
-      File file = tmp.createTemporaryFile();
-      fos = new FileOutputStream(file);
-      ImageIO.write(bImage, "png", fos);
-      bImage = null;
-      tis = TikaInputStream.get(file);
-      parse(tis, handler, metadata, context);
-
-    } finally {
-      tmp.dispose();
-      if (tis != null)
-        tis.close();
-      if (fos != null)
-        fos.close();
+
+    private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
+        if (!config.getTesseractPath().isEmpty()) {
+            Map<String, String> env = pb.environment();
+            env.put("TESSDATA_PREFIX", config.getTesseractPath());
+        }
     }
 
-  }
+    private boolean hasTesseract(TesseractOCRConfig config) {
+        // Fetch where the config says to find Tesseract
+        String tesseract = config.getTesseractPath() + getTesseractProg();
+
+        // Have we already checked for a copy of Tesseract there?
+        if (TESSERACT_PRESENT.containsKey(tesseract)) {
+            return TESSERACT_PRESENT.get(tesseract);
+        }
 
-  @Override
-  public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
-      throws IOException, SAXException, TikaException {
-    TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
-
-    // If Tesseract is not on the path with the current config, do not try to run OCR
-    // getSupportedTypes shouldn't have listed us as handling it, so this should only
-    //  occur if someone directly calls this parser, not via DefaultParser or similar
-    if (! hasTesseract(config))
-      return;
-
-    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-
-    TemporaryResources tmp = new TemporaryResources();
-    File output = null;
-    try {
-      TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
-      File input = tikaStream.getFile();
-      long size = tikaStream.getLength();
-
-      if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) {
-
-        output = tmp.createTemporaryFile();
-        doOCR(input, output, config);
-
-        // Tesseract appends .txt to output file name
-        output = new File(output.getAbsolutePath() + ".txt");
-
-        if (output.exists())
-          extractOutput(new FileInputStream(output), xhtml);
-
-      }
-
-      // Temporary workaround for TIKA-1445 - until we can specify
-      //  composite parsers with strategies (eg Composite, Try In Turn),
-      //  always send the image onwards to the regular parser to have
-      //  the metadata for them extracted as well
-      _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, handler, metadata, context);
-    } finally {
-      tmp.dispose();
-      if (output != null) {
-        output.delete();
-      }
+        // Try running Tesseract from there, and see if it exists + works
+        String[] checkCmd = { tesseract };
+        try {
+            boolean hasTesseract = ExternalParser.check(checkCmd);
+            TESSERACT_PRESENT.put(tesseract, hasTesseract);
+            return hasTesseract;
+        } catch (NoClassDefFoundError e) {
+            // This happens under OSGi + Fork Parser - see TIKA-1507
+            // As a workaround for now, just say we can't use OCR
+            // TODO Resolve it so we don't need this try/catch block
+            TESSERACT_PRESENT.put(tesseract, false);
+            return false;
+        }
     }
-  }
-  // TIKA-1445 workaround parser
-  private static Parser _TMP_IMAGE_METADATA_PARSER = new CompositeImageParser();
-  private static class CompositeImageParser extends CompositeParser {
-      private static final long serialVersionUID = -2398203346206381382L;
-      private static List<Parser> imageParsers = Arrays.asList(new Parser[]{
-          new ImageParser(), new JpegParser(), new TiffParser()
-      });
-      CompositeImageParser() {
-          super(new MediaTypeRegistry(), imageParsers);
-      }
-  }
-
-  /**
-   * Run external tesseract-ocr process.
-   * 
-   * @param input
-   *          File to be ocred
-   * @param output
-   *          File to collect ocr result
-   * @param config
-   *          Configuration of tesseract-ocr engine
-   * @throws TikaException
-   *           if the extraction timed out
-   * @throws IOException
-   *           if an input error occurred
-   */
-  private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
-    String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
-        config.getLanguage(), "-psm", config.getPageSegMode() };
-
-    ProcessBuilder pb = new ProcessBuilder(cmd);
-    setEnv(config, pb);
-    final Process process = pb.start();
-
-    process.getOutputStream().close();
-    InputStream out = process.getInputStream();
-    InputStream err = process.getErrorStream();
-
-    logStream("OCR MSG", out, input);
-    logStream("OCR ERROR", err, input);
-
-    FutureTask<Integer> waitTask = new FutureTask<Integer>(new Callable<Integer>() {
-      public Integer call() throws Exception {
-        return process.waitFor();
-      }
-    });
-
-    Thread waitThread = new Thread(waitTask);
-    waitThread.start();
-
-    try {
-      waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
-
-    } catch (InterruptedException e) {
-      waitThread.interrupt();
-      process.destroy();
-      Thread.currentThread().interrupt();
-      throw new TikaException("TesseractOCRParser interrupted", e);
-
-    } catch (ExecutionException e) {
-      // should not be thrown
-
-    } catch (TimeoutException e) {
-      waitThread.interrupt();
-      process.destroy();
-      throw new TikaException("TesseractOCRParser timeout", e);
+
+    public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+
+        TemporaryResources tmp = new TemporaryResources();
+        FileOutputStream fos = null;
+        TikaInputStream tis = null;
+        try {
+            int w = image.getWidth(null);
+            int h = image.getHeight(null);
+            BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);
+            Graphics2D g2 = bImage.createGraphics();
+            g2.drawImage(image, 0, 0, null);
+            g2.dispose();
+            File file = tmp.createTemporaryFile();
+            fos = new FileOutputStream(file);
+            ImageIO.write(bImage, "png", fos);
+            bImage = null;
+            tis = TikaInputStream.get(file);
+            parse(tis, handler, metadata, context);
+
+        } finally {
+            tmp.dispose();
+            if (tis != null)
+                tis.close();
+            if (fos != null)
+                fos.close();
+        }
+
     }
 
-  }
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
+
+        // If Tesseract is not on the path with the current config, do not try to run OCR
+        // getSupportedTypes shouldn't have listed us as handling it, so this should only
+        //  occur if someone directly calls this parser, not via DefaultParser or similar
+        if (! hasTesseract(config))
+            return;
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
+        TemporaryResources tmp = new TemporaryResources();
+        File output = null;
+        try {
+            TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
+            File input = tikaStream.getFile();
+            long size = tikaStream.getLength();
+
+            if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) {
+
+                output = tmp.createTemporaryFile();
+                doOCR(input, output, config);
+
+                // Tesseract appends .txt to output file name
+                output = new File(output.getAbsolutePath() + ".txt");
+
+                if (output.exists())
+                    extractOutput(new FileInputStream(output), xhtml);
 
-  /**
-   * Reads the contents of the given stream and write it to the given XHTML
-   * content handler. The stream is closed once fully processed.
-   * 
-   * @param stream
-   *          Stream where is the result of ocr
-   * @param xhtml
-   *          XHTML content handler
-   * @throws SAXException
-   *           if the XHTML SAX events could not be handled
-   * @throws IOException
-   *           if an input error occurred
-   */
-  private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {
-
-    Reader reader = new InputStreamReader(stream, "UTF-8");
-    xhtml.startDocument();
-    xhtml.startElement("div");
-    try {
-      char[] buffer = new char[1024];
-      for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
-        if (n > 0)
-          xhtml.characters(buffer, 0, n);
-      }
-    } finally {
-      reader.close();
+            }
+
+            // Temporary workaround for TIKA-1445 - until we can specify
+            //  composite parsers with strategies (eg Composite, Try In Turn),
+            //  always send the image onwards to the regular parser to have
+            //  the metadata for them extracted as well
+            _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, handler, metadata, context);
+        } finally {
+            tmp.dispose();
+            if (output != null) {
+                output.delete();
+            }
+        }
+    }
+    // TIKA-1445 workaround parser
+    private static Parser _TMP_IMAGE_METADATA_PARSER = new CompositeImageParser();
+    private static class CompositeImageParser extends CompositeParser {
+        private static final long serialVersionUID = -2398203346206381382L;
+        private static List<Parser> imageParsers = Arrays.asList(new Parser[]{
+                new ImageParser(), new JpegParser(), new TiffParser()
+        });
+        CompositeImageParser() {
+            super(new MediaTypeRegistry(), imageParsers);
+        }
     }
-    xhtml.endElement("div");
-    xhtml.endDocument();
-  }
-
-  /**
-   * Starts a thread that reads the contents of the standard output or error
-   * stream of the given process to not block the process. The stream is closed
-   * once fully processed.
-   */
-  private void logStream(final String logType, final InputStream stream, final File file) {
-    new Thread() {
-      public void run() {
-        Reader reader = new InputStreamReader(stream);
-        StringBuilder out = new StringBuilder();
-        char[] buffer = new char[1024];
+
+    /**
+     * Run external tesseract-ocr process.
+     *
+     * @param input
+     *          File to be ocred
+     * @param output
+     *          File to collect ocr result
+     * @param config
+     *          Configuration of tesseract-ocr engine
+     * @throws TikaException
+     *           if the extraction timed out
+     * @throws IOException
+     *           if an input error occurred
+     */
+    private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
+        String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
+                config.getLanguage(), "-psm", config.getPageSegMode() };
+
+        ProcessBuilder pb = new ProcessBuilder(cmd);
+        setEnv(config, pb);
+        final Process process = pb.start();
+
+        process.getOutputStream().close();
+        InputStream out = process.getInputStream();
+        InputStream err = process.getErrorStream();
+
+        logStream("OCR MSG", out, input);
+        logStream("OCR ERROR", err, input);
+
+        FutureTask<Integer> waitTask = new FutureTask<Integer>(new Callable<Integer>() {
+            public Integer call() throws Exception {
+                return process.waitFor();
+            }
+        });
+
+        Thread waitThread = new Thread(waitTask);
+        waitThread.start();
+
         try {
-          for (int n = reader.read(buffer); n != -1; n = reader.read(buffer))
-            out.append(buffer, 0, n);
-        } catch (IOException e) {
+            waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
+
+        } catch (InterruptedException e) {
+            waitThread.interrupt();
+            process.destroy();
+            Thread.currentThread().interrupt();
+            throw new TikaException("TesseractOCRParser interrupted", e);
+
+        } catch (ExecutionException e) {
+            // should not be thrown
+
+        } catch (TimeoutException e) {
+            waitThread.interrupt();
+            process.destroy();
+            throw new TikaException("TesseractOCRParser timeout", e);
+        }
+
+    }
 
+    /**
+     * Reads the contents of the given stream and write it to the given XHTML
+     * content handler. The stream is closed once fully processed.
+     *
+     * @param stream
+     *          Stream where is the result of ocr
+     * @param xhtml
+     *          XHTML content handler
+     * @throws SAXException
+     *           if the XHTML SAX events could not be handled
+     * @throws IOException
+     *           if an input error occurred
+     */
+    private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {
+
+        Reader reader = new InputStreamReader(stream, "UTF-8");
+        xhtml.startDocument();
+        xhtml.startElement("div");
+        try {
+            char[] buffer = new char[1024];
+            for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
+                if (n > 0)
+                    xhtml.characters(buffer, 0, n);
+            }
         } finally {
-          IOUtils.closeQuietly(stream);
+            reader.close();
         }
+        xhtml.endElement("div");
+        xhtml.endDocument();
+    }
 
-        String msg = out.toString();
-        // log or discard message?
+    /**
+     * Starts a thread that reads the contents of the standard output or error
+     * stream of the given process to not block the process. The stream is closed
+     * once fully processed.
+     */
+    private void logStream(final String logType, final InputStream stream, final File file) {
+        new Thread() {
+            public void run() {
+                Reader reader = new InputStreamReader(stream);
+                StringBuilder out = new StringBuilder();
+                char[] buffer = new char[1024];
+                try {
+                    for (int n = reader.read(buffer); n != -1; n = reader.read(buffer))
+                        out.append(buffer, 0, n);
+                } catch (IOException e) {
+
+                } finally {
+                    IOUtils.closeQuietly(stream);
+                }
 
-      }
-    }.start();
-  }
-  
-  static String getTesseractProg() {
-    return System.getProperty("os.name").startsWith("Windows") ? "tesseract.exe" : "tesseract";
-  }
+                String msg = out.toString();
+                // log or discard message?
+
+            }
+        }.start();
+    }
+
+    static String getTesseractProg() {
+        return System.getProperty("os.name").startsWith("Windows") ? "tesseract.exe" : "tesseract";
+    }
 
 }