You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@tika.apache.org by th...@apache.org on 2014/10/21 11:32:07 UTC

svn commit: r1633325 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java test/java/org/apache/tika/parser/mail/RFC822ParserTest.java

Author: thaichat04
Date: Tue Oct 21 09:32:06 2014
New Revision: 1633325

URL: http://svn.apache.org/r1633325
Log:
TIKA-1422 - Apply fix of [~olegt] in Windows

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java?rev=1633325&r1=1633324&r2=1633325&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java Tue Oct 21 09:32:06 2014
@@ -26,11 +26,11 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
+import java.util.ArrayList;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
-import java.util.List;
-import java.util.ArrayList;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.FutureTask;
@@ -45,20 +45,23 @@ import org.apache.tika.io.TemporaryResou
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.parser.image.ImageParser;
+import org.apache.tika.parser.image.PSDParser;
+import org.apache.tika.parser.image.TiffParser;
+import org.apache.tika.parser.jpeg.JpegParser;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 /**
- * TesseractOCRParser powered by tesseract-ocr engine.
- * To enable this parser, create a {@link TesseractOCRConfig}
- * object and pass it through a ParseContext.
- * Tesseract-ocr must be installed and on system path or
- * the path to its root folder must be provided:
+ * TesseractOCRParser powered by tesseract-ocr engine. To enable this parser,
+ * create a {@link TesseractOCRConfig} object and pass it through a
+ * ParseContext. Tesseract-ocr must be installed and on system path or the path
+ * to its root folder must be provided:
  * <p>
  * TesseractOCRConfig config = new TesseractOCRConfig();<br>
  * //Needed if tesseract is not on system path<br>
@@ -69,226 +72,231 @@ import org.xml.sax.SAXException;
  * 
  */
 public class TesseractOCRParser extends AbstractParser {
-	
-	private static final long serialVersionUID = 1L;
-	
-	private static final Set<MediaType> SUPPORTED_TYPES = getTypes();
-	
-	private static Set<MediaType> getTypes() {
-		HashSet<MediaType> supportedTypes = new HashSet<MediaType>();
-		
-		supportedTypes.add(MediaType.image("png"));
-		supportedTypes.add(MediaType.image("jpeg"));
-		supportedTypes.add(MediaType.image("tiff"));
-		supportedTypes.add(MediaType.image("x-ms-bmp"));
-		supportedTypes.add(MediaType.image("gif"));
-		
-		return supportedTypes;
-	}
-	
-	@Override
-	public Set<MediaType> getSupportedTypes(ParseContext arg0) {
-		return SUPPORTED_TYPES;
-	}
-
-    private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
-        if(!config.getTesseractPath().isEmpty()){
-            Map<String, String> env = pb.environment();
-            env.put("TESSDATA_PREFIX", config.getTesseractPath());
-        }
+
+  private static final long serialVersionUID = 1L;
+
+  private static final Set<MediaType> SUPPORTED_TYPES = getTypes();
+
+  private static Set<MediaType> getTypes() {
+    HashSet<MediaType> supportedTypes = new HashSet<MediaType>();
+
+    supportedTypes.add(MediaType.image("png"));
+    supportedTypes.add(MediaType.image("jpeg"));
+    supportedTypes.add(MediaType.image("tiff"));
+    supportedTypes.add(MediaType.image("x-ms-bmp"));
+    supportedTypes.add(MediaType.image("gif"));
+
+    return supportedTypes;
+  }
+
+  @Override
+  public Set<MediaType> getSupportedTypes(ParseContext arg0) {
+    return SUPPORTED_TYPES;
+  }
+
+  private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
+    if (!config.getTesseractPath().isEmpty()) {
+      Map<String, String> env = pb.environment();
+      env.put("TESSDATA_PREFIX", config.getTesseractPath());
     }
-	
-	public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-
-		TemporaryResources tmp = new TemporaryResources();
-		FileOutputStream fos = null;
-		TikaInputStream tis = null;
-		try{
-			int w = image.getWidth(null);
-	        int h = image.getHeight(null);
-	        BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);
-	        Graphics2D g2 = bImage.createGraphics();
-	        g2.drawImage(image, 0, 0, null);
-	        g2.dispose();
-	        File file = tmp.createTemporaryFile();
-			fos = new FileOutputStream(file);
-			ImageIO.write(bImage, "png", fos);
-			bImage = null;
-			tis = TikaInputStream.get(file);
-			parse(tis, handler, metadata, context);
-			
-		}finally{
-			tmp.dispose();
-			if(tis != null)
-				tis.close();
-			if(fos != null)
-				fos.close();
-		}
-		
-		
-	}
-
-	@Override
-    public void parse(
-            InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-
-    	TesseractOCRConfig config = context.get(TesseractOCRConfig.class);
-    	if(config == null) config = new TesseractOCRConfig();
-
-        String[] checkCmd = {config.getTesseractPath() + "tesseract"};
-        // If Tesseract is not on the path, do not try to run OCR.
-        if (!ExternalParser.check(checkCmd)) return;
-    	
-    	XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+  }
 
-        TemporaryResources tmp = new TemporaryResources();
-        File output = null;
-        try {
-        	TikaInputStream  tikaStream = TikaInputStream.get(stream, tmp);
-        	File input = tikaStream.getFile();
-        	long size = tikaStream.getLength();
-        	
-        	if(size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()){
-        		
-            	output = tmp.createTemporaryFile();
-            	doOCR(input, output, config);
-            	
-                //Tesseract appends .txt to output file name
-                output = new File(output.getAbsolutePath() + ".txt");
-                
-                if(output.exists())
-                	extractOutput(new FileInputStream(output), xhtml);
+  public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException,
+      SAXException, TikaException {
 
-        	}
-        
-        } finally {
-        	tmp.dispose();
-        	if(output != null)
-        		output.delete();
-            
-        }
+    TemporaryResources tmp = new TemporaryResources();
+    FileOutputStream fos = null;
+    TikaInputStream tis = null;
+    try {
+      int w = image.getWidth(null);
+      int h = image.getHeight(null);
+      BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);
+      Graphics2D g2 = bImage.createGraphics();
+      g2.drawImage(image, 0, 0, null);
+      g2.dispose();
+      File file = tmp.createTemporaryFile();
+      fos = new FileOutputStream(file);
+      ImageIO.write(bImage, "png", fos);
+      bImage = null;
+      tis = TikaInputStream.get(file);
+      parse(tis, handler, metadata, context);
+
+    } finally {
+      tmp.dispose();
+      if (tis != null)
+        tis.close();
+      if (fos != null)
+        fos.close();
     }
 
-	/**
-	 * Run external tesseract-ocr process.
-	 * @param input File to be ocred
-     * @param output File to collect ocr result
-     * @param config Configuration of tesseract-ocr engine
-     * @throws TikaException if the extraction timed out
-     * @throws IOException if an input error occurred
-	 */
-    private void doOCR(File input, File output, TesseractOCRConfig config)
-            throws IOException, TikaException {
-        String[] cmd = {config.getTesseractPath() + "tesseract",
-    					input.getPath(), 
-						output.getPath() , 
-						"-l", 
-						config.getLanguage() , 
-						"-psm", 
-						config.getPageSegMode()	};
-            
-        ProcessBuilder pb = new ProcessBuilder(cmd);
-        setEnv(config, pb);
-        final Process process = pb.start();
-            
-        process.getOutputStream().close();
-        InputStream out = process.getInputStream();
-        InputStream err = process.getErrorStream();
-            
-        logStream("OCR MSG", out, input);
-        logStream("OCR ERROR", err, input);
-           
-        FutureTask<Integer> waitTask = new FutureTask<Integer>(new Callable<Integer>() {
-        	public Integer call() throws Exception {
-          	    return process.waitFor();
-          	}
-        });
-
-        Thread waitThread = new Thread(waitTask);
-        waitThread.start();
-          
-        try {
-        	waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
-              
-        } catch (InterruptedException e) {
-        	waitThread.interrupt();
-          	process.destroy();
-          	Thread.currentThread().interrupt();
-          	throw new TikaException("TesseractOCRParser interrupted", e);
-          	
-        } catch (ExecutionException e) {
-			//should not be thrown
-				
-		} catch (TimeoutException e) {
-			waitThread.interrupt();
-			process.destroy();
-			throw new TikaException("TesseractOCRParser timeout", e);
-		}
-            	
-            
+  }
+
+  @Override
+  public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+      throws IOException, SAXException, TikaException {
+
+    TesseractOCRConfig config = context.get(TesseractOCRConfig.class);
+    if (config == null)
+      config = new TesseractOCRConfig();
+
+    String[] checkCmd = { config.getTesseractPath() + "tesseract" };
+    // If Tesseract is not on the path, do not try to run OCR.
+    if (!ExternalParser.check(checkCmd))
+      return;
+
+    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
+    TemporaryResources tmp = new TemporaryResources();
+    File output = null;
+    try {
+      TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
+      File input = tikaStream.getFile();
+      long size = tikaStream.getLength();
+
+      if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) {
+
+        output = tmp.createTemporaryFile();
+        doOCR(input, output, config);
+
+        // Tesseract appends .txt to output file name
+        output = new File(output.getAbsolutePath() + ".txt");
+
+        if (output.exists())
+          extractOutput(new FileInputStream(output), xhtml);
+
+      }
+
+    } finally {
+      tmp.dispose();
+      if (output != null)
+        output.delete();
+
     }
-    
+  }
 
-    /**
-     * Reads the contents of the given stream and write it to the 
-     * given XHTML content handler.
-     * The stream is closed once fully processed.
-     *
-     * @param stream Stream where is the result of ocr
-     * @param xhtml XHTML content handler
-     * @throws SAXException if the XHTML SAX events could not be handled
-     * @throws IOException if an input error occurred
-     */
-    private void extractOutput(InputStream stream, XHTMLContentHandler xhtml)
-	throws SAXException, IOException {
- 
-        Reader reader = new InputStreamReader(stream, "UTF-8");
-        xhtml.startDocument();
-        xhtml.startElement("div");
-        try {
-            char[] buffer = new char[1024];
-            for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
-                if (n > 0) xhtml.characters(buffer, 0, n);
-            }
-        } finally {
-            reader.close();
-        }
-        xhtml.endElement("div");
-        xhtml.endDocument();
+  /**
+   * Run external tesseract-ocr process.
+   * 
+   * @param input
+   *          File to be ocred
+   * @param output
+   *          File to collect ocr result
+   * @param config
+   *          Configuration of tesseract-ocr engine
+   * @throws TikaException
+   *           if the extraction timed out
+   * @throws IOException
+   *           if an input error occurred
+   */
+  private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
+    String[] cmd = { config.getTesseractPath() + "tesseract", input.getPath(), output.getPath(), "-l",
+        config.getLanguage(), "-psm", config.getPageSegMode() };
+
+    ProcessBuilder pb = new ProcessBuilder(cmd);
+    setEnv(config, pb);
+    final Process process = pb.start();
+
+    process.getOutputStream().close();
+    InputStream out = process.getInputStream();
+    InputStream err = process.getErrorStream();
+
+    logStream("OCR MSG", out, input);
+    logStream("OCR ERROR", err, input);
+
+    FutureTask<Integer> waitTask = new FutureTask<Integer>(new Callable<Integer>() {
+      public Integer call() throws Exception {
+        return process.waitFor();
+      }
+    });
+
+    Thread waitThread = new Thread(waitTask);
+    waitThread.start();
+
+    try {
+      waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
+
+    } catch (InterruptedException e) {
+      waitThread.interrupt();
+      process.destroy();
+      Thread.currentThread().interrupt();
+      throw new TikaException("TesseractOCRParser interrupted", e);
+
+    } catch (ExecutionException e) {
+      // should not be thrown
+
+    } catch (TimeoutException e) {
+      waitThread.interrupt();
+      process.destroy();
+      throw new TikaException("TesseractOCRParser timeout", e);
     }
 
-    /**
-     * Starts a thread that reads the contents of the standard output
-     * or error stream of the given process to not block the process.
-     * The stream is closed once fully processed.
-     */
-    private void logStream(final String logType, final InputStream stream, final File file) {
-        new Thread() {
-            public void run() {
-            	Reader reader = new InputStreamReader(stream);
-                StringBuilder out = new StringBuilder();
-                char[] buffer = new char[1024];
-                try {
-					for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) 
-						out.append(buffer, 0, n);
-				} catch (IOException e) {
-					
-				} finally {
-                    IOUtils.closeQuietly(stream);
-                }
-			
-				
-				String msg = out.toString();
-				//log or discard message?
-				
-            }
-        }.start();
+  }
+
+  /**
+   * Reads the contents of the given stream and write it to the given XHTML
+   * content handler. The stream is closed once fully processed.
+   * 
+   * @param stream
+   *          Stream where is the result of ocr
+   * @param xhtml
+   *          XHTML content handler
+   * @throws SAXException
+   *           if the XHTML SAX events could not be handled
+   * @throws IOException
+   *           if an input error occurred
+   */
+  private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {
+
+    Reader reader = new InputStreamReader(stream, "UTF-8");
+    xhtml.startDocument();
+    xhtml.startElement("div");
+    try {
+      char[] buffer = new char[1024];
+      for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
+        if (n > 0)
+          xhtml.characters(buffer, 0, n);
+      }
+    } finally {
+      reader.close();
     }
+    xhtml.endElement("div");
+    xhtml.endDocument();
+  }
+
+  /**
+   * Starts a thread that reads the contents of the standard output or error
+   * stream of the given process to not block the process. The stream is closed
+   * once fully processed.
+   */
+  private void logStream(final String logType, final InputStream stream, final File file) {
+    new Thread() {
+      public void run() {
+        Reader reader = new InputStreamReader(stream);
+        StringBuilder out = new StringBuilder();
+        char[] buffer = new char[1024];
+        try {
+          for (int n = reader.read(buffer); n != -1; n = reader.read(buffer))
+            out.append(buffer, 0, n);
+        } catch (IOException e) {
 
-	
-}
+        } finally {
+          IOUtils.closeQuietly(stream);
+        }
 
+        String msg = out.toString();
+        // log or discard message?
 
+      }
+    }.start();
+  }
+
+  private List<Parser> getImageParsers() {
+    List<Parser> parsers = new ArrayList<Parser>();
+    parsers.add(new ImageParser());
+    parsers.add(new PSDParser());
+    parsers.add(new TiffParser());
+    parsers.add(new JpegParser());
+    return parsers;
+  }
+
+}

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java?rev=1633325&r1=1633324&r2=1633325&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java Tue Oct 21 09:32:06 2014
@@ -36,6 +36,8 @@ import org.apache.tika.metadata.Metadata
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.parser.ocr.TesseractOCRParserTest;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.junit.Test;
@@ -83,13 +85,19 @@ public class RFC822ParserTest {
         try {
             parser.parse(stream, handler, metadata, new ParseContext());
             verify(handler).startDocument();
-            //4 body-part divs -- two outer bodies and two inner bodies
-            verify(handler, times(4)).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class));
-            verify(handler, times(4)).endElement(XHTMLContentHandler.XHTML, "div", "div");
-            //5 paragraph elements, 4 for body-parts and 1 for encompassing message
-            verify(handler, times(5)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
-            verify(handler, times(5)).endElement(XHTMLContentHandler.XHTML, "p", "p");
+            int bodyExpectedTimes = 4, multipackExpectedTimes = 5;;
+            int invokingTimes = bodyExpectedTimes;
+            TesseractOCRConfig config = new TesseractOCRConfig();
+            if (TesseractOCRParserTest.canRun(config)) {
+              invokingTimes = multipackExpectedTimes;
+            }
+            
+            verify(handler, times(invokingTimes)).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class));
+            verify(handler, times(invokingTimes)).endElement(XHTMLContentHandler.XHTML, "div", "div");
+            verify(handler, times(multipackExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
+            verify(handler, times(multipackExpectedTimes)).endElement(XHTMLContentHandler.XHTML, "p", "p");
             verify(handler).endDocument();
+            
         } catch (Exception e) {
             fail("Exception thrown: " + e.getMessage());
         }

Re: svn commit: r1633325 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java test/java/org/apache/tika/parser/mail/RFC822ParserTest.java

Posted by "Mattmann, Chris A (3980)" <ch...@jpl.nasa.gov>.

No worries Hong-Thai! Will update and test, thanks!

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Chris Mattmann, Ph.D.
Chief Architect
Instrument Software and Science Data Systems Section (398)
NASA Jet Propulsion Laboratory Pasadena, CA 91109 USA
Office: 168-519, Mailstop: 168-527
Email: chris.a.mattmann@nasa.gov
WWW:  http://sunset.usc.edu/~mattmann/
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adjunct Associate Professor, Computer Science Department
University of Southern California, Los Angeles, CA 90089 USA
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++






-----Original Message-----
From: Hong-Thai Nguyen <th...@gmail.com>
Reply-To: "dev@tika.apache.org" <de...@tika.apache.org>
Date: Tuesday, October 21, 2014 at 6:57 AM
To: "dev@tika.apache.org" <de...@tika.apache.org>
Subject: Re: svn commit: r1633325 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
test/java/org/apache/tika/parser/mail/RFC822ParserTest.java

>Hi Chris,
>
>Yes, I made a mistake in this commit: I missed a renamed file and broke the
>build. The next commit corrects it:
>Revision: 1633331
>Author: thaichat04
>Date: mardi 21 octobre 2014 11:47:54
>Message:
>TIKA-1422 - Fixing build & minor refactory of naming test class
>----
>Modified :
>/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822P
>arserTest.java
>Added :
>/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/Tesserac
>tOCRParserTest.java
>Deleted :
>/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/Tesserac
>tOCRTest.java
>
>Please 'pull' the latest again, then tell me if it is OK?
>
>Sorry
>
>On Tue, Oct 21, 2014 at 3:49 PM, Mattmann, Chris A (3980) <
>chris.a.mattmann@jpl.nasa.gov> wrote:
>
>> Hi Hong-Thai,
>>
>> These commits look strange to me - it looks like it subtracts the
>> whole files (and the unit test removed the test file, renamed it,
>> and then added what largely looks like the same file, back?)
>>
>> Any idea what's up?
>>
>> Cheers,
>> Chris
>>
>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>> Chris Mattmann, Ph.D.
>> Chief Architect
>> Instrument Software and Science Data Systems Section (398)
>> NASA Jet Propulsion Laboratory Pasadena, CA 91109 USA
>> Office: 168-519, Mailstop: 168-527
>> Email: chris.a.mattmann@nasa.gov
>> WWW:  http://sunset.usc.edu/~mattmann/
>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>> Adjunct Associate Professor, Computer Science Department
>> University of Southern California, Los Angeles, CA 90089 USA
>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>
>>
>>
>>
>>
>>
>> -----Original Message-----
>> From: "thaichat04@apache.org" <th...@apache.org>
>> Reply-To: "dev@tika.apache.org" <de...@tika.apache.org>
>> Date: Tuesday, October 21, 2014 at 2:32 AM
>> To: "commits@tika.apache.org" <co...@tika.apache.org>
>> Subject: svn commit: r1633325 - in /tika/trunk/tika-parsers/src:
>> main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
>> test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
>>
>> >Author: thaichat04
>> >Date: Tue Oct 21 09:32:06 2014
>> >New Revision: 1633325
>> >
>> >URL: http://svn.apache.org/r1633325
>> >Log:
>> >TIKA-1422 - Apply fix of [~olegt] in Windows
>> >
>> >Modified:
>> >
>> 
>>>tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/Tessera
>>>ct
>> >OCRParser.java
>> >
>> 
>>>tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822
>>>Pa
>> >rserTest.java
>> >
>> >Modified:
>> 
>>>tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/Tessera
>>>ct
>> >OCRParser.java
>> >URL:
>> >
>> 
>>http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/ap
>>a
>> 
>>>che/tika/parser/ocr/TesseractOCRParser.java?rev=1633325&r1=1633324&r2=16
>>>33
>> >325&view=diff
>> 
>>>========================================================================
>>>==
>> >====
>> >---
>> 
>>>tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/Tessera
>>>ct
>> >OCRParser.java (original)
>> >+++
>> 
>>>tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/Tessera
>>>ct
>> >OCRParser.java Tue Oct 21 09:32:06 2014
>> >@@ -26,11 +26,11 @@ import java.io.IOException;
>> > import java.io.InputStream;
>> > import java.io.InputStreamReader;
>> > import java.io.Reader;
>> >+import java.util.ArrayList;
>> > import java.util.HashSet;
>> >+import java.util.List;
>> > import java.util.Map;
>> > import java.util.Set;
>> >-import java.util.List;
>> >-import java.util.ArrayList;
>> > import java.util.concurrent.Callable;
>> > import java.util.concurrent.ExecutionException;
>> > import java.util.concurrent.FutureTask;
>> >@@ -45,20 +45,23 @@ import org.apache.tika.io.TemporaryResou
>> > import org.apache.tika.io.TikaInputStream;
>> > import org.apache.tika.metadata.Metadata;
>> > import org.apache.tika.mime.MediaType;
>> >-import org.apache.tika.parser.Parser;
>> > import org.apache.tika.parser.AbstractParser;
>> > import org.apache.tika.parser.ParseContext;
>> >+import org.apache.tika.parser.Parser;
>> > import org.apache.tika.parser.external.ExternalParser;
>> >+import org.apache.tika.parser.image.ImageParser;
>> >+import org.apache.tika.parser.image.PSDParser;
>> >+import org.apache.tika.parser.image.TiffParser;
>> >+import org.apache.tika.parser.jpeg.JpegParser;
>> > import org.apache.tika.sax.XHTMLContentHandler;
>> > import org.xml.sax.ContentHandler;
>> > import org.xml.sax.SAXException;
>> >
>> > /**
>> >- * TesseractOCRParser powered by tesseract-ocr engine.
>> >- * To enable this parser, create a {@link TesseractOCRConfig}
>> >- * object and pass it through a ParseContext.
>> >- * Tesseract-ocr must be installed and on system path or
>> >- * the path to its root folder must be provided:
>> >+ * TesseractOCRParser powered by tesseract-ocr engine. To enable this
>> >parser,
>> >+ * create a {@link TesseractOCRConfig} object and pass it through a
>> >+ * ParseContext. Tesseract-ocr must be installed and on system path or
>> >the path
>> >+ * to its root folder must be provided:
>> >  * <p>
>> >  * TesseractOCRConfig config = new TesseractOCRConfig();<br>
>> >  * //Needed if tesseract is not on system path<br>
>> >@@ -69,226 +72,231 @@ import org.xml.sax.SAXException;
>> >  *
>> >  */
>> > public class TesseractOCRParser extends AbstractParser {
>> >-
>> >-      private static final long serialVersionUID = 1L;
>> >-
>> >-      private static final Set<MediaType> SUPPORTED_TYPES =
>>getTypes();
>> >-
>> >-      private static Set<MediaType> getTypes() {
>> >-              HashSet<MediaType> supportedTypes = new
>> HashSet<MediaType>();
>> >-
>> >-              supportedTypes.add(MediaType.image("png"));
>> >-              supportedTypes.add(MediaType.image("jpeg"));
>> >-              supportedTypes.add(MediaType.image("tiff"));
>> >-              supportedTypes.add(MediaType.image("x-ms-bmp"));
>> >-              supportedTypes.add(MediaType.image("gif"));
>> >-
>> >-              return supportedTypes;
>> >-      }
>> >-
>> >-      @Override
>> >-      public Set<MediaType> getSupportedTypes(ParseContext arg0) {
>> >-              return SUPPORTED_TYPES;
>> >-      }
>> >-
>> >-    private void setEnv(TesseractOCRConfig config, ProcessBuilder pb)
>>{
>> >-        if(!config.getTesseractPath().isEmpty()){
>> >-            Map<String, String> env = pb.environment();
>> >-            env.put("TESSDATA_PREFIX", config.getTesseractPath());
>> >-        }
>> >+
>> >+  private static final long serialVersionUID = 1L;
>> >+
>> >+  private static final Set<MediaType> SUPPORTED_TYPES = getTypes();
>> >+
>> >+  private static Set<MediaType> getTypes() {
>> >+    HashSet<MediaType> supportedTypes = new HashSet<MediaType>();
>> >+
>> >+    supportedTypes.add(MediaType.image("png"));
>> >+    supportedTypes.add(MediaType.image("jpeg"));
>> >+    supportedTypes.add(MediaType.image("tiff"));
>> >+    supportedTypes.add(MediaType.image("x-ms-bmp"));
>> >+    supportedTypes.add(MediaType.image("gif"));
>> >+
>> >+    return supportedTypes;
>> >+  }
>> >+
>> >+  @Override
>> >+  public Set<MediaType> getSupportedTypes(ParseContext arg0) {
>> >+    return SUPPORTED_TYPES;
>> >+  }
>> >+
>> >+  private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
>> >+    if (!config.getTesseractPath().isEmpty()) {
>> >+      Map<String, String> env = pb.environment();
>> >+      env.put("TESSDATA_PREFIX", config.getTesseractPath());
>> >     }
>> >-
>> >-      public void parse(Image image, ContentHandler handler, Metadata
>> >metadata, ParseContext context)
>> >-            throws IOException, SAXException, TikaException {
>> >-
>> >-              TemporaryResources tmp = new TemporaryResources();
>> >-              FileOutputStream fos = null;
>> >-              TikaInputStream tis = null;
>> >-              try{
>> >-                      int w = image.getWidth(null);
>> >-              int h = image.getHeight(null);
>> >-              BufferedImage bImage = new BufferedImage(w, h,
>> >BufferedImage.TYPE_INT_RGB);
>> >-              Graphics2D g2 = bImage.createGraphics();
>> >-              g2.drawImage(image, 0, 0, null);
>> >-              g2.dispose();
>> >-              File file = tmp.createTemporaryFile();
>> >-                      fos = new FileOutputStream(file);
>> >-                      ImageIO.write(bImage, "png", fos);
>> >-                      bImage = null;
>> >-                      tis = TikaInputStream.get(file);
>> >-                      parse(tis, handler, metadata, context);
>> >-
>> >-              }finally{
>> >-                      tmp.dispose();
>> >-                      if(tis != null)
>> >-                              tis.close();
>> >-                      if(fos != null)
>> >-                              fos.close();
>> >-              }
>> >-
>> >-
>> >-      }
>> >-
>> >-      @Override
>> >-    public void parse(
>> >-            InputStream stream, ContentHandler handler,
>> >-            Metadata metadata, ParseContext context)
>> >-            throws IOException, SAXException, TikaException {
>> >-
>> >-      TesseractOCRConfig config =
>>context.get(TesseractOCRConfig.class);
>> >-      if(config == null) config = new TesseractOCRConfig();
>> >-
>> >-        String[] checkCmd = {config.getTesseractPath() + "tesseract"};
>> >-        // If Tesseract is not on the path, do not try to run OCR.
>> >-        if (!ExternalParser.check(checkCmd)) return;
>> >-
>> >-      XHTMLContentHandler xhtml = new XHTMLContentHandler(handler,
>> >metadata);
>> >+  }
>> >
>> >-        TemporaryResources tmp = new TemporaryResources();
>> >-        File output = null;
>> >-        try {
>> >-              TikaInputStream  tikaStream =
>>TikaInputStream.get(stream,
>> tmp);
>> >-              File input = tikaStream.getFile();
>> >-              long size = tikaStream.getLength();
>> >-
>> >-              if(size >= config.getMinFileSizeToOcr() && size <=
>> >config.getMaxFileSizeToOcr()){
>> >-
>> >-              output = tmp.createTemporaryFile();
>> >-              doOCR(input, output, config);
>> >-
>> >-                //Tesseract appends .txt to output file name
>> >-                output = new File(output.getAbsolutePath() + ".txt");
>> >-
>> >-                if(output.exists())
>> >-                      extractOutput(new FileInputStream(output),
>>xhtml);
>> >+  public void parse(Image image, ContentHandler handler, Metadata
>> >metadata, ParseContext context) throws IOException,
>> >+      SAXException, TikaException {
>> >
>> >-              }
>> >-
>> >-        } finally {
>> >-              tmp.dispose();
>> >-              if(output != null)
>> >-                      output.delete();
>> >-
>> >-        }
>> >+    TemporaryResources tmp = new TemporaryResources();
>> >+    FileOutputStream fos = null;
>> >+    TikaInputStream tis = null;
>> >+    try {
>> >+      int w = image.getWidth(null);
>> >+      int h = image.getHeight(null);
>> >+      BufferedImage bImage = new BufferedImage(w, h,
>> >BufferedImage.TYPE_INT_RGB);
>> >+      Graphics2D g2 = bImage.createGraphics();
>> >+      g2.drawImage(image, 0, 0, null);
>> >+      g2.dispose();
>> >+      File file = tmp.createTemporaryFile();
>> >+      fos = new FileOutputStream(file);
>> >+      ImageIO.write(bImage, "png", fos);
>> >+      bImage = null;
>> >+      tis = TikaInputStream.get(file);
>> >+      parse(tis, handler, metadata, context);
>> >+
>> >+    } finally {
>> >+      tmp.dispose();
>> >+      if (tis != null)
>> >+        tis.close();
>> >+      if (fos != null)
>> >+        fos.close();
>> >     }
>> >
>> >-      /**
>> >-       * Run external tesseract-ocr process.
>> >-       * @param input File to be ocred
>> >-     * @param output File to collect ocr result
>> >-     * @param config Configuration of tesseract-ocr engine
>> >-     * @throws TikaException if the extraction timed out
>> >-     * @throws IOException if an input error occurred
>> >-       */
>> >-    private void doOCR(File input, File output, TesseractOCRConfig
>> >config)
>> >-            throws IOException, TikaException {
>> >-        String[] cmd = {config.getTesseractPath() + "tesseract",
>> >-                                      input.getPath(),
>> >-                                              output.getPath() ,
>> >-                                              "-l",
>> >-                                              config.getLanguage() ,
>> >-                                              "-psm",
>> >-                                              config.getPageSegMode()
>>};
>> >-
>> >-        ProcessBuilder pb = new ProcessBuilder(cmd);
>> >-        setEnv(config, pb);
>> >-        final Process process = pb.start();
>> >-
>> >-        process.getOutputStream().close();
>> >-        InputStream out = process.getInputStream();
>> >-        InputStream err = process.getErrorStream();
>> >-
>> >-        logStream("OCR MSG", out, input);
>> >-        logStream("OCR ERROR", err, input);
>> >-
>> >-        FutureTask<Integer> waitTask = new FutureTask<Integer>(new
>> >Callable<Integer>() {
>> >-              public Integer call() throws Exception {
>> >-                  return process.waitFor();
>> >-              }
>> >-        });
>> >-
>> >-        Thread waitThread = new Thread(waitTask);
>> >-        waitThread.start();
>> >-
>> >-        try {
>> >-              waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
>> >-
>> >-        } catch (InterruptedException e) {
>> >-              waitThread.interrupt();
>> >-              process.destroy();
>> >-              Thread.currentThread().interrupt();
>> >-              throw new TikaException("TesseractOCRParser
>>interrupted",
>> e);
>> >-
>> >-        } catch (ExecutionException e) {
>> >-                      //should not be thrown
>> >-
>> >-              } catch (TimeoutException e) {
>> >-                      waitThread.interrupt();
>> >-                      process.destroy();
>> >-                      throw new TikaException("TesseractOCRParser
>> timeout", e);
>> >-              }
>> >-
>> >-
>> >+  }
>> >+
>> >+  @Override
>> >+  public void parse(InputStream stream, ContentHandler handler,
>>Metadata
>> >metadata, ParseContext context)
>> >+      throws IOException, SAXException, TikaException {
>> >+
>> >+    TesseractOCRConfig config = context.get(TesseractOCRConfig.class);
>> >+    if (config == null)
>> >+      config = new TesseractOCRConfig();
>> >+
>> >+    String[] checkCmd = { config.getTesseractPath() + "tesseract" };
>> >+    // If Tesseract is not on the path, do not try to run OCR.
>> >+    if (!ExternalParser.check(checkCmd))
>> >+      return;
>> >+
>> >+    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler,
>> >metadata);
>> >+
>> >+    TemporaryResources tmp = new TemporaryResources();
>> >+    File output = null;
>> >+    try {
>> >+      TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
>> >+      File input = tikaStream.getFile();
>> >+      long size = tikaStream.getLength();
>> >+
>> >+      if (size >= config.getMinFileSizeToOcr() && size <=
>> >config.getMaxFileSizeToOcr()) {
>> >+
>> >+        output = tmp.createTemporaryFile();
>> >+        doOCR(input, output, config);
>> >+
>> >+        // Tesseract appends .txt to output file name
>> >+        output = new File(output.getAbsolutePath() + ".txt");
>> >+
>> >+        if (output.exists())
>> >+          extractOutput(new FileInputStream(output), xhtml);
>> >+
>> >+      }
>> >+
>> >+    } finally {
>> >+      tmp.dispose();
>> >+      if (output != null)
>> >+        output.delete();
>> >+
>> >     }
>> >-
>> >+  }
>> >
>> >-    /**
>> >-     * Reads the contents of the given stream and write it to the
>> >-     * given XHTML content handler.
>> >-     * The stream is closed once fully processed.
>> >-     *
>> >-     * @param stream Stream where is the result of ocr
>> >-     * @param xhtml XHTML content handler
>> >-     * @throws SAXException if the XHTML SAX events could not be
>>handled
>> >-     * @throws IOException if an input error occurred
>> >-     */
>> >-    private void extractOutput(InputStream stream, XHTMLContentHandler
>> >xhtml)
>> >-      throws SAXException, IOException {
>> >-
>> >-        Reader reader = new InputStreamReader(stream, "UTF-8");
>> >-        xhtml.startDocument();
>> >-        xhtml.startElement("div");
>> >-        try {
>> >-            char[] buffer = new char[1024];
>> >-            for (int n = reader.read(buffer); n != -1; n =
>> >reader.read(buffer)) {
>> >-                if (n > 0) xhtml.characters(buffer, 0, n);
>> >-            }
>> >-        } finally {
>> >-            reader.close();
>> >-        }
>> >-        xhtml.endElement("div");
>> >-        xhtml.endDocument();
>> >+  /**
>> >+   * Run external tesseract-ocr process.
>> >+   *
>> >+   * @param input
>> >+   *          File to be ocred
>> >+   * @param output
>> >+   *          File to collect ocr result
>> >+   * @param config
>> >+   *          Configuration of tesseract-ocr engine
>> >+   * @throws TikaException
>> >+   *           if the extraction timed out
>> >+   * @throws IOException
>> >+   *           if an input error occurred
>> >+   */
>> >+  private void doOCR(File input, File output, TesseractOCRConfig
>>config)
>> >throws IOException, TikaException {
>> >+    String[] cmd = { config.getTesseractPath() + "tesseract",
>> >input.getPath(), output.getPath(), "-l",
>> >+        config.getLanguage(), "-psm", config.getPageSegMode() };
>> >+
>> >+    ProcessBuilder pb = new ProcessBuilder(cmd);
>> >+    setEnv(config, pb);
>> >+    final Process process = pb.start();
>> >+
>> >+    process.getOutputStream().close();
>> >+    InputStream out = process.getInputStream();
>> >+    InputStream err = process.getErrorStream();
>> >+
>> >+    logStream("OCR MSG", out, input);
>> >+    logStream("OCR ERROR", err, input);
>> >+
>> >+    FutureTask<Integer> waitTask = new FutureTask<Integer>(new
>> >Callable<Integer>() {
>> >+      public Integer call() throws Exception {
>> >+        return process.waitFor();
>> >+      }
>> >+    });
>> >+
>> >+    Thread waitThread = new Thread(waitTask);
>> >+    waitThread.start();
>> >+
>> >+    try {
>> >+      waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
>> >+
>> >+    } catch (InterruptedException e) {
>> >+      waitThread.interrupt();
>> >+      process.destroy();
>> >+      Thread.currentThread().interrupt();
>> >+      throw new TikaException("TesseractOCRParser interrupted", e);
>> >+
>> >+    } catch (ExecutionException e) {
>> >+      // should not be thrown
>> >+
>> >+    } catch (TimeoutException e) {
>> >+      waitThread.interrupt();
>> >+      process.destroy();
>> >+      throw new TikaException("TesseractOCRParser timeout", e);
>> >     }
>> >
>> >-    /**
>> >-     * Starts a thread that reads the contents of the standard output
>> >-     * or error stream of the given process to not block the process.
>> >-     * The stream is closed once fully processed.
>> >-     */
>> >-    private void logStream(final String logType, final InputStream
>> >stream, final File file) {
>> >-        new Thread() {
>> >-            public void run() {
>> >-              Reader reader = new InputStreamReader(stream);
>> >-                StringBuilder out = new StringBuilder();
>> >-                char[] buffer = new char[1024];
>> >-                try {
>> >-                                      for (int n =
>>reader.read(buffer);
>> n != -1; n = reader.read(buffer))
>> >-                                              out.append(buffer, 0,
>>n);
>> >-                              } catch (IOException e) {
>> >-
>> >-                              } finally {
>> >-                    IOUtils.closeQuietly(stream);
>> >-                }
>> >-
>> >-
>> >-                              String msg = out.toString();
>> >-                              //log or discard message?
>> >-
>> >-            }
>> >-        }.start();
>> >+  }
>> >+
>> >+  /**
>> >+   * Reads the contents of the given stream and write it to the given
>> >XHTML
>> >+   * content handler. The stream is closed once fully processed.
>> >+   *
>> >+   * @param stream
>> >+   *          Stream where is the result of ocr
>> >+   * @param xhtml
>> >+   *          XHTML content handler
>> >+   * @throws SAXException
>> >+   *           if the XHTML SAX events could not be handled
>> >+   * @throws IOException
>> >+   *           if an input error occurred
>> >+   */
>> >+  private void extractOutput(InputStream stream, XHTMLContentHandler
>> >xhtml) throws SAXException, IOException {
>> >+
>> >+    Reader reader = new InputStreamReader(stream, "UTF-8");
>> >+    xhtml.startDocument();
>> >+    xhtml.startElement("div");
>> >+    try {
>> >+      char[] buffer = new char[1024];
>> >+      for (int n = reader.read(buffer); n != -1; n =
>> >reader.read(buffer)) {
>> >+        if (n > 0)
>> >+          xhtml.characters(buffer, 0, n);
>> >+      }
>> >+    } finally {
>> >+      reader.close();
>> >     }
>> >+    xhtml.endElement("div");
>> >+    xhtml.endDocument();
>> >+  }
>> >+
>> >+  /**
>> >+   * Starts a thread that reads the contents of the standard output or
>> >error
>> >+   * stream of the given process to not block the process. The stream
>>is
>> >closed
>> >+   * once fully processed.
>> >+   */
>> >+  private void logStream(final String logType, final InputStream
>>stream,
>> >final File file) {
>> >+    new Thread() {
>> >+      public void run() {
>> >+        Reader reader = new InputStreamReader(stream);
>> >+        StringBuilder out = new StringBuilder();
>> >+        char[] buffer = new char[1024];
>> >+        try {
>> >+          for (int n = reader.read(buffer); n != -1; n =
>> >reader.read(buffer))
>> >+            out.append(buffer, 0, n);
>> >+        } catch (IOException e) {
>> >
>> >-
>> >-}
>> >+        } finally {
>> >+          IOUtils.closeQuietly(stream);
>> >+        }
>> >
>> >+        String msg = out.toString();
>> >+        // log or discard message?
>> >
>> >+      }
>> >+    }.start();
>> >+  }
>> >+
>> >+  private List<Parser> getImageParsers() {
>> >+    List<Parser> parsers = new ArrayList<Parser>();
>> >+    parsers.add(new ImageParser());
>> >+    parsers.add(new PSDParser());
>> >+    parsers.add(new TiffParser());
>> >+    parsers.add(new JpegParser());
>> >+    return parsers;
>> >+  }
>> >+
>> >+}
>> >
>> >Modified:
>> 
>>>tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822
>>>Pa
>> >rserTest.java
>> >URL:
>> >
>> 
>>http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/ap
>>a
>> 
>>>che/tika/parser/mail/RFC822ParserTest.java?rev=1633325&r1=1633324&r2=163
>>>33
>> >25&view=diff
>> 
>>>========================================================================
>>>==
>> >====
>> >---
>> 
>>>tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822
>>>Pa
>> >rserTest.java (original)
>> >+++
>> 
>>>tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822
>>>Pa
>> >rserTest.java Tue Oct 21 09:32:06 2014
>> >@@ -36,6 +36,8 @@ import org.apache.tika.metadata.Metadata
>> > import org.apache.tika.metadata.TikaCoreProperties;
>> > import org.apache.tika.parser.ParseContext;
>> > import org.apache.tika.parser.Parser;
>> >+import org.apache.tika.parser.ocr.TesseractOCRConfig;
>> >+import org.apache.tika.parser.ocr.TesseractOCRParserTest;
>> > import org.apache.tika.sax.BodyContentHandler;
>> > import org.apache.tika.sax.XHTMLContentHandler;
>> > import org.junit.Test;
>> >@@ -83,13 +85,19 @@ public class RFC822ParserTest {
>> >         try {
>> >             parser.parse(stream, handler, metadata, new
>>ParseContext());
>> >             verify(handler).startDocument();
>> >-            //4 body-part divs -- two outer bodies and two inner
>>bodies
>> >-            verify(handler,
>> >times(4)).startElement(eq(XHTMLContentHandler.XHTML), eq("div"),
>> >eq("div"), any(Attributes.class));
>> >-            verify(handler,
>> >times(4)).endElement(XHTMLContentHandler.XHTML, "div", "div");
>> >-            //5 paragraph elements, 4 for body-parts and 1 for
>> >encompassing message
>> >-            verify(handler,
>> >times(5)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"),
>> >any(Attributes.class));
>> >-            verify(handler,
>> >times(5)).endElement(XHTMLContentHandler.XHTML, "p", "p");
>> >+            int bodyExpectedTimes = 4, multipackExpectedTimes = 5;;
>> >+            int invokingTimes = bodyExpectedTimes;
>> >+            TesseractOCRConfig config = new TesseractOCRConfig();
>> >+            if (TesseractOCRParserTest.canRun(config)) {
>> >+              invokingTimes = multipackExpectedTimes;
>> >+            }
>> >+
>> >+            verify(handler,
>> >times(invokingTimes)).startElement(eq(XHTMLContentHandler.XHTML),
>> >eq("div"), eq("div"), any(Attributes.class));
>> >+            verify(handler,
>> >times(invokingTimes)).endElement(XHTMLContentHandler.XHTML, "div",
>>"div");
>> >+            verify(handler,
>> 
>>>times(multipackExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML
>>>),
>> > eq("p"), eq("p"), any(Attributes.class));
>> >+            verify(handler,
>> >times(multipackExpectedTimes)).endElement(XHTMLContentHandler.XHTML,
>>"p",
>> >"p");
>> >             verify(handler).endDocument();
>> >+
>> >         } catch (Exception e) {
>> >             fail("Exception thrown: " + e.getMessage());
>> >         }
>> >
>> >
>>
>>
>
>
>-- 
>--------------
>Hong-Thai

Re: svn commit: r1633325 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java test/java/org/apache/tika/parser/mail/RFC822ParserTest.java

Posted by Hong-Thai Nguyen <th...@gmail.com>.

Hi Chris,

Yes, I made a mistake in this commit: I missed a renamed file and broke the
build. The next commit corrected it:
Revision: 1633331
Author: thaichat04
Date: mardi 21 octobre 2014 11:47:54
Message:
TIKA-1422 - Fixing build & minor refactory of naming test class
----
Modified :
/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
Added :
/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
Deleted :
/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java

Please 'pull' the latest again, then tell me whether it is OK.

Sorry

On Tue, Oct 21, 2014 at 3:49 PM, Mattmann, Chris A (3980) <
chris.a.mattmann@jpl.nasa.gov> wrote:

> Hi Hong-Thai,
>
> These commits look strange to me - it looks like it subtracts the
> whole files (and the unit test removed the test file, renamed it,
> and then added what largely looks like the same file, back?)
>
> Any idea what's up?
>
> Cheers,
> Chris
>
> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> Chris Mattmann, Ph.D.
> Chief Architect
> Instrument Software and Science Data Systems Section (398)
> NASA Jet Propulsion Laboratory Pasadena, CA 91109 USA
> Office: 168-519, Mailstop: 168-527
> Email: chris.a.mattmann@nasa.gov
> WWW:  http://sunset.usc.edu/~mattmann/
> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> Adjunct Associate Professor, Computer Science Department
> University of Southern California, Los Angeles, CA 90089 USA
> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>
>
>
>
>
>
> -----Original Message-----
> From: "thaichat04@apache.org" <th...@apache.org>
> Reply-To: "dev@tika.apache.org" <de...@tika.apache.org>
> Date: Tuesday, October 21, 2014 at 2:32 AM
> To: "commits@tika.apache.org" <co...@tika.apache.org>
> Subject: svn commit: r1633325 - in /tika/trunk/tika-parsers/src:
> main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
> test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
>
> >Author: thaichat04
> >Date: Tue Oct 21 09:32:06 2014
> >New Revision: 1633325
> >
> >URL: http://svn.apache.org/r1633325
> >Log:
> >TIKA-1422 - Apply fix of [~olegt] in Windows
> >
> >Modified:
> >
> >tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/Tesseract
> >OCRParser.java
> >
> >tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822Pa
> >rserTest.java
> >
> >Modified:
> >tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/Tesseract
> >OCRParser.java
> >URL:
> >
> http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apa
> >che/tika/parser/ocr/TesseractOCRParser.java?rev=1633325&r1=1633324&r2=1633
> >325&view=diff
> >==========================================================================
> >====
> >---
> >tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/Tesseract
> >OCRParser.java (original)
> >+++
> >tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/Tesseract
> >OCRParser.java Tue Oct 21 09:32:06 2014
> >@@ -26,11 +26,11 @@ import java.io.IOException;
> > import java.io.InputStream;
> > import java.io.InputStreamReader;
> > import java.io.Reader;
> >+import java.util.ArrayList;
> > import java.util.HashSet;
> >+import java.util.List;
> > import java.util.Map;
> > import java.util.Set;
> >-import java.util.List;
> >-import java.util.ArrayList;
> > import java.util.concurrent.Callable;
> > import java.util.concurrent.ExecutionException;
> > import java.util.concurrent.FutureTask;
> >@@ -45,20 +45,23 @@ import org.apache.tika.io.TemporaryResou
> > import org.apache.tika.io.TikaInputStream;
> > import org.apache.tika.metadata.Metadata;
> > import org.apache.tika.mime.MediaType;
> >-import org.apache.tika.parser.Parser;
> > import org.apache.tika.parser.AbstractParser;
> > import org.apache.tika.parser.ParseContext;
> >+import org.apache.tika.parser.Parser;
> > import org.apache.tika.parser.external.ExternalParser;
> >+import org.apache.tika.parser.image.ImageParser;
> >+import org.apache.tika.parser.image.PSDParser;
> >+import org.apache.tika.parser.image.TiffParser;
> >+import org.apache.tika.parser.jpeg.JpegParser;
> > import org.apache.tika.sax.XHTMLContentHandler;
> > import org.xml.sax.ContentHandler;
> > import org.xml.sax.SAXException;
> >
> > /**
> >- * TesseractOCRParser powered by tesseract-ocr engine.
> >- * To enable this parser, create a {@link TesseractOCRConfig}
> >- * object and pass it through a ParseContext.
> >- * Tesseract-ocr must be installed and on system path or
> >- * the path to its root folder must be provided:
> >+ * TesseractOCRParser powered by tesseract-ocr engine. To enable this
> >parser,
> >+ * create a {@link TesseractOCRConfig} object and pass it through a
> >+ * ParseContext. Tesseract-ocr must be installed and on system path or
> >the path
> >+ * to its root folder must be provided:
> >  * <p>
> >  * TesseractOCRConfig config = new TesseractOCRConfig();<br>
> >  * //Needed if tesseract is not on system path<br>
> >@@ -69,226 +72,231 @@ import org.xml.sax.SAXException;
> >  *
> >  */
> > public class TesseractOCRParser extends AbstractParser {
> >-
> >-      private static final long serialVersionUID = 1L;
> >-
> >-      private static final Set<MediaType> SUPPORTED_TYPES = getTypes();
> >-
> >-      private static Set<MediaType> getTypes() {
> >-              HashSet<MediaType> supportedTypes = new
> HashSet<MediaType>();
> >-
> >-              supportedTypes.add(MediaType.image("png"));
> >-              supportedTypes.add(MediaType.image("jpeg"));
> >-              supportedTypes.add(MediaType.image("tiff"));
> >-              supportedTypes.add(MediaType.image("x-ms-bmp"));
> >-              supportedTypes.add(MediaType.image("gif"));
> >-
> >-              return supportedTypes;
> >-      }
> >-
> >-      @Override
> >-      public Set<MediaType> getSupportedTypes(ParseContext arg0) {
> >-              return SUPPORTED_TYPES;
> >-      }
> >-
> >-    private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
> >-        if(!config.getTesseractPath().isEmpty()){
> >-            Map<String, String> env = pb.environment();
> >-            env.put("TESSDATA_PREFIX", config.getTesseractPath());
> >-        }
> >+
> >+  private static final long serialVersionUID = 1L;
> >+
> >+  private static final Set<MediaType> SUPPORTED_TYPES = getTypes();
> >+
> >+  private static Set<MediaType> getTypes() {
> >+    HashSet<MediaType> supportedTypes = new HashSet<MediaType>();
> >+
> >+    supportedTypes.add(MediaType.image("png"));
> >+    supportedTypes.add(MediaType.image("jpeg"));
> >+    supportedTypes.add(MediaType.image("tiff"));
> >+    supportedTypes.add(MediaType.image("x-ms-bmp"));
> >+    supportedTypes.add(MediaType.image("gif"));
> >+
> >+    return supportedTypes;
> >+  }
> >+
> >+  @Override
> >+  public Set<MediaType> getSupportedTypes(ParseContext arg0) {
> >+    return SUPPORTED_TYPES;
> >+  }
> >+
> >+  private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
> >+    if (!config.getTesseractPath().isEmpty()) {
> >+      Map<String, String> env = pb.environment();
> >+      env.put("TESSDATA_PREFIX", config.getTesseractPath());
> >     }
> >-
> >-      public void parse(Image image, ContentHandler handler, Metadata
> >metadata, ParseContext context)
> >-            throws IOException, SAXException, TikaException {
> >-
> >-              TemporaryResources tmp = new TemporaryResources();
> >-              FileOutputStream fos = null;
> >-              TikaInputStream tis = null;
> >-              try{
> >-                      int w = image.getWidth(null);
> >-              int h = image.getHeight(null);
> >-              BufferedImage bImage = new BufferedImage(w, h,
> >BufferedImage.TYPE_INT_RGB);
> >-              Graphics2D g2 = bImage.createGraphics();
> >-              g2.drawImage(image, 0, 0, null);
> >-              g2.dispose();
> >-              File file = tmp.createTemporaryFile();
> >-                      fos = new FileOutputStream(file);
> >-                      ImageIO.write(bImage, "png", fos);
> >-                      bImage = null;
> >-                      tis = TikaInputStream.get(file);
> >-                      parse(tis, handler, metadata, context);
> >-
> >-              }finally{
> >-                      tmp.dispose();
> >-                      if(tis != null)
> >-                              tis.close();
> >-                      if(fos != null)
> >-                              fos.close();
> >-              }
> >-
> >-
> >-      }
> >-
> >-      @Override
> >-    public void parse(
> >-            InputStream stream, ContentHandler handler,
> >-            Metadata metadata, ParseContext context)
> >-            throws IOException, SAXException, TikaException {
> >-
> >-      TesseractOCRConfig config = context.get(TesseractOCRConfig.class);
> >-      if(config == null) config = new TesseractOCRConfig();
> >-
> >-        String[] checkCmd = {config.getTesseractPath() + "tesseract"};
> >-        // If Tesseract is not on the path, do not try to run OCR.
> >-        if (!ExternalParser.check(checkCmd)) return;
> >-
> >-      XHTMLContentHandler xhtml = new XHTMLContentHandler(handler,
> >metadata);
> >+  }
> >
> >-        TemporaryResources tmp = new TemporaryResources();
> >-        File output = null;
> >-        try {
> >-              TikaInputStream  tikaStream = TikaInputStream.get(stream,
> tmp);
> >-              File input = tikaStream.getFile();
> >-              long size = tikaStream.getLength();
> >-
> >-              if(size >= config.getMinFileSizeToOcr() && size <=
> >config.getMaxFileSizeToOcr()){
> >-
> >-              output = tmp.createTemporaryFile();
> >-              doOCR(input, output, config);
> >-
> >-                //Tesseract appends .txt to output file name
> >-                output = new File(output.getAbsolutePath() + ".txt");
> >-
> >-                if(output.exists())
> >-                      extractOutput(new FileInputStream(output), xhtml);
> >+  public void parse(Image image, ContentHandler handler, Metadata
> >metadata, ParseContext context) throws IOException,
> >+      SAXException, TikaException {
> >
> >-              }
> >-
> >-        } finally {
> >-              tmp.dispose();
> >-              if(output != null)
> >-                      output.delete();
> >-
> >-        }
> >+    TemporaryResources tmp = new TemporaryResources();
> >+    FileOutputStream fos = null;
> >+    TikaInputStream tis = null;
> >+    try {
> >+      int w = image.getWidth(null);
> >+      int h = image.getHeight(null);
> >+      BufferedImage bImage = new BufferedImage(w, h,
> >BufferedImage.TYPE_INT_RGB);
> >+      Graphics2D g2 = bImage.createGraphics();
> >+      g2.drawImage(image, 0, 0, null);
> >+      g2.dispose();
> >+      File file = tmp.createTemporaryFile();
> >+      fos = new FileOutputStream(file);
> >+      ImageIO.write(bImage, "png", fos);
> >+      bImage = null;
> >+      tis = TikaInputStream.get(file);
> >+      parse(tis, handler, metadata, context);
> >+
> >+    } finally {
> >+      tmp.dispose();
> >+      if (tis != null)
> >+        tis.close();
> >+      if (fos != null)
> >+        fos.close();
> >     }
> >
> >-      /**
> >-       * Run external tesseract-ocr process.
> >-       * @param input File to be ocred
> >-     * @param output File to collect ocr result
> >-     * @param config Configuration of tesseract-ocr engine
> >-     * @throws TikaException if the extraction timed out
> >-     * @throws IOException if an input error occurred
> >-       */
> >-    private void doOCR(File input, File output, TesseractOCRConfig
> >config)
> >-            throws IOException, TikaException {
> >-        String[] cmd = {config.getTesseractPath() + "tesseract",
> >-                                      input.getPath(),
> >-                                              output.getPath() ,
> >-                                              "-l",
> >-                                              config.getLanguage() ,
> >-                                              "-psm",
> >-                                              config.getPageSegMode() };
> >-
> >-        ProcessBuilder pb = new ProcessBuilder(cmd);
> >-        setEnv(config, pb);
> >-        final Process process = pb.start();
> >-
> >-        process.getOutputStream().close();
> >-        InputStream out = process.getInputStream();
> >-        InputStream err = process.getErrorStream();
> >-
> >-        logStream("OCR MSG", out, input);
> >-        logStream("OCR ERROR", err, input);
> >-
> >-        FutureTask<Integer> waitTask = new FutureTask<Integer>(new
> >Callable<Integer>() {
> >-              public Integer call() throws Exception {
> >-                  return process.waitFor();
> >-              }
> >-        });
> >-
> >-        Thread waitThread = new Thread(waitTask);
> >-        waitThread.start();
> >-
> >-        try {
> >-              waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
> >-
> >-        } catch (InterruptedException e) {
> >-              waitThread.interrupt();
> >-              process.destroy();
> >-              Thread.currentThread().interrupt();
> >-              throw new TikaException("TesseractOCRParser interrupted",
> e);
> >-
> >-        } catch (ExecutionException e) {
> >-                      //should not be thrown
> >-
> >-              } catch (TimeoutException e) {
> >-                      waitThread.interrupt();
> >-                      process.destroy();
> >-                      throw new TikaException("TesseractOCRParser
> timeout", e);
> >-              }
> >-
> >-
> >+  }
> >+
> >+  @Override
> >+  public void parse(InputStream stream, ContentHandler handler, Metadata
> >metadata, ParseContext context)
> >+      throws IOException, SAXException, TikaException {
> >+
> >+    TesseractOCRConfig config = context.get(TesseractOCRConfig.class);
> >+    if (config == null)
> >+      config = new TesseractOCRConfig();
> >+
> >+    String[] checkCmd = { config.getTesseractPath() + "tesseract" };
> >+    // If Tesseract is not on the path, do not try to run OCR.
> >+    if (!ExternalParser.check(checkCmd))
> >+      return;
> >+
> >+    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler,
> >metadata);
> >+
> >+    TemporaryResources tmp = new TemporaryResources();
> >+    File output = null;
> >+    try {
> >+      TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
> >+      File input = tikaStream.getFile();
> >+      long size = tikaStream.getLength();
> >+
> >+      if (size >= config.getMinFileSizeToOcr() && size <=
> >config.getMaxFileSizeToOcr()) {
> >+
> >+        output = tmp.createTemporaryFile();
> >+        doOCR(input, output, config);
> >+
> >+        // Tesseract appends .txt to output file name
> >+        output = new File(output.getAbsolutePath() + ".txt");
> >+
> >+        if (output.exists())
> >+          extractOutput(new FileInputStream(output), xhtml);
> >+
> >+      }
> >+
> >+    } finally {
> >+      tmp.dispose();
> >+      if (output != null)
> >+        output.delete();
> >+
> >     }
> >-
> >+  }
> >
> >-    /**
> >-     * Reads the contents of the given stream and write it to the
> >-     * given XHTML content handler.
> >-     * The stream is closed once fully processed.
> >-     *
> >-     * @param stream Stream where is the result of ocr
> >-     * @param xhtml XHTML content handler
> >-     * @throws SAXException if the XHTML SAX events could not be handled
> >-     * @throws IOException if an input error occurred
> >-     */
> >-    private void extractOutput(InputStream stream, XHTMLContentHandler
> >xhtml)
> >-      throws SAXException, IOException {
> >-
> >-        Reader reader = new InputStreamReader(stream, "UTF-8");
> >-        xhtml.startDocument();
> >-        xhtml.startElement("div");
> >-        try {
> >-            char[] buffer = new char[1024];
> >-            for (int n = reader.read(buffer); n != -1; n =
> >reader.read(buffer)) {
> >-                if (n > 0) xhtml.characters(buffer, 0, n);
> >-            }
> >-        } finally {
> >-            reader.close();
> >-        }
> >-        xhtml.endElement("div");
> >-        xhtml.endDocument();
> >+  /**
> >+   * Run external tesseract-ocr process.
> >+   *
> >+   * @param input
> >+   *          File to be ocred
> >+   * @param output
> >+   *          File to collect ocr result
> >+   * @param config
> >+   *          Configuration of tesseract-ocr engine
> >+   * @throws TikaException
> >+   *           if the extraction timed out
> >+   * @throws IOException
> >+   *           if an input error occurred
> >+   */
> >+  private void doOCR(File input, File output, TesseractOCRConfig config)
> >throws IOException, TikaException {
> >+    String[] cmd = { config.getTesseractPath() + "tesseract",
> >input.getPath(), output.getPath(), "-l",
> >+        config.getLanguage(), "-psm", config.getPageSegMode() };
> >+
> >+    ProcessBuilder pb = new ProcessBuilder(cmd);
> >+    setEnv(config, pb);
> >+    final Process process = pb.start();
> >+
> >+    process.getOutputStream().close();
> >+    InputStream out = process.getInputStream();
> >+    InputStream err = process.getErrorStream();
> >+
> >+    logStream("OCR MSG", out, input);
> >+    logStream("OCR ERROR", err, input);
> >+
> >+    FutureTask<Integer> waitTask = new FutureTask<Integer>(new
> >Callable<Integer>() {
> >+      public Integer call() throws Exception {
> >+        return process.waitFor();
> >+      }
> >+    });
> >+
> >+    Thread waitThread = new Thread(waitTask);
> >+    waitThread.start();
> >+
> >+    try {
> >+      waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
> >+
> >+    } catch (InterruptedException e) {
> >+      waitThread.interrupt();
> >+      process.destroy();
> >+      Thread.currentThread().interrupt();
> >+      throw new TikaException("TesseractOCRParser interrupted", e);
> >+
> >+    } catch (ExecutionException e) {
> >+      // should not be thrown
> >+
> >+    } catch (TimeoutException e) {
> >+      waitThread.interrupt();
> >+      process.destroy();
> >+      throw new TikaException("TesseractOCRParser timeout", e);
> >     }
> >
> >-    /**
> >-     * Starts a thread that reads the contents of the standard output
> >-     * or error stream of the given process to not block the process.
> >-     * The stream is closed once fully processed.
> >-     */
> >-    private void logStream(final String logType, final InputStream
> >stream, final File file) {
> >-        new Thread() {
> >-            public void run() {
> >-              Reader reader = new InputStreamReader(stream);
> >-                StringBuilder out = new StringBuilder();
> >-                char[] buffer = new char[1024];
> >-                try {
> >-                                      for (int n = reader.read(buffer);
> n != -1; n = reader.read(buffer))
> >-                                              out.append(buffer, 0, n);
> >-                              } catch (IOException e) {
> >-
> >-                              } finally {
> >-                    IOUtils.closeQuietly(stream);
> >-                }
> >-
> >-
> >-                              String msg = out.toString();
> >-                              //log or discard message?
> >-
> >-            }
> >-        }.start();
> >+  }
> >+
> >+  /**
> >+   * Reads the contents of the given stream and write it to the given
> >XHTML
> >+   * content handler. The stream is closed once fully processed.
> >+   *
> >+   * @param stream
> >+   *          Stream where is the result of ocr
> >+   * @param xhtml
> >+   *          XHTML content handler
> >+   * @throws SAXException
> >+   *           if the XHTML SAX events could not be handled
> >+   * @throws IOException
> >+   *           if an input error occurred
> >+   */
> >+  private void extractOutput(InputStream stream, XHTMLContentHandler
> >xhtml) throws SAXException, IOException {
> >+
> >+    Reader reader = new InputStreamReader(stream, "UTF-8");
> >+    xhtml.startDocument();
> >+    xhtml.startElement("div");
> >+    try {
> >+      char[] buffer = new char[1024];
> >+      for (int n = reader.read(buffer); n != -1; n =
> >reader.read(buffer)) {
> >+        if (n > 0)
> >+          xhtml.characters(buffer, 0, n);
> >+      }
> >+    } finally {
> >+      reader.close();
> >     }
> >+    xhtml.endElement("div");
> >+    xhtml.endDocument();
> >+  }
> >+
> >+  /**
> >+   * Starts a thread that reads the contents of the standard output or
> >error
> >+   * stream of the given process to not block the process. The stream is
> >closed
> >+   * once fully processed.
> >+   */
> >+  private void logStream(final String logType, final InputStream stream,
> >final File file) {
> >+    new Thread() {
> >+      public void run() {
> >+        Reader reader = new InputStreamReader(stream);
> >+        StringBuilder out = new StringBuilder();
> >+        char[] buffer = new char[1024];
> >+        try {
> >+          for (int n = reader.read(buffer); n != -1; n =
> >reader.read(buffer))
> >+            out.append(buffer, 0, n);
> >+        } catch (IOException e) {
> >
> >-
> >-}
> >+        } finally {
> >+          IOUtils.closeQuietly(stream);
> >+        }
> >
> >+        String msg = out.toString();
> >+        // log or discard message?
> >
> >+      }
> >+    }.start();
> >+  }
> >+
> >+  private List<Parser> getImageParsers() {
> >+    List<Parser> parsers = new ArrayList<Parser>();
> >+    parsers.add(new ImageParser());
> >+    parsers.add(new PSDParser());
> >+    parsers.add(new TiffParser());
> >+    parsers.add(new JpegParser());
> >+    return parsers;
> >+  }
> >+
> >+}
> >
> >Modified:
> >tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822Pa
> >rserTest.java
> >URL:
> >
> http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apa
> >che/tika/parser/mail/RFC822ParserTest.java?rev=1633325&r1=1633324&r2=16333
> >25&view=diff
> >==========================================================================
> >====
> >---
> >tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822Pa
> >rserTest.java (original)
> >+++
> >tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822Pa
> >rserTest.java Tue Oct 21 09:32:06 2014
> >@@ -36,6 +36,8 @@ import org.apache.tika.metadata.Metadata
> > import org.apache.tika.metadata.TikaCoreProperties;
> > import org.apache.tika.parser.ParseContext;
> > import org.apache.tika.parser.Parser;
> >+import org.apache.tika.parser.ocr.TesseractOCRConfig;
> >+import org.apache.tika.parser.ocr.TesseractOCRParserTest;
> > import org.apache.tika.sax.BodyContentHandler;
> > import org.apache.tika.sax.XHTMLContentHandler;
> > import org.junit.Test;
> >@@ -83,13 +85,19 @@ public class RFC822ParserTest {
> >         try {
> >             parser.parse(stream, handler, metadata, new ParseContext());
> >             verify(handler).startDocument();
> >-            //4 body-part divs -- two outer bodies and two inner bodies
> >-            verify(handler,
> >times(4)).startElement(eq(XHTMLContentHandler.XHTML), eq("div"),
> >eq("div"), any(Attributes.class));
> >-            verify(handler,
> >times(4)).endElement(XHTMLContentHandler.XHTML, "div", "div");
> >-            //5 paragraph elements, 4 for body-parts and 1 for
> >encompassing message
> >-            verify(handler,
> >times(5)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"),
> >any(Attributes.class));
> >-            verify(handler,
> >times(5)).endElement(XHTMLContentHandler.XHTML, "p", "p");
> >+            int bodyExpectedTimes = 4, multipackExpectedTimes = 5;;
> >+            int invokingTimes = bodyExpectedTimes;
> >+            TesseractOCRConfig config = new TesseractOCRConfig();
> >+            if (TesseractOCRParserTest.canRun(config)) {
> >+              invokingTimes = multipackExpectedTimes;
> >+            }
> >+
> >+            verify(handler,
> >times(invokingTimes)).startElement(eq(XHTMLContentHandler.XHTML),
> >eq("div"), eq("div"), any(Attributes.class));
> >+            verify(handler,
> >times(invokingTimes)).endElement(XHTMLContentHandler.XHTML, "div", "div");
> >+            verify(handler,
> >times(multipackExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML),
> > eq("p"), eq("p"), any(Attributes.class));
> >+            verify(handler,
> >times(multipackExpectedTimes)).endElement(XHTMLContentHandler.XHTML, "p",
> >"p");
> >             verify(handler).endDocument();
> >+
> >         } catch (Exception e) {
> >             fail("Exception thrown: " + e.getMessage());
> >         }
> >
> >
>
>


-- 
--------------
Hong-Thai

Re: svn commit: r1633325 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java test/java/org/apache/tika/parser/mail/RFC822ParserTest.java

Posted by "Mattmann, Chris A (3980)" <ch...@jpl.nasa.gov>.

Hi Hong-Thai,

These commits look strange to me - it looks like it subtracts the
whole files (and the unit test removed the test file, renamed it,
and then added what largely looks like the same file, back?)

Any idea what's up?

Cheers,
Chris

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Chris Mattmann, Ph.D.
Chief Architect
Instrument Software and Science Data Systems Section (398)
NASA Jet Propulsion Laboratory Pasadena, CA 91109 USA
Office: 168-519, Mailstop: 168-527
Email: chris.a.mattmann@nasa.gov
WWW:  http://sunset.usc.edu/~mattmann/
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adjunct Associate Professor, Computer Science Department
University of Southern California, Los Angeles, CA 90089 USA
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++






-----Original Message-----
From: "thaichat04@apache.org" <th...@apache.org>
Reply-To: "dev@tika.apache.org" <de...@tika.apache.org>
Date: Tuesday, October 21, 2014 at 2:32 AM
To: "commits@tika.apache.org" <co...@tika.apache.org>
Subject: svn commit: r1633325 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
test/java/org/apache/tika/parser/mail/RFC822ParserTest.java

>Author: thaichat04
>Date: Tue Oct 21 09:32:06 2014
>New Revision: 1633325
>
>URL: http://svn.apache.org/r1633325
>Log:
>TIKA-1422 - Apply fix of [~olegt] in Windows
>
>Modified:
>    
>tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/Tesseract
>OCRParser.java
>    
>tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822Pa
>rserTest.java
>
>Modified: 
>tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/Tesseract
>OCRParser.java
>URL: 
>http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apa
>che/tika/parser/ocr/TesseractOCRParser.java?rev=1633325&r1=1633324&r2=1633
>325&view=diff
>==========================================================================
>====
>--- 
>tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/Tesseract
>OCRParser.java (original)
>+++ 
>tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/Tesseract
>OCRParser.java Tue Oct 21 09:32:06 2014
>@@ -26,11 +26,11 @@ import java.io.IOException;
> import java.io.InputStream;
> import java.io.InputStreamReader;
> import java.io.Reader;
>+import java.util.ArrayList;
> import java.util.HashSet;
>+import java.util.List;
> import java.util.Map;
> import java.util.Set;
>-import java.util.List;
>-import java.util.ArrayList;
> import java.util.concurrent.Callable;
> import java.util.concurrent.ExecutionException;
> import java.util.concurrent.FutureTask;
>@@ -45,20 +45,23 @@ import org.apache.tika.io.TemporaryResou
> import org.apache.tika.io.TikaInputStream;
> import org.apache.tika.metadata.Metadata;
> import org.apache.tika.mime.MediaType;
>-import org.apache.tika.parser.Parser;
> import org.apache.tika.parser.AbstractParser;
> import org.apache.tika.parser.ParseContext;
>+import org.apache.tika.parser.Parser;
> import org.apache.tika.parser.external.ExternalParser;
>+import org.apache.tika.parser.image.ImageParser;
>+import org.apache.tika.parser.image.PSDParser;
>+import org.apache.tika.parser.image.TiffParser;
>+import org.apache.tika.parser.jpeg.JpegParser;
> import org.apache.tika.sax.XHTMLContentHandler;
> import org.xml.sax.ContentHandler;
> import org.xml.sax.SAXException;
> 
> /**
>- * TesseractOCRParser powered by tesseract-ocr engine.
>- * To enable this parser, create a {@link TesseractOCRConfig}
>- * object and pass it through a ParseContext.
>- * Tesseract-ocr must be installed and on system path or
>- * the path to its root folder must be provided:
>+ * TesseractOCRParser powered by tesseract-ocr engine. To enable this
>parser,
>+ * create a {@link TesseractOCRConfig} object and pass it through a
>+ * ParseContext. Tesseract-ocr must be installed and on system path or
>the path
>+ * to its root folder must be provided:
>  * <p>
>  * TesseractOCRConfig config = new TesseractOCRConfig();<br>
>  * //Needed if tesseract is not on system path<br>
>@@ -69,226 +72,231 @@ import org.xml.sax.SAXException;
>  * 
>  */
> public class TesseractOCRParser extends AbstractParser {
>-	
>-	private static final long serialVersionUID = 1L;
>-	
>-	private static final Set<MediaType> SUPPORTED_TYPES = getTypes();
>-	
>-	private static Set<MediaType> getTypes() {
>-		HashSet<MediaType> supportedTypes = new HashSet<MediaType>();
>-		
>-		supportedTypes.add(MediaType.image("png"));
>-		supportedTypes.add(MediaType.image("jpeg"));
>-		supportedTypes.add(MediaType.image("tiff"));
>-		supportedTypes.add(MediaType.image("x-ms-bmp"));
>-		supportedTypes.add(MediaType.image("gif"));
>-		
>-		return supportedTypes;
>-	}
>-	
>-	@Override
>-	public Set<MediaType> getSupportedTypes(ParseContext arg0) {
>-		return SUPPORTED_TYPES;
>-	}
>-
>-    private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
>-        if(!config.getTesseractPath().isEmpty()){
>-            Map<String, String> env = pb.environment();
>-            env.put("TESSDATA_PREFIX", config.getTesseractPath());
>-        }
>+
>+  private static final long serialVersionUID = 1L;
>+
>+  private static final Set<MediaType> SUPPORTED_TYPES = getTypes();
>+
>+  private static Set<MediaType> getTypes() {
>+    HashSet<MediaType> supportedTypes = new HashSet<MediaType>();
>+
>+    supportedTypes.add(MediaType.image("png"));
>+    supportedTypes.add(MediaType.image("jpeg"));
>+    supportedTypes.add(MediaType.image("tiff"));
>+    supportedTypes.add(MediaType.image("x-ms-bmp"));
>+    supportedTypes.add(MediaType.image("gif"));
>+
>+    return supportedTypes;
>+  }
>+
>+  @Override
>+  public Set<MediaType> getSupportedTypes(ParseContext arg0) {
>+    return SUPPORTED_TYPES;
>+  }
>+
>+  private void setEnv(TesseractOCRConfig config, ProcessBuilder pb) {
>+    if (!config.getTesseractPath().isEmpty()) {
>+      Map<String, String> env = pb.environment();
>+      env.put("TESSDATA_PREFIX", config.getTesseractPath());
>     }
>-	
>-	public void parse(Image image, ContentHandler handler, Metadata
>metadata, ParseContext context)
>-            throws IOException, SAXException, TikaException {
>-
>-		TemporaryResources tmp = new TemporaryResources();
>-		FileOutputStream fos = null;
>-		TikaInputStream tis = null;
>-		try{
>-			int w = image.getWidth(null);
>-	        int h = image.getHeight(null);
>-	        BufferedImage bImage = new BufferedImage(w, h,
>BufferedImage.TYPE_INT_RGB);
>-	        Graphics2D g2 = bImage.createGraphics();
>-	        g2.drawImage(image, 0, 0, null);
>-	        g2.dispose();
>-	        File file = tmp.createTemporaryFile();
>-			fos = new FileOutputStream(file);
>-			ImageIO.write(bImage, "png", fos);
>-			bImage = null;
>-			tis = TikaInputStream.get(file);
>-			parse(tis, handler, metadata, context);
>-			
>-		}finally{
>-			tmp.dispose();
>-			if(tis != null)
>-				tis.close();
>-			if(fos != null)
>-				fos.close();
>-		}
>-		
>-		
>-	}
>-
>-	@Override
>-    public void parse(
>-            InputStream stream, ContentHandler handler,
>-            Metadata metadata, ParseContext context)
>-            throws IOException, SAXException, TikaException {
>-
>-    	TesseractOCRConfig config = context.get(TesseractOCRConfig.class);
>-    	if(config == null) config = new TesseractOCRConfig();
>-
>-        String[] checkCmd = {config.getTesseractPath() + "tesseract"};
>-        // If Tesseract is not on the path, do not try to run OCR.
>-        if (!ExternalParser.check(checkCmd)) return;
>-    	
>-    	XHTMLContentHandler xhtml = new XHTMLContentHandler(handler,
>metadata);
>+  }
> 
>-        TemporaryResources tmp = new TemporaryResources();
>-        File output = null;
>-        try {
>-        	TikaInputStream  tikaStream = TikaInputStream.get(stream, tmp);
>-        	File input = tikaStream.getFile();
>-        	long size = tikaStream.getLength();
>-        	
>-        	if(size >= config.getMinFileSizeToOcr() && size <=
>config.getMaxFileSizeToOcr()){
>-        		
>-            	output = tmp.createTemporaryFile();
>-            	doOCR(input, output, config);
>-            	
>-                //Tesseract appends .txt to output file name
>-                output = new File(output.getAbsolutePath() + ".txt");
>-                
>-                if(output.exists())
>-                	extractOutput(new FileInputStream(output), xhtml);
>+  public void parse(Image image, ContentHandler handler, Metadata
>metadata, ParseContext context) throws IOException,
>+      SAXException, TikaException {
> 
>-        	}
>-        
>-        } finally {
>-        	tmp.dispose();
>-        	if(output != null)
>-        		output.delete();
>-            
>-        }
>+    TemporaryResources tmp = new TemporaryResources();
>+    FileOutputStream fos = null;
>+    TikaInputStream tis = null;
>+    try {
>+      int w = image.getWidth(null);
>+      int h = image.getHeight(null);
>+      BufferedImage bImage = new BufferedImage(w, h,
>BufferedImage.TYPE_INT_RGB);
>+      Graphics2D g2 = bImage.createGraphics();
>+      g2.drawImage(image, 0, 0, null);
>+      g2.dispose();
>+      File file = tmp.createTemporaryFile();
>+      fos = new FileOutputStream(file);
>+      ImageIO.write(bImage, "png", fos);
>+      bImage = null;
>+      tis = TikaInputStream.get(file);
>+      parse(tis, handler, metadata, context);
>+
>+    } finally {
>+      tmp.dispose();
>+      if (tis != null)
>+        tis.close();
>+      if (fos != null)
>+        fos.close();
>     }
> 
>-	/**
>-	 * Run external tesseract-ocr process.
>-	 * @param input File to be ocred
>-     * @param output File to collect ocr result
>-     * @param config Configuration of tesseract-ocr engine
>-     * @throws TikaException if the extraction timed out
>-     * @throws IOException if an input error occurred
>-	 */
>-    private void doOCR(File input, File output, TesseractOCRConfig
>config)
>-            throws IOException, TikaException {
>-        String[] cmd = {config.getTesseractPath() + "tesseract",
>-    					input.getPath(),
>-						output.getPath() ,
>-						"-l", 
>-						config.getLanguage() ,
>-						"-psm", 
>-						config.getPageSegMode()	};
>-            
>-        ProcessBuilder pb = new ProcessBuilder(cmd);
>-        setEnv(config, pb);
>-        final Process process = pb.start();
>-            
>-        process.getOutputStream().close();
>-        InputStream out = process.getInputStream();
>-        InputStream err = process.getErrorStream();
>-            
>-        logStream("OCR MSG", out, input);
>-        logStream("OCR ERROR", err, input);
>-           
>-        FutureTask<Integer> waitTask = new FutureTask<Integer>(new
>Callable<Integer>() {
>-        	public Integer call() throws Exception {
>-          	    return process.waitFor();
>-          	}
>-        });
>-
>-        Thread waitThread = new Thread(waitTask);
>-        waitThread.start();
>-          
>-        try {
>-        	waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
>-              
>-        } catch (InterruptedException e) {
>-        	waitThread.interrupt();
>-          	process.destroy();
>-          	Thread.currentThread().interrupt();
>-          	throw new TikaException("TesseractOCRParser interrupted", e);
>-          	
>-        } catch (ExecutionException e) {
>-			//should not be thrown
>-				
>-		} catch (TimeoutException e) {
>-			waitThread.interrupt();
>-			process.destroy();
>-			throw new TikaException("TesseractOCRParser timeout", e);
>-		}
>-            	
>-            
>+  }
>+
>+  @Override
>+  public void parse(InputStream stream, ContentHandler handler, Metadata
>metadata, ParseContext context)
>+      throws IOException, SAXException, TikaException {
>+
>+    TesseractOCRConfig config = context.get(TesseractOCRConfig.class);
>+    if (config == null)
>+      config = new TesseractOCRConfig();
>+
>+    String[] checkCmd = { config.getTesseractPath() + "tesseract" };
>+    // If Tesseract is not on the path, do not try to run OCR.
>+    if (!ExternalParser.check(checkCmd))
>+      return;
>+
>+    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler,
>metadata);
>+
>+    TemporaryResources tmp = new TemporaryResources();
>+    File output = null;
>+    try {
>+      TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
>+      File input = tikaStream.getFile();
>+      long size = tikaStream.getLength();
>+
>+      if (size >= config.getMinFileSizeToOcr() && size <=
>config.getMaxFileSizeToOcr()) {
>+
>+        output = tmp.createTemporaryFile();
>+        doOCR(input, output, config);
>+
>+        // Tesseract appends .txt to output file name
>+        output = new File(output.getAbsolutePath() + ".txt");
>+
>+        if (output.exists())
>+          extractOutput(new FileInputStream(output), xhtml);
>+
>+      }
>+
>+    } finally {
>+      tmp.dispose();
>+      if (output != null)
>+        output.delete();
>+
>     }
>-    
>+  }
> 
>-    /**
>-     * Reads the contents of the given stream and write it to the
>-     * given XHTML content handler.
>-     * The stream is closed once fully processed.
>-     *
>-     * @param stream Stream where is the result of ocr
>-     * @param xhtml XHTML content handler
>-     * @throws SAXException if the XHTML SAX events could not be handled
>-     * @throws IOException if an input error occurred
>-     */
>-    private void extractOutput(InputStream stream, XHTMLContentHandler
>xhtml)
>-	throws SAXException, IOException {
>- 
>-        Reader reader = new InputStreamReader(stream, "UTF-8");
>-        xhtml.startDocument();
>-        xhtml.startElement("div");
>-        try {
>-            char[] buffer = new char[1024];
>-            for (int n = reader.read(buffer); n != -1; n =
>reader.read(buffer)) {
>-                if (n > 0) xhtml.characters(buffer, 0, n);
>-            }
>-        } finally {
>-            reader.close();
>-        }
>-        xhtml.endElement("div");
>-        xhtml.endDocument();
>+  /**
>+   * Run external tesseract-ocr process.
>+   * 
>+   * @param input
>+   *          File to be ocred
>+   * @param output
>+   *          File to collect ocr result
>+   * @param config
>+   *          Configuration of tesseract-ocr engine
>+   * @throws TikaException
>+   *           if the extraction timed out
>+   * @throws IOException
>+   *           if an input error occurred
>+   */
>+  private void doOCR(File input, File output, TesseractOCRConfig config)
>throws IOException, TikaException {
>+    String[] cmd = { config.getTesseractPath() + "tesseract",
>input.getPath(), output.getPath(), "-l",
>+        config.getLanguage(), "-psm", config.getPageSegMode() };
>+
>+    ProcessBuilder pb = new ProcessBuilder(cmd);
>+    setEnv(config, pb);
>+    final Process process = pb.start();
>+
>+    process.getOutputStream().close();
>+    InputStream out = process.getInputStream();
>+    InputStream err = process.getErrorStream();
>+
>+    logStream("OCR MSG", out, input);
>+    logStream("OCR ERROR", err, input);
>+
>+    FutureTask<Integer> waitTask = new FutureTask<Integer>(new
>Callable<Integer>() {
>+      public Integer call() throws Exception {
>+        return process.waitFor();
>+      }
>+    });
>+
>+    Thread waitThread = new Thread(waitTask);
>+    waitThread.start();
>+
>+    try {
>+      waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
>+
>+    } catch (InterruptedException e) {
>+      waitThread.interrupt();
>+      process.destroy();
>+      Thread.currentThread().interrupt();
>+      throw new TikaException("TesseractOCRParser interrupted", e);
>+
>+    } catch (ExecutionException e) {
>+      // should not be thrown
>+
>+    } catch (TimeoutException e) {
>+      waitThread.interrupt();
>+      process.destroy();
>+      throw new TikaException("TesseractOCRParser timeout", e);
>     }
> 
>-    /**
>-     * Starts a thread that reads the contents of the standard output
>-     * or error stream of the given process to not block the process.
>-     * The stream is closed once fully processed.
>-     */
>-    private void logStream(final String logType, final InputStream
>stream, final File file) {
>-        new Thread() {
>-            public void run() {
>-            	Reader reader = new InputStreamReader(stream);
>-                StringBuilder out = new StringBuilder();
>-                char[] buffer = new char[1024];
>-                try {
>-					for (int n = reader.read(buffer); n != -1; n = reader.read(buffer))
>-						out.append(buffer, 0, n);
>-				} catch (IOException e) {
>-					
>-				} finally {
>-                    IOUtils.closeQuietly(stream);
>-                }
>-			
>-				
>-				String msg = out.toString();
>-				//log or discard message?
>-				
>-            }
>-        }.start();
>+  }
>+
>+  /**
>+   * Reads the contents of the given stream and write it to the given
>XHTML
>+   * content handler. The stream is closed once fully processed.
>+   * 
>+   * @param stream
>+   *          Stream where is the result of ocr
>+   * @param xhtml
>+   *          XHTML content handler
>+   * @throws SAXException
>+   *           if the XHTML SAX events could not be handled
>+   * @throws IOException
>+   *           if an input error occurred
>+   */
>+  private void extractOutput(InputStream stream, XHTMLContentHandler
>xhtml) throws SAXException, IOException {
>+
>+    Reader reader = new InputStreamReader(stream, "UTF-8");
>+    xhtml.startDocument();
>+    xhtml.startElement("div");
>+    try {
>+      char[] buffer = new char[1024];
>+      for (int n = reader.read(buffer); n != -1; n =
>reader.read(buffer)) {
>+        if (n > 0)
>+          xhtml.characters(buffer, 0, n);
>+      }
>+    } finally {
>+      reader.close();
>     }
>+    xhtml.endElement("div");
>+    xhtml.endDocument();
>+  }
>+
>+  /**
>+   * Starts a thread that reads the contents of the standard output or
>error
>+   * stream of the given process to not block the process. The stream is
>closed
>+   * once fully processed.
>+   */
>+  private void logStream(final String logType, final InputStream stream,
>final File file) {
>+    new Thread() {
>+      public void run() {
>+        Reader reader = new InputStreamReader(stream);
>+        StringBuilder out = new StringBuilder();
>+        char[] buffer = new char[1024];
>+        try {
>+          for (int n = reader.read(buffer); n != -1; n =
>reader.read(buffer))
>+            out.append(buffer, 0, n);
>+        } catch (IOException e) {
> 
>-	
>-}
>+        } finally {
>+          IOUtils.closeQuietly(stream);
>+        }
> 
>+        String msg = out.toString();
>+        // log or discard message?
> 
>+      }
>+    }.start();
>+  }
>+
>+  private List<Parser> getImageParsers() {
>+    List<Parser> parsers = new ArrayList<Parser>();
>+    parsers.add(new ImageParser());
>+    parsers.add(new PSDParser());
>+    parsers.add(new TiffParser());
>+    parsers.add(new JpegParser());
>+    return parsers;
>+  }
>+
>+}
>
>Modified: 
>tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822Pa
>rserTest.java
>URL: 
>http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apa
>che/tika/parser/mail/RFC822ParserTest.java?rev=1633325&r1=1633324&r2=16333
>25&view=diff
>==========================================================================
>====
>--- 
>tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822Pa
>rserTest.java (original)
>+++ 
>tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822Pa
>rserTest.java Tue Oct 21 09:32:06 2014
>@@ -36,6 +36,8 @@ import org.apache.tika.metadata.Metadata
> import org.apache.tika.metadata.TikaCoreProperties;
> import org.apache.tika.parser.ParseContext;
> import org.apache.tika.parser.Parser;
>+import org.apache.tika.parser.ocr.TesseractOCRConfig;
>+import org.apache.tika.parser.ocr.TesseractOCRParserTest;
> import org.apache.tika.sax.BodyContentHandler;
> import org.apache.tika.sax.XHTMLContentHandler;
> import org.junit.Test;
>@@ -83,13 +85,19 @@ public class RFC822ParserTest {
>         try {
>             parser.parse(stream, handler, metadata, new ParseContext());
>             verify(handler).startDocument();
>-            //4 body-part divs -- two outer bodies and two inner bodies
>-            verify(handler,
>times(4)).startElement(eq(XHTMLContentHandler.XHTML), eq("div"),
>eq("div"), any(Attributes.class));
>-            verify(handler,
>times(4)).endElement(XHTMLContentHandler.XHTML, "div", "div");
>-            //5 paragraph elements, 4 for body-parts and 1 for
>encompassing message
>-            verify(handler,
>times(5)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"),
>any(Attributes.class));
>-            verify(handler,
>times(5)).endElement(XHTMLContentHandler.XHTML, "p", "p");
>+            int bodyExpectedTimes = 4, multipackExpectedTimes = 5;;
>+            int invokingTimes = bodyExpectedTimes;
>+            TesseractOCRConfig config = new TesseractOCRConfig();
>+            if (TesseractOCRParserTest.canRun(config)) {
>+              invokingTimes = multipackExpectedTimes;
>+            }
>+            
>+            verify(handler,
>times(invokingTimes)).startElement(eq(XHTMLContentHandler.XHTML),
>eq("div"), eq("div"), any(Attributes.class));
>+            verify(handler,
>times(invokingTimes)).endElement(XHTMLContentHandler.XHTML, "div", "div");
>+            verify(handler,
>times(multipackExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML),
> eq("p"), eq("p"), any(Attributes.class));
>+            verify(handler,
>times(multipackExpectedTimes)).endElement(XHTMLContentHandler.XHTML, "p",
>"p");
>             verify(handler).endDocument();
>+            
>         } catch (Exception e) {
>             fail("Exception thrown: " + e.getMessage());
>         }
>
>