You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2014/10/12 18:30:37 UTC

svn commit: r1631206 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java

Author: mattmann
Date: Sun Oct 12 16:30:37 2014
New Revision: 1631206

URL: http://svn.apache.org/r1631206
Log:
Fix for TIKA-1422 contributed by tpalsulich and mattmann.

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java?rev=1631206&r1=1631205&r2=1631206&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java Sun Oct 12 16:30:37 2014
@@ -29,6 +29,8 @@ import java.io.Reader;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
+import java.util.List;
+import java.util.ArrayList;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.FutureTask;
@@ -43,6 +45,7 @@ import org.apache.tika.io.TemporaryResou
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.external.ExternalParser;
@@ -97,7 +100,7 @@ public class TesseractOCRParser extends 
 	
 	public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
-		
+
 		TemporaryResources tmp = new TemporaryResources();
 		FileOutputStream fos = null;
 		TikaInputStream tis = null;
@@ -131,6 +134,7 @@ public class TesseractOCRParser extends 
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
+
     	TesseractOCRConfig config = context.get(TesseractOCRConfig.class);
     	if(config == null) config = new TesseractOCRConfig();
 
@@ -139,8 +143,7 @@ public class TesseractOCRParser extends 
         if (!ExternalParser.check(checkCmd)) return;
     	
     	XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-    	xhtml.startDocument();
-    	
+
         TemporaryResources tmp = new TemporaryResources();
         File output = null;
         try {
@@ -167,7 +170,6 @@ public class TesseractOCRParser extends 
         		output.delete();
             
         }
-        xhtml.endDocument();
     }
 
 	/**
@@ -241,19 +243,21 @@ public class TesseractOCRParser extends 
      * @throws IOException if an input error occurred
      */
     private void extractOutput(InputStream stream, XHTMLContentHandler xhtml)
-            throws SAXException, IOException {
-    	
+	throws SAXException, IOException {
+ 
         Reader reader = new InputStreamReader(stream, "UTF-8");
+        xhtml.startDocument();
+        xhtml.startElement("div");
         try {
-            xhtml.startElement("div");
             char[] buffer = new char[1024];
             for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
-                xhtml.characters(buffer, 0, n);
+                if (n > 0) xhtml.characters(buffer, 0, n);
             }
-            xhtml.endElement("div");
         } finally {
             reader.close();
         }
+        xhtml.endElement("div");
+        xhtml.endDocument();
     }
 
     /**

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java?rev=1631206&r1=1631205&r2=1631206&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java Sun Oct 12 16:30:37 2014
@@ -33,6 +33,12 @@ import static org.junit.Assume.assumeTru
 
 public class TesseractOCRTest  extends TikaTest {
 
+    public static boolean canRun() {
+        TesseractOCRConfig config = new TesseractOCRConfig();
+        TesseractOCRTest tesseractOCRTest = new TesseractOCRTest();
+        return tesseractOCRTest.canRun(config);
+    }
+
     private boolean canRun(TesseractOCRConfig config) {
         String[] checkCmd = {config.getTesseractPath() + "tesseract"};
         // If Tesseract is not on the path, do not run the test.