You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2014/10/12 18:30:37 UTC
svn commit: r1631206 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java
Author: mattmann
Date: Sun Oct 12 16:30:37 2014
New Revision: 1631206
URL: http://svn.apache.org/r1631206
Log:
Fix for TIKA-1422 contributed by tpalsulich and mattmann.
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java?rev=1631206&r1=1631205&r2=1631206&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java Sun Oct 12 16:30:37 2014
@@ -29,6 +29,8 @@ import java.io.Reader;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
+import java.util.List;
+import java.util.ArrayList;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.FutureTask;
@@ -43,6 +45,7 @@ import org.apache.tika.io.TemporaryResou
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.Parser;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.external.ExternalParser;
@@ -97,7 +100,7 @@ public class TesseractOCRParser extends
public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
-
+
TemporaryResources tmp = new TemporaryResources();
FileOutputStream fos = null;
TikaInputStream tis = null;
@@ -131,6 +134,7 @@ public class TesseractOCRParser extends
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
+
TesseractOCRConfig config = context.get(TesseractOCRConfig.class);
if(config == null) config = new TesseractOCRConfig();
@@ -139,8 +143,7 @@ public class TesseractOCRParser extends
if (!ExternalParser.check(checkCmd)) return;
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
-
+
TemporaryResources tmp = new TemporaryResources();
File output = null;
try {
@@ -167,7 +170,6 @@ public class TesseractOCRParser extends
output.delete();
}
- xhtml.endDocument();
}
/**
@@ -241,19 +243,21 @@ public class TesseractOCRParser extends
* @throws IOException if an input error occurred
*/
private void extractOutput(InputStream stream, XHTMLContentHandler xhtml)
- throws SAXException, IOException {
-
+ throws SAXException, IOException {
+
Reader reader = new InputStreamReader(stream, "UTF-8");
+ xhtml.startDocument();
+ xhtml.startElement("div");
try {
- xhtml.startElement("div");
char[] buffer = new char[1024];
for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
- xhtml.characters(buffer, 0, n);
+ if (n > 0) xhtml.characters(buffer, 0, n);
}
- xhtml.endElement("div");
} finally {
reader.close();
}
+ xhtml.endElement("div");
+ xhtml.endDocument();
}
/**
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java?rev=1631206&r1=1631205&r2=1631206&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java Sun Oct 12 16:30:37 2014
@@ -33,6 +33,12 @@ import static org.junit.Assume.assumeTru
public class TesseractOCRTest extends TikaTest {
+ public static boolean canRun() {
+ TesseractOCRConfig config = new TesseractOCRConfig();
+ TesseractOCRTest tesseractOCRTest = new TesseractOCRTest();
+ return tesseractOCRTest.canRun(config);
+ }
+
private boolean canRun(TesseractOCRConfig config) {
String[] checkCmd = {config.getTesseractPath() + "tesseract"};
// If Tesseract is not on the path, do not run the test.