You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/09/23 01:13:08 UTC
[1/2] tika git commit: add hOCR output format to TesseractParser
Repository: tika
Updated Branches:
refs/heads/master 8a45f67a2 -> 3a5431e20
add hOCR output format to TesseractParser
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/10507d05
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/10507d05
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/10507d05
Branch: refs/heads/master
Commit: 10507d0521a0f06c50f32aa6150228ef4ac773d4
Parents: 8a45f67
Author: Eric Pugh <ep...@o19s.com>
Authored: Thu Sep 22 13:14:55 2016 -0400
Committer: Eric Pugh <ep...@o19s.com>
Committed: Thu Sep 22 13:14:55 2016 -0400
----------------------------------------------------------------------
.../tika/parser/ocr/TesseractOCRConfig.java | 18 +++++++++++
.../tika/parser/ocr/TesseractOCRParser.java | 6 ++--
.../parser/ocr/TesseractOCRConfig.properties | 1 +
.../tika/parser/ocr/TesseractOCRParserTest.java | 34 +++++++++++++++-----
4 files changed, 48 insertions(+), 11 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/10507d05/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index 84312d8..7b266f1 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -62,6 +62,9 @@ public class TesseractOCRConfig implements Serializable{
// Maximum time (seconds) to wait for the ocring process termination
private int timeout = 120;
+
+ // The format of the ocr'ed output to be returned, txt or hocr.
+ private String outputType = "txt";
// enable image processing (optional)
private int enableImageProcessing = 0;
@@ -135,6 +138,8 @@ public class TesseractOCRConfig implements Serializable{
getProp(props, "maxFileSizeToOcr", getMaxFileSizeToOcr()));
setTimeout(
getProp(props, "timeout", getTimeout()));
+ setOutputType(
+ getProp(props, "outputType", getOutputType()));
// set parameters for ImageMagick
setEnableImageProcessing(
@@ -261,6 +266,19 @@ public class TesseractOCRConfig implements Serializable{
public int getTimeout() {
return timeout;
}
+
+ /**
+ * Set output type from ocr process. Default is "txt", but can be "hocr".
+ * Default value is 120s.
+ */
+ public void setOutputType(String outputType) {
+ this.outputType = outputType;
+ }
+
+ /** @see #setOutputType(String outputType) */
+ public String getOutputType() {
+ return outputType;
+ }
/** @see #setEnableImageProcessing(boolean)
* @return image processing is enabled or not */
http://git-wip-us.apache.org/repos/asf/tika/blob/10507d05/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index e0f0d2b..ccf21cb 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -328,8 +328,8 @@ public class TesseractOCRParser extends AbstractParser {
doOCR(tmpFile, tmpImgFile, config);
- // Tesseract appends .txt to output file name
- tmpTxtOutput = new File(tmpImgFile.getAbsolutePath() + ".txt");
+ // Tesseract appends the output type (.txt or .hocr) to output file name
+ tmpTxtOutput = new File(tmpImgFile.getAbsolutePath() + "." + config.getOutputType());
if (tmpTxtOutput.exists()) {
try (InputStream is = new FileInputStream(tmpTxtOutput)) {
@@ -375,7 +375,7 @@ public class TesseractOCRParser extends AbstractParser {
*/
private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
- config.getLanguage(), "-psm", config.getPageSegMode() };
+ config.getLanguage(), "-psm", config.getPageSegMode(), config.getOutputType()};
ProcessBuilder pb = new ProcessBuilder(cmd);
setEnv(config, pb);
http://git-wip-us.apache.org/repos/asf/tika/blob/10507d05/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
index 7acc694..2380282 100644
--- a/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
+++ b/tika-parsers/src/main/resources/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
@@ -20,6 +20,7 @@ pageSegMode=1
maxFileSizeToOcr=2147483647
minFileSizeToOcr=0
timeout=120
+outputType=txt
# properties for image processing
# to enable processing, set enableImageProcessing to 1
http://git-wip-us.apache.org/repos/asf/tika/blob/10507d05/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index cc0288f..4490953 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -126,9 +126,31 @@ public class TesseractOCRParserTest extends TikaTest {
};
testBasicOCR(resource, nonOCRContains, 3);
}
+
+ @Test
+ public void testOCROutputsHOCR() throws Exception {
+ String resource = "/test-documents/testOCR.pdf";
+ String[] nonOCRContains = new String[0];
+ String contents = runOCR(resource, nonOCRContains, 2, "hocr");
+ assertTrue(contents.contains("<meta name='ocr-system' content='tesseract"));
+
+ }
- private void testBasicOCR(String resource, String[] nonOCRContains, int numMetadatas) throws Exception {
+ private void testBasicOCR(String resource, String[] nonOCRContains, int numMetadatas) throws Exception{
+ String contents = runOCR(resource, nonOCRContains, numMetadatas, "txt");
+ if (canRun()) {
+ if(resource.substring(resource.lastIndexOf('.'), resource.length()).equals(".jpg")) {
+ assertTrue(contents.toString().contains("Apache"));
+ } else {
+ assertTrue(contents.toString().contains("Happy New Year 2003!"));
+ }
+ }
+ }
+
+ private String runOCR(String resource, String[] nonOCRContains, int numMetadatas, String outputType) throws Exception {
TesseractOCRConfig config = new TesseractOCRConfig();
+ config.setOutputType(outputType);
+
Parser parser = new RecursiveParserWrapper(new AutoDetectParser(),
new BasicContentHandlerFactory(
BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
@@ -151,13 +173,7 @@ public class TesseractOCRParserTest extends TikaTest {
for (Metadata m : metadataList) {
contents.append(m.get(RecursiveParserWrapper.TIKA_CONTENT));
}
- if (canRun()) {
- if(resource.substring(resource.lastIndexOf('.'), resource.length()).equals(".jpg")) {
- assertTrue(contents.toString().contains("Apache"));
- } else {
- assertTrue(contents.toString().contains("Happy New Year 2003!"));
- }
- }
+
for (String needle : nonOCRContains) {
assertContains(needle, contents.toString());
}
@@ -165,6 +181,8 @@ public class TesseractOCRParserTest extends TikaTest {
assertTrue(metadataList.get(1).names().length > 10);
//test at least one value
assertEquals("deflate", metadataList.get(1).get("Compression CompressionTypeName"));
+
+ return contents.toString();
}
@Test
[2/2] tika git commit: TIKA-2093 -- add option for Tesseract's hOCR
output, thanks to Eric Pugh! This closes #133.
Posted by ta...@apache.org.
TIKA-2093 -- add option for Tesseract's hOCR output, thanks to Eric Pugh! This closes #133.
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/3a5431e2
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/3a5431e2
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/3a5431e2
Branch: refs/heads/master
Commit: 3a5431e200056d85b458bea766fd185225771c97
Parents: 10507d0
Author: tballison <ta...@mitre.org>
Authored: Thu Sep 22 21:12:44 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Thu Sep 22 21:12:44 2016 -0400
----------------------------------------------------------------------
CHANGES.txt | 3 +
.../tika/parser/ocr/TesseractOCRConfig.java | 27 +++--
.../tika/parser/ocr/TesseractOCRParser.java | 117 ++++++++++++++++---
.../tika/parser/ocr/TesseractOCRParserTest.java | 23 ++--
4 files changed, 140 insertions(+), 30 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/3a5431e2/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 9a03b01..ef82775 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 1.14 - ???
+ * Add Tesseract's hOCR output format as an option, via Eric Pugh
+ (TIKA-2093)
+
* Extract macros from MSOffice files (TIKA-2069).
* Maintain passed-in mime in TXTParser (TIKA-2047).
http://git-wip-us.apache.org/repos/asf/tika/blob/3a5431e2/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index 7b266f1..7d6cd3f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -42,6 +42,11 @@ public class TesseractOCRConfig implements Serializable{
private static final long serialVersionUID = -4861942486845757891L;
+ public enum OUTPUT_TYPE {
+ TXT,
+ HOCR
+ }
+
// Path to tesseract installation folder, if not on system path.
private String tesseractPath = "";
@@ -64,7 +69,7 @@ public class TesseractOCRConfig implements Serializable{
private int timeout = 120;
// The format of the ocr'ed output to be returned, txt or hocr.
- private String outputType = "txt";
+ private OUTPUT_TYPE outputType = OUTPUT_TYPE.TXT;
// enable image processing (optional)
private int enableImageProcessing = 0;
@@ -138,9 +143,13 @@ public class TesseractOCRConfig implements Serializable{
getProp(props, "maxFileSizeToOcr", getMaxFileSizeToOcr()));
setTimeout(
getProp(props, "timeout", getTimeout()));
- setOutputType(
- getProp(props, "outputType", getOutputType()));
-
+ String outputTypeString = props.getProperty("outputType");
+ if ("txt".equals(outputTypeString)) {
+ setOutputType(OUTPUT_TYPE.TXT);
+ } else if ("hocr".equals(outputTypeString)) {
+ setOutputType(OUTPUT_TYPE.HOCR);
+ }
+
// set parameters for ImageMagick
setEnableImageProcessing(
getProp(props, "enableImageProcessing", isEnableImageProcessing()));
@@ -271,16 +280,16 @@ public class TesseractOCRConfig implements Serializable{
* Set output type from ocr process. Default is "txt", but can be "hocr".
* Default value is 120s.
*/
- public void setOutputType(String outputType) {
+ public void setOutputType(OUTPUT_TYPE outputType) {
this.outputType = outputType;
}
- /** @see #setOutputType(String outputType) */
- public String getOutputType() {
+ /** @see #setOutputType(OUTPUT_TYPE outputType) */
+ public OUTPUT_TYPE getOutputType() {
return outputType;
}
- /** @see #setEnableImageProcessing(boolean)
+ /** @see #setEnableImageProcessing(int)
* @return image processing is enabled or not */
public int isEnableImageProcessing() {
return enableImageProcessing;
@@ -411,7 +420,7 @@ public class TesseractOCRConfig implements Serializable{
/**
* Set the path to the ImageMagick executable, needed if it is not on system path.
- * @param path to ImageMagick file.
+ * @param ImageMagickPath path to ImageMagick file.
*/
public void setImageMagickPath(String ImageMagickPath) {
if(!ImageMagickPath.isEmpty() && !ImageMagickPath.endsWith(File.separator))
http://git-wip-us.apache.org/repos/asf/tika/blob/3a5431e2/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index ccf21cb..36c831b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -16,8 +16,10 @@
*/
package org.apache.tika.parser.ocr;
-import javax.imageio.ImageIO;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import javax.imageio.ImageIO;
+import javax.xml.parsers.SAXParser;
import java.awt.Image;
import java.awt.image.BufferedImage;
import java.io.BufferedReader;
@@ -36,6 +38,7 @@ import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
@@ -65,11 +68,12 @@ import org.apache.tika.parser.image.ImageParser;
import org.apache.tika.parser.image.TiffParser;
import org.apache.tika.parser.jpeg.JpegParser;
import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
+import org.xml.sax.helpers.DefaultHandler;
/**
* TesseractOCRParser powered by tesseract-ocr engine. To enable this parser,
@@ -95,6 +99,8 @@ public class TesseractOCRParser extends AbstractParser {
})));
private static Map<String,Boolean> TESSERACT_PRESENT = new HashMap<String, Boolean>();
+
+
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
// If Tesseract is installed, offer our supported image types
@@ -127,7 +133,6 @@ public class TesseractOCRParser extends AbstractParser {
if (TESSERACT_PRESENT.containsKey(tesseract)) {
return TESSERACT_PRESENT.get(tesseract);
}
-
// Try running Tesseract from there, and see if it exists + works
String[] checkCmd = { tesseract };
boolean hasTesseract = ExternalParser.check(checkCmd);
@@ -199,9 +204,10 @@ public class TesseractOCRParser extends AbstractParser {
}
@Override
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext parseContext)
throws IOException, SAXException, TikaException {
- TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
+
+ TesseractOCRConfig config = parseContext.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
// If Tesseract is not on the path with the current config, do not try to run OCR
// getSupportedTypes shouldn't have listed us as handling it, so this should only
// occur if someone directly calls this parser, not via DefaultParser or similar
@@ -215,12 +221,12 @@ public class TesseractOCRParser extends AbstractParser {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
File tmpImgFile = tmp.createTemporaryFile();
- parse(tikaStream, tmpImgFile, xhtml, config);
+ parse(tikaStream, tmpImgFile, parseContext, xhtml, config);
// Temporary workaround for TIKA-1445 - until we can specify
// composite parsers with strategies (eg Composite, Try In Turn),
// always send the image onwards to the regular parser to have
// the metadata for them extracted as well
- _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new EmbeddedContentHandler(xhtml), metadata, context);
+ _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new EmbeddedContentHandler(xhtml), metadata, parseContext);
xhtml.endDocument();
} finally {
tmp.dispose();
@@ -230,15 +236,37 @@ public class TesseractOCRParser extends AbstractParser {
/**
* Use this to parse content without starting a new document.
* This appends SAX events to xhtml without re-adding the metadata, body start, etc.
+ *
* @param stream inputstream
* @param xhtml handler
* @param config TesseractOCRConfig to use for this parse
* @throws IOException
* @throws SAXException
* @throws TikaException
+ *
+ * @deprecated use {@link #parseInline(InputStream, XHTMLContentHandler, ParseContext, TesseractOCRConfig)}
*/
public void parseInline(InputStream stream, XHTMLContentHandler xhtml, TesseractOCRConfig config)
throws IOException, SAXException, TikaException {
+ parseInline(stream, xhtml, new ParseContext(), config);
+ }
+
+ /**
+ * Use this to parse content without starting a new document.
+ * This appends SAX events to xhtml without re-adding the metadata, body start, etc.
+ *
+ * @param stream inputstream
+ * @param xhtml handler
+ * @param parseContext context for this parse; supplies the SAXParser used to read hOCR output
+ * @param config TesseractOCRConfig to use for this parse
+ * @throws IOException
+ * @throws SAXException
+ * @throws TikaException
+ *
+ */
+ public void parseInline(InputStream stream, XHTMLContentHandler xhtml, ParseContext parseContext,
+ TesseractOCRConfig config)
+ throws IOException, SAXException, TikaException {
// If Tesseract is not on the path with the current config, do not try to run OCR
// getSupportedTypes shouldn't have listed us as handling it, so this should only
// occur if someone directly calls this parser, not via DefaultParser or similar
@@ -249,7 +277,7 @@ public class TesseractOCRParser extends AbstractParser {
try {
TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
File tmpImgFile = tmp.createTemporaryFile();
- parse(tikaStream, tmpImgFile, xhtml, config);
+ parse(tikaStream, tmpImgFile, parseContext, xhtml, config);
} finally {
tmp.dispose();
}
@@ -305,10 +333,10 @@ public class TesseractOCRParser extends AbstractParser {
tmp.close();
}
- private void parse(TikaInputStream tikaInputStream, File tmpImgFile, XHTMLContentHandler xhtml, TesseractOCRConfig config)
+ private void parse(TikaInputStream tikaInputStream, File tmpImgFile, ParseContext parseContext,
+ XHTMLContentHandler xhtml, TesseractOCRConfig config)
throws IOException, SAXException, TikaException {
File tmpTxtOutput = null;
-
try {
File input = tikaInputStream.getFile();
long size = tikaInputStream.getLength();
@@ -333,7 +361,11 @@ public class TesseractOCRParser extends AbstractParser {
if (tmpTxtOutput.exists()) {
try (InputStream is = new FileInputStream(tmpTxtOutput)) {
- extractOutput(is, xhtml);
+ if (config.getOutputType().equals(TesseractOCRConfig.OUTPUT_TYPE.HOCR)) {
+ extractHOCROutput(is, parseContext, xhtml);
+ } else {
+ extractOutput(is, xhtml);
+ }
}
}
@@ -347,6 +379,7 @@ public class TesseractOCRParser extends AbstractParser {
}
}
+
// TIKA-1445 workaround parser
private static Parser _TMP_IMAGE_METADATA_PARSER = new CompositeImageParser();
private static class CompositeImageParser extends CompositeParser {
@@ -375,7 +408,7 @@ public class TesseractOCRParser extends AbstractParser {
*/
private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
- config.getLanguage(), "-psm", config.getPageSegMode(), config.getOutputType()};
+ config.getLanguage(), "-psm", config.getPageSegMode(), config.getOutputType().name().toLowerCase(Locale.US)};
ProcessBuilder pb = new ProcessBuilder(cmd);
setEnv(config, pb);
@@ -441,7 +474,17 @@ public class TesseractOCRParser extends AbstractParser {
}
}
xhtml.endElement("div");
+ }
+ private void extractHOCROutput(InputStream is, ParseContext parseContext,
+ XHTMLContentHandler xhtml) throws TikaException, IOException, SAXException {
+ if (parseContext == null) {
+ parseContext = new ParseContext();
+ }
+ SAXParser parser = parseContext.getSAXParser();
+ xhtml.startElement("div", "class", "ocr");
+ parser.parse(is, new OfflineContentHandler(new HOCRPassThroughHandler(xhtml)));
+ xhtml.endElement("div");
}
/**
@@ -477,5 +520,53 @@ public class TesseractOCRParser extends AbstractParser {
static String getImageMagickProg() {
return System.getProperty("os.name").startsWith("Windows") ? "convert.exe" : "convert";
}
+
+
+ private static class HOCRPassThroughHandler extends DefaultHandler {
+ private final ContentHandler xhtml;
+ public static final Set<String> IGNORE = unmodifiableSet(
+ "html", "head", "title", "meta", "body");
+
+ public HOCRPassThroughHandler(ContentHandler xhtml) {
+ this.xhtml = xhtml;
+ }
+
+ /**
+ * Forwards the start of the given element to the wrapped handler,
+ * unless the element name is listed in {@link #IGNORE}.
+ */
+ @Override
+ public void startElement(
+ String uri, String local, String name, Attributes attributes)
+ throws SAXException {
+ if (!IGNORE.contains(name)) {
+ xhtml.startElement(uri, local, name, attributes);
+ }
+ }
+
+ /**
+ * Forwards the end of the given element to the wrapped handler,
+ * unless the element name is listed in {@link #IGNORE}.
+ */
+ @Override
+ public void endElement(String uri, String local, String name) throws SAXException {
+ if (!IGNORE.contains(name)) {
+ xhtml.endElement(uri, local, name);
+ }
+ }
+
+ /**
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a>
+ */
+ @Override
+ public void characters(char[] ch, int start, int length) throws SAXException {
+ xhtml.characters(ch, start, length);
+ }
+
+ private static Set<String> unmodifiableSet(String... elements) {
+ return Collections.unmodifiableSet(
+ new HashSet<String>(Arrays.asList(elements)));
+ }
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/3a5431e2/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 4490953..b81ded3 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -21,10 +21,7 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assume.assumeTrue;
-import java.io.BufferedReader;
-import java.io.File;
import java.io.InputStream;
-import java.io.InputStreamReader;
import java.util.List;
import org.apache.tika.TikaTest;
@@ -129,15 +126,23 @@ public class TesseractOCRParserTest extends TikaTest {
@Test
public void testOCROutputsHOCR() throws Exception {
+ assumeTrue(canRun());
+
String resource = "/test-documents/testOCR.pdf";
+
String[] nonOCRContains = new String[0];
- String contents = runOCR(resource, nonOCRContains, 2, "hocr");
- assertTrue(contents.contains("<meta name='ocr-system' content='tesseract"));
+ String contents = runOCR(resource, nonOCRContains, 2,
+ BasicContentHandlerFactory.HANDLER_TYPE.XML,
+ TesseractOCRConfig.OUTPUT_TYPE.HOCR);
+
+ assertContains("<span class=\"ocrx_word\" id=\"word_1_1\"", contents);
+ assertContains("Happy</span>", contents);
}
private void testBasicOCR(String resource, String[] nonOCRContains, int numMetadatas) throws Exception{
- String contents = runOCR(resource, nonOCRContains, numMetadatas, "txt");
+ String contents = runOCR(resource, nonOCRContains, numMetadatas,
+ BasicContentHandlerFactory.HANDLER_TYPE.TEXT, TesseractOCRConfig.OUTPUT_TYPE.TXT);
if (canRun()) {
if(resource.substring(resource.lastIndexOf('.'), resource.length()).equals(".jpg")) {
assertTrue(contents.toString().contains("Apache"));
@@ -147,13 +152,15 @@ public class TesseractOCRParserTest extends TikaTest {
}
}
- private String runOCR(String resource, String[] nonOCRContains, int numMetadatas, String outputType) throws Exception {
+ private String runOCR(String resource, String[] nonOCRContains, int numMetadatas,
+ BasicContentHandlerFactory.HANDLER_TYPE handlerType,
+ TesseractOCRConfig.OUTPUT_TYPE outputType) throws Exception {
TesseractOCRConfig config = new TesseractOCRConfig();
config.setOutputType(outputType);
Parser parser = new RecursiveParserWrapper(new AutoDetectParser(),
new BasicContentHandlerFactory(
- BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
+ handlerType, -1));
PDFParserConfig pdfConfig = new PDFParserConfig();
pdfConfig.setExtractInlineImages(true);