You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2015/01/07 17:48:43 UTC
svn commit: r1650117 -
/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
Author: tallison
Date: Wed Jan 7 16:48:43 2015
New Revision: 1650117
URL: http://svn.apache.org/r1650117
Log:
TIKA-1445: add tests to TesseractOCRParserTest to ensure metadata is extracted
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java?rev=1650117&r1=1650116&r2=1650117&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java Wed Jan 7 16:48:43 2015
@@ -22,6 +22,7 @@ import static org.junit.Assert.assertTru
import static org.junit.Assume.assumeTrue;
import java.io.InputStream;
+import java.util.List;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
@@ -30,11 +31,14 @@ import org.apache.tika.parser.AutoDetect
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.parser.image.ImageParser;
import org.apache.tika.parser.pdf.PDFParserConfig;
+import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
+import org.xml.sax.helpers.DefaultHandler;
public class TesseractOCRParserTest extends TikaTest {
@@ -49,151 +53,148 @@ public class TesseractOCRParserTest exte
// If Tesseract is not on the path, do not run the test.
return ExternalParser.check(checkCmd);
}
-
+
@Test
public void offersNoTypesIfNotFound() throws Exception {
TesseractOCRParser parser = new TesseractOCRParser();
DefaultParser defaultParser = new DefaultParser();
MediaType png = MediaType.image("png");
-
+
// With an invalid path, will offer no types
TesseractOCRConfig invalidConfig = new TesseractOCRConfig();
invalidConfig.setTesseractPath("/made/up/path");
-
+
ParseContext parseContext = new ParseContext();
parseContext.set(TesseractOCRConfig.class, invalidConfig);
// No types offered
assertEquals(0, parser.getSupportedTypes(parseContext).size());
-
+
// And DefaultParser won't use us
assertEquals(ImageParser.class, defaultParser.getParsers(parseContext).get(png).getClass());
-
-
+
+
// With a correct path, will offer the usual types
TesseractOCRConfig normalConfig = new TesseractOCRConfig();
assumeTrue(canRun(normalConfig));
parseContext.set(TesseractOCRConfig.class, normalConfig);
-
+
assertEquals(5, parser.getSupportedTypes(parseContext).size());
assertTrue(parser.getSupportedTypes(parseContext).contains(png));
-
+
// DefaultParser now will
assertEquals(TesseractOCRParser.class, defaultParser.getParsers(parseContext).get(png).getClass());
}
@Test
public void testPDFOCR() throws Exception {
- TesseractOCRConfig config = new TesseractOCRConfig();
- assumeTrue(canRun(config));
-
- Parser parser = new AutoDetectParser();
- BodyContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- PDFParserConfig pdfConfig = new PDFParserConfig();
- pdfConfig.setExtractInlineImages(true);
-
- ParseContext parseContext = new ParseContext();
- parseContext.set(TesseractOCRConfig.class, config);
- parseContext.set(Parser.class, new TesseractOCRParser());
- parseContext.set(PDFParserConfig.class, pdfConfig);
-
- InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(
- "/test-documents/testOCR.pdf");
-
- try {
- parser.parse(stream, handler, metadata, parseContext);
- assertContains("Happy New Year 2003!", handler.toString());
- } finally {
- stream.close();
- }
+ String resource = "/test-documents/testOCR.pdf";
+ String[] nonOCRContains = new String[0];
+ testBasicOCR(resource, nonOCRContains, 2);
}
@Test
public void testDOCXOCR() throws Exception {
- TesseractOCRConfig config = new TesseractOCRConfig();
- assumeTrue(canRun(config));
-
- Parser parser = new AutoDetectParser();
- BodyContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- ParseContext parseContext = new ParseContext();
- parseContext.set(TesseractOCRConfig.class, config);
- parseContext.set(Parser.class, new TesseractOCRParser());
-
- InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(
- "/test-documents/testOCR.docx");
-
- try {
- parser.parse(stream, handler, metadata, parseContext);
-
- assertContains("Happy New Year 2003!", handler.toString());
- assertContains("This is some text.", handler.toString());
- assertContains("Here is an embedded image:", handler.toString());
- } finally {
- stream.close();
- }
+ String resource = "/test-documents/testOCR.docx";
+ String[] nonOCRContains = {
+ "This is some text.",
+ "Here is an embedded image:"
+ };
+ testBasicOCR(resource, nonOCRContains, 3);
}
@Test
public void testPPTXOCR() throws Exception {
+ String resource = "/test-documents/testOCR.pptx";
+ String[] nonOCRContains = {
+ "This is some text"
+ };
+ testBasicOCR(resource, nonOCRContains, 3);
+ }
+
+ private void testBasicOCR(String resource, String[] nonOCRContains, int numMetadatas) throws Exception {
TesseractOCRConfig config = new TesseractOCRConfig();
- assumeTrue(canRun(config));
+ Parser parser = new RecursiveParserWrapper(new AutoDetectParser(),
+ new BasicContentHandlerFactory(
+ BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
- Parser parser = new AutoDetectParser();
- BodyContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
+ PDFParserConfig pdfConfig = new PDFParserConfig();
+ pdfConfig.setExtractInlineImages(true);
ParseContext parseContext = new ParseContext();
parseContext.set(TesseractOCRConfig.class, config);
- parseContext.set(Parser.class, new TesseractOCRParser());
+ parseContext.set(Parser.class, parser);
+ parseContext.set(PDFParserConfig.class, pdfConfig);
InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(
- "/test-documents/testOCR.pptx");
+ resource);
try {
- parser.parse(stream, handler, metadata, parseContext);
-
- assertTrue("Check for the image's text.", handler.toString().contains("Happy New Year 2003!"));
- assertTrue("Check for the standard text.", handler.toString().contains("This is some text"));
+ parser.parse(stream, new DefaultHandler(), new Metadata(), parseContext);
} finally {
stream.close();
}
+ List<Metadata> metadataList = ((RecursiveParserWrapper) parser).getMetadata();
+ assertEquals(numMetadatas, metadataList.size());
+
+ StringBuilder contents = new StringBuilder();
+ for (Metadata m : metadataList) {
+ contents.append(m.get(RecursiveParserWrapper.TIKA_CONTENT));
+ }
+ if (canRun()) {
+ assertTrue(contents.toString().contains("Happy New Year 2003!"));
+ }
+ for (String needle : nonOCRContains) {
+ assertContains(needle, contents.toString());
+ }
+ assertTrue(metadataList.get(0).names().length > 10);
+ assertTrue(metadataList.get(1).names().length > 10);
+ //test at least one value
+ assertEquals("deflate", metadataList.get(1).get("Compression CompressionTypeName"));
}
-
+
@Test
- public void getNormalMetadataToo() throws Exception {
+ public void testSingleImage() throws Exception {
TesseractOCRConfig config = new TesseractOCRConfig();
assumeTrue(canRun(config));
+ String xml = getXML("testOCR.jpg").xml;
+ assertContains("OCR Testing", xml);
+ }
- Parser parser = new AutoDetectParser();
- BodyContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- ParseContext parseContext = new ParseContext();
- parseContext.set(TesseractOCRConfig.class, config);
- parseContext.set(Parser.class, new TesseractOCRParser());
-
- InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(
- "/test-documents/testOCR.jpg");
-
- try {
- parser.parse(stream, handler, metadata, parseContext);
-
- // OCR text
- assertContains("Apache", handler.toString());
- assertContains("OCR Testing", handler.toString());
-
- // Core JPEG properties from JPEGParser should still come through
- assertEquals("136", metadata.get(Metadata.IMAGE_WIDTH));
- assertEquals("66", metadata.get(Metadata.IMAGE_LENGTH));
- assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
- assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
- assertContains("This is a test Apache Tika imag", metadata.get(Metadata.COMMENTS));
- } finally {
- stream.close();
- }
+ @Test
+ public void getNormalMetadataToo() throws Exception {
+ //this should be successful whether or not TesseractOCR is installed/active
+ //If tesseract is installed, the internal metadata extraction parser should
+ //work; and if tesseract isn't installed, the regular parsers should take over.
+
+ //gif
+ Metadata m = getXML("testGIF.gif").metadata;
+ assertTrue(m.names().length > 20);
+ assertEquals("RGB", m.get("Chroma ColorSpaceType"));
+
+ //jpg
+ m = getXML("testOCR.jpg").metadata;
+ assertEquals("136", m.get(Metadata.IMAGE_WIDTH));
+ assertEquals("66", m.get(Metadata.IMAGE_LENGTH));
+ assertEquals("8", m.get(Metadata.BITS_PER_SAMPLE));
+ assertEquals(null, m.get(Metadata.SAMPLES_PER_PIXEL));
+ assertContains("This is a test Apache Tika imag", m.get(Metadata.COMMENTS));
+
+ //bmp
+ m = getXML("testBMP.bmp").metadata;
+ assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
+ assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
+
+ //png
+ m = getXML("testPNG.png").metadata;
+ assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
+ assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
+ assertEquals("UnsignedIntegral", m.get("Data SampleFormat"));
+
+ //tiff
+ m = getXML("testTIFF.tif").metadata;
+ assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
+ assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
+ assertEquals("72 dots per inch", m.get("Y Resolution"));
}
}