You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2015/01/07 17:48:43 UTC

svn commit: r1650117 - /tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java

Author: tallison
Date: Wed Jan  7 16:48:43 2015
New Revision: 1650117

URL: http://svn.apache.org/r1650117
Log:
TIKA-1445: add tests to TesseractOCRParserTest to ensure metadata is extracted

Modified:
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java?rev=1650117&r1=1650116&r2=1650117&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java Wed Jan  7 16:48:43 2015
@@ -22,6 +22,7 @@ import static org.junit.Assert.assertTru
 import static org.junit.Assume.assumeTrue;
 
 import java.io.InputStream;
+import java.util.List;
 
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
@@ -30,11 +31,14 @@ import org.apache.tika.parser.AutoDetect
 import org.apache.tika.parser.DefaultParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.parser.external.ExternalParser;
 import org.apache.tika.parser.image.ImageParser;
 import org.apache.tika.parser.pdf.PDFParserConfig;
+import org.apache.tika.sax.BasicContentHandlerFactory;
 import org.apache.tika.sax.BodyContentHandler;
 import org.junit.Test;
+import org.xml.sax.helpers.DefaultHandler;
 
 public class TesseractOCRParserTest extends TikaTest {
 
@@ -49,151 +53,148 @@ public class TesseractOCRParserTest exte
         // If Tesseract is not on the path, do not run the test.
         return ExternalParser.check(checkCmd);
     }
-    
+
     @Test
     public void offersNoTypesIfNotFound() throws Exception {
         TesseractOCRParser parser = new TesseractOCRParser();
         DefaultParser defaultParser = new DefaultParser();
         MediaType png = MediaType.image("png");
-        
+
         // With an invalid path, will offer no types
         TesseractOCRConfig invalidConfig = new TesseractOCRConfig();
         invalidConfig.setTesseractPath("/made/up/path");
-        
+
         ParseContext parseContext = new ParseContext();
         parseContext.set(TesseractOCRConfig.class, invalidConfig);
 
         // No types offered
         assertEquals(0, parser.getSupportedTypes(parseContext).size());
-        
+
         // And DefaultParser won't use us
         assertEquals(ImageParser.class, defaultParser.getParsers(parseContext).get(png).getClass());
-        
-        
+
+
         // With a correct path, with offer the usual types
         TesseractOCRConfig normalConfig = new TesseractOCRConfig();
         assumeTrue(canRun(normalConfig));
         parseContext.set(TesseractOCRConfig.class, normalConfig);
-        
+
         assertEquals(5, parser.getSupportedTypes(parseContext).size());
         assertTrue(parser.getSupportedTypes(parseContext).contains(png));
-        
+
         // DefaultParser now will
         assertEquals(TesseractOCRParser.class, defaultParser.getParsers(parseContext).get(png).getClass());
     }
 
     @Test
     public void testPDFOCR() throws Exception {
-        TesseractOCRConfig config = new TesseractOCRConfig();
-        assumeTrue(canRun(config));
-
-        Parser parser = new AutoDetectParser();
-        BodyContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        PDFParserConfig pdfConfig = new PDFParserConfig();
-        pdfConfig.setExtractInlineImages(true);
-
-        ParseContext parseContext = new ParseContext();
-        parseContext.set(TesseractOCRConfig.class, config);
-        parseContext.set(Parser.class, new TesseractOCRParser());
-        parseContext.set(PDFParserConfig.class, pdfConfig);
-
-        InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(
-                "/test-documents/testOCR.pdf");
-
-        try {
-            parser.parse(stream, handler, metadata, parseContext);
-            assertContains("Happy New Year 2003!", handler.toString());
-        } finally {
-            stream.close();
-        }
+        String resource = "/test-documents/testOCR.pdf";
+        String[] nonOCRContains = new String[0];
+        testBasicOCR(resource, nonOCRContains, 2);
     }
 
     @Test
     public void testDOCXOCR() throws Exception {
-        TesseractOCRConfig config = new TesseractOCRConfig();
-        assumeTrue(canRun(config));
-
-        Parser parser = new AutoDetectParser();
-        BodyContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        ParseContext parseContext = new ParseContext();
-        parseContext.set(TesseractOCRConfig.class, config);
-        parseContext.set(Parser.class, new TesseractOCRParser());
-
-        InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(
-                "/test-documents/testOCR.docx");
-
-        try {
-            parser.parse(stream, handler, metadata, parseContext);
-
-            assertContains("Happy New Year 2003!", handler.toString());
-            assertContains("This is some text.", handler.toString());
-            assertContains("Here is an embedded image:", handler.toString());
-        } finally {
-            stream.close();
-        }
+        String resource = "/test-documents/testOCR.docx";
+        String[] nonOCRContains = {
+                "This is some text.",
+                "Here is an embedded image:"
+        };
+        testBasicOCR(resource, nonOCRContains, 3);
     }
 
     @Test
     public void testPPTXOCR() throws Exception {
+        String resource = "/test-documents/testOCR.pptx";
+        String[] nonOCRContains = {
+                "This is some text"
+        };
+        testBasicOCR(resource, nonOCRContains, 3);
+    }
+
+    private void testBasicOCR(String resource, String[] nonOCRContains, int numMetadatas) throws Exception {
         TesseractOCRConfig config = new TesseractOCRConfig();
-        assumeTrue(canRun(config));
+        Parser parser = new RecursiveParserWrapper(new AutoDetectParser(),
+                new BasicContentHandlerFactory(
+                        BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
 
-        Parser parser = new AutoDetectParser();
-        BodyContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
+        PDFParserConfig pdfConfig = new PDFParserConfig();
+        pdfConfig.setExtractInlineImages(true);
 
         ParseContext parseContext = new ParseContext();
         parseContext.set(TesseractOCRConfig.class, config);
-        parseContext.set(Parser.class, new TesseractOCRParser());
+        parseContext.set(Parser.class, parser);
+        parseContext.set(PDFParserConfig.class, pdfConfig);
 
         InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(
-                "/test-documents/testOCR.pptx");
+                resource);
 
         try {
-            parser.parse(stream, handler, metadata, parseContext);
-
-            assertTrue("Check for the image's text.", handler.toString().contains("Happy New Year 2003!"));
-            assertTrue("Check for the standard text.", handler.toString().contains("This is some text"));
+            parser.parse(stream, new DefaultHandler(), new Metadata(), parseContext);
         } finally {
             stream.close();
         }
+        List<Metadata> metadataList = ((RecursiveParserWrapper) parser).getMetadata();
+        assertEquals(numMetadatas, metadataList.size());
+
+        StringBuilder contents = new StringBuilder();
+        for (Metadata m : metadataList) {
+            contents.append(m.get(RecursiveParserWrapper.TIKA_CONTENT));
+        }
+        if (canRun()) {
+            assertTrue(contents.toString().contains("Happy New Year 2003!"));
+        }
+        for (String needle : nonOCRContains) {
+            assertContains(needle, contents.toString());
+        }
+        assertTrue(metadataList.get(0).names().length > 10);
+        assertTrue(metadataList.get(1).names().length > 10);
+        //test at least one value
+        assertEquals("deflate", metadataList.get(1).get("Compression CompressionTypeName"));
     }
-    
+
     @Test
-    public void getNormalMetadataToo() throws Exception {
+    public void testSingleImage() throws Exception {
         TesseractOCRConfig config = new TesseractOCRConfig();
         assumeTrue(canRun(config));
+        String xml = getXML("testOCR.jpg").xml;
+        assertContains("OCR Testing", xml);
+    }
 
-        Parser parser = new AutoDetectParser();
-        BodyContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        ParseContext parseContext = new ParseContext();
-        parseContext.set(TesseractOCRConfig.class, config);
-        parseContext.set(Parser.class, new TesseractOCRParser());
-
-        InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(
-                "/test-documents/testOCR.jpg");
-
-        try {
-            parser.parse(stream, handler, metadata, parseContext);
-            
-            // OCR text
-            assertContains("Apache", handler.toString());
-            assertContains("OCR Testing", handler.toString());
-
-            // Core JPEG properties from JPEGParser should still come through
-            assertEquals("136", metadata.get(Metadata.IMAGE_WIDTH));
-            assertEquals("66", metadata.get(Metadata.IMAGE_LENGTH));
-            assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
-            assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
-            assertContains("This is a test Apache Tika imag", metadata.get(Metadata.COMMENTS));
-        } finally {
-            stream.close();
-        }
+    @Test
+    public void getNormalMetadataToo() throws Exception {
+        //this should be successful whether or not TesseractOCR is installed/active
+        //If tesseract is installed, the internal metadata extraction parser should
+        //work; and if tesseract isn't installed, the regular parsers should take over.
+
+        //gif
+        Metadata m = getXML("testGIF.gif").metadata;
+        assertTrue(m.names().length > 20);
+        assertEquals("RGB", m.get("Chroma ColorSpaceType"));
+
+        //jpg
+        m = getXML("testOCR.jpg").metadata;
+        assertEquals("136", m.get(Metadata.IMAGE_WIDTH));
+        assertEquals("66", m.get(Metadata.IMAGE_LENGTH));
+        assertEquals("8", m.get(Metadata.BITS_PER_SAMPLE));
+        assertEquals(null, m.get(Metadata.SAMPLES_PER_PIXEL));
+        assertContains("This is a test Apache Tika imag", m.get(Metadata.COMMENTS));
+
+        //bmp
+        m = getXML("testBMP.bmp").metadata;
+        assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
+
+        //png
+        m = getXML("testPNG.png").metadata;
+        assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
+        assertEquals("UnsignedIntegral", m.get("Data SampleFormat"));
+
+        //tiff
+        m = getXML("testTIFF.tif").metadata;
+        assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
+        assertEquals("72 dots per inch", m.get("Y Resolution"));
     }
 }