You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/01/07 13:30:17 UTC

svn commit: r1650050 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java test/resources/test-documents/testOCR.jpg

Author: nick
Date: Wed Jan  7 12:30:17 2015
New Revision: 1650050

URL: http://svn.apache.org/r1650050
Log:
TIKA-1445 Unit test to check a JPEG via Tesseract gets both OCR text and normal JPEG metadata

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testOCR.jpg   (with props)
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java?rev=1650050&r1=1650049&r2=1650050&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java Wed Jan  7 12:30:17 2015
@@ -192,6 +192,7 @@ public class TesseractOCRParser extends
   // TIKA-1445 workaround parser
   private static Parser _TMP_IMAGE_METADATA_PARSER = new CompositeImageParser();
   private static class CompositeImageParser extends CompositeParser {
+      private static final long serialVersionUID = -2398203346206381382L;
       private static List<Parser> imageParsers = Arrays.asList(new Parser[]{
           new ImageParser(), new JpegParser(), new TiffParser()
       });

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java?rev=1650050&r1=1650049&r2=1650050&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java Wed Jan  7 12:30:17 2015
@@ -25,6 +25,7 @@ import java.io.InputStream;
 
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.DefaultParser;
@@ -32,7 +33,6 @@ import org.apache.tika.parser.ParseConte
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.external.ExternalParser;
 import org.apache.tika.parser.image.ImageParser;
-import org.apache.tika.parser.jpeg.JpegParser;
 import org.apache.tika.parser.pdf.PDFParserConfig;
 import org.apache.tika.sax.BodyContentHandler;
 import org.junit.Test;
@@ -162,6 +162,39 @@ public class TesseractOCRParserTest exte
         } finally {
             stream.close();
         }
+    }
+    
+    @Test
+    public void getNormalMetadataToo() throws Exception { // TIKA-1445: a JPEG run through OCR should yield both OCR text and its normal image metadata
+        TesseractOCRConfig config = new TesseractOCRConfig();
+        assumeTrue(canRun(config)); // skip (not fail) when canRun(config) is false -- presumably Tesseract is unavailable; confirm against canRun
+
+        Parser parser = new AutoDetectParser();
+        BodyContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(TesseractOCRConfig.class, config);
+        parseContext.set(Parser.class, new TesseractOCRParser());
+
+        InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(
+                "/test-documents/testOCR.jpg");
 
+        try {
+            parser.parse(stream, handler, metadata, parseContext);
+            
+            // Text recognised by OCR should be present in the handler's output
+            assertContains("Apache", handler.toString());
+            assertContains("OCR Testing", handler.toString());
+
+            // Core JPEG properties from JpegParser should still come through alongside the OCR text
+            assertEquals("136", metadata.get(Metadata.IMAGE_WIDTH));
+            assertEquals("66", metadata.get(Metadata.IMAGE_LENGTH));
+            assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
+            assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL)); // this property is expected to be absent for the test image
+            assertContains("This is a test Apache Tika imag", metadata.get(Metadata.COMMENTS)); // NOTE(review): expected text stops at "imag" -- confirm the truncation is intentional
+        } finally {
+            stream.close();
+        }
     }
 }

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testOCR.jpg
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testOCR.jpg?rev=1650050&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testOCR.jpg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream