You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/01/07 13:30:17 UTC
svn commit: r1650050 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
test/resources/test-documents/testOCR.jpg
Author: nick
Date: Wed Jan 7 12:30:17 2015
New Revision: 1650050
URL: http://svn.apache.org/r1650050
Log:
TIKA-1445 Unit test to check a JPEG via Tesseract gets both OCR text and normal JPEG metadata
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testOCR.jpg (with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java?rev=1650050&r1=1650049&r2=1650050&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java Wed Jan 7 12:30:17 2015
@@ -192,6 +192,7 @@ public class TesseractOCRParser extends
// TIKA-1445 workaround parser
private static Parser _TMP_IMAGE_METADATA_PARSER = new CompositeImageParser();
private static class CompositeImageParser extends CompositeParser {
+ private static final long serialVersionUID = -2398203346206381382L;
private static List<Parser> imageParsers = Arrays.asList(new Parser[]{
new ImageParser(), new JpegParser(), new TiffParser()
});
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java?rev=1650050&r1=1650049&r2=1650050&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java Wed Jan 7 12:30:17 2015
@@ -25,6 +25,7 @@ import java.io.InputStream;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.DefaultParser;
@@ -32,7 +33,6 @@ import org.apache.tika.parser.ParseConte
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.parser.image.ImageParser;
-import org.apache.tika.parser.jpeg.JpegParser;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
@@ -162,6 +162,39 @@ public class TesseractOCRParserTest exte
} finally {
stream.close();
}
+ }
+
+ @Test
+ public void getNormalMetadataToo() throws Exception {
+ TesseractOCRConfig config = new TesseractOCRConfig();
+ assumeTrue(canRun(config));
+
+ Parser parser = new AutoDetectParser();
+ BodyContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(TesseractOCRConfig.class, config);
+ parseContext.set(Parser.class, new TesseractOCRParser());
+
+ InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(
+ "/test-documents/testOCR.jpg");
+ try {
+ parser.parse(stream, handler, metadata, parseContext);
+
+ // OCR text
+ assertContains("Apache", handler.toString());
+ assertContains("OCR Testing", handler.toString());
+
+ // Core JPEG properties from JPEGParser should still come through
+ assertEquals("136", metadata.get(Metadata.IMAGE_WIDTH));
+ assertEquals("66", metadata.get(Metadata.IMAGE_LENGTH));
+ assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
+ assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
+ assertContains("This is a test Apache Tika imag", metadata.get(Metadata.COMMENTS));
+ } finally {
+ stream.close();
+ }
}
}
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testOCR.jpg
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testOCR.jpg?rev=1650050&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testOCR.jpg
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream