You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by th...@apache.org on 2014/10/21 11:47:55 UTC
svn commit: r1633331 - in
/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser:
mail/RFC822ParserTest.java ocr/TesseractOCRParserTest.java
ocr/TesseractOCRTest.java
Author: thaichat04
Date: Tue Oct 21 09:47:54 2014
New Revision: 1633331
URL: http://svn.apache.org/r1633331
Log:
TIKA-1422 - Fixing build & minor refactoring of test class naming
Added:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java (with props)
Removed:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRTest.java
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java?rev=1633331&r1=1633330&r2=1633331&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java Tue Oct 21 09:47:54 2014
@@ -36,7 +36,6 @@ import org.apache.tika.metadata.Metadata
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.ocr.TesseractOCRParserTest;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -87,8 +86,7 @@ public class RFC822ParserTest {
verify(handler).startDocument();
int bodyExpectedTimes = 4, multipackExpectedTimes = 5;;
int invokingTimes = bodyExpectedTimes;
- TesseractOCRConfig config = new TesseractOCRConfig();
- if (TesseractOCRParserTest.canRun(config)) {
+ if (TesseractOCRParserTest.canRun()) {
invokingTimes = multipackExpectedTimes;
}
Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java?rev=1633331&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java Tue Oct 21 09:47:54 2014
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.parser.pdf.PDFParserConfig;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+
+import java.io.InputStream;
+
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assume.assumeTrue;
+
+public class TesseractOCRParserTest extends TikaTest {
+
+    /** Checks, via ExternalParser.check, the tesseract command built from a default config. */
+    public static boolean canRun() {
+        return canRun(new TesseractOCRConfig());
+    }
+
+    // Static: reads only its argument, so callers need no instance of this test class.
+    private static boolean canRun(TesseractOCRConfig config) {
+        String[] checkCmd = {config.getTesseractPath() + "tesseract"};
+        // If Tesseract is not on the path, do not run the test.
+        return ExternalParser.check(checkCmd);
+    }
+
+    /** Parses testOCR.pdf with a TesseractOCRParser in the context; expects "Happy New Year 2003!". */
+    @Test
+    public void testPDFOCR() throws Exception {
+        TesseractOCRConfig config = new TesseractOCRConfig();
+        assumeTrue(canRun(config));
+
+        // Inline-image extraction is switched on; presumably needed for the PDF's image to reach OCR.
+        PDFParserConfig pdfConfig = new PDFParserConfig();
+        pdfConfig.setExtractInlineImages(true);
+
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(TesseractOCRConfig.class, config);
+        parseContext.set(Parser.class, new TesseractOCRParser());
+        parseContext.set(PDFParserConfig.class, pdfConfig);
+
+        String resource = "/test-documents/testOCR.pdf";
+        InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(resource);
+        // getResourceAsStream returns null for a missing resource; fail clearly instead of with an NPE.
+        assertTrue("Test resource not found: " + resource, stream != null);
+        try {
+            BodyContentHandler handler = new BodyContentHandler();
+            new AutoDetectParser().parse(stream, handler, new Metadata(), parseContext);
+            assertTrue("Check for the image's text.", handler.toString().contains("Happy New Year 2003!"));
+        } finally {
+            stream.close();
+        }
+    }
+
+    /** Parses testOCR.docx with a TesseractOCRParser in the context and checks three expected strings. */
+    @Test
+    public void testDOCXOCR() throws Exception {
+        TesseractOCRConfig config = new TesseractOCRConfig();
+        assumeTrue(canRun(config));
+
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(TesseractOCRConfig.class, config);
+        parseContext.set(Parser.class, new TesseractOCRParser());
+
+        String resource = "/test-documents/testOCR.docx";
+        InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(resource);
+        // getResourceAsStream returns null for a missing resource; fail clearly instead of with an NPE.
+        assertTrue("Test resource not found: " + resource, stream != null);
+        try {
+            BodyContentHandler handler = new BodyContentHandler();
+            new AutoDetectParser().parse(stream, handler, new Metadata(), parseContext);
+            // Render the handler once; every assertion below inspects the same extracted text.
+            String text = handler.toString();
+            assertTrue("Check for the image's text.", text.contains("Happy New Year 2003!"));
+            assertTrue("Check for the standard text.", text.contains("This is some text."));
+            assertTrue("Check for the 'Here is an embedded image:' text.", text.contains("Here is an embedded image:"));
+        } finally {
+            stream.close();
+        }
+    }
+
+    /** Parses testOCR.pptx with a TesseractOCRParser in the context and checks two expected strings. */
+    @Test
+    public void testPPTXOCR() throws Exception {
+        TesseractOCRConfig config = new TesseractOCRConfig();
+        assumeTrue(canRun(config));
+
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(TesseractOCRConfig.class, config);
+        parseContext.set(Parser.class, new TesseractOCRParser());
+
+        String resource = "/test-documents/testOCR.pptx";
+        InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(resource);
+        // getResourceAsStream returns null for a missing resource; fail clearly instead of with an NPE.
+        assertTrue("Test resource not found: " + resource, stream != null);
+        try {
+            BodyContentHandler handler = new BodyContentHandler();
+            new AutoDetectParser().parse(stream, handler, new Metadata(), parseContext);
+
+            // Render the handler once; both assertions inspect the same extracted text.
+            String text = handler.toString();
+            assertTrue("Check for the image's text.", text.contains("Happy New Year 2003!"));
+            assertTrue("Check for the standard text.", text.contains("This is some text"));
+        } finally {
+            stream.close();
+        }
+    }
+}
Propchange: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
------------------------------------------------------------------------------
svn:eol-style = native