You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/11/28 15:42:00 UTC
tika git commit: TIKA-2169 fix xhtml in ocr
Repository: tika
Updated Branches:
refs/heads/2.x 2f452304b -> a47a69933
TIKA-2169 fix xhtml in ocr
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/a47a6993
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/a47a6993
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/a47a6993
Branch: refs/heads/2.x
Commit: a47a6993375f4105b16c84872a48b327e213084b
Parents: 2f45230
Author: tballison <ta...@mitre.org>
Authored: Mon Nov 28 10:41:53 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Mon Nov 28 10:41:53 2016 -0500
----------------------------------------------------------------------
.../src/test/java/org/apache/tika/TikaTest.java | 13 +++++
.../tika/parser/ocr/TesseractOCRParser.java | 50 ++++++++++++--------
.../tika/parser/ocr/TesseractOCRParserTest.java | 10 ++++
3 files changed, 52 insertions(+), 21 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/a47a6993/tika-core/src/test/java/org/apache/tika/TikaTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 0f6303e..34e9a94 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -16,6 +16,7 @@
*/
package org.apache.tika;
+import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
@@ -106,6 +107,18 @@ public abstract class TikaTest {
return stream;
}
+ /**
+  * Asserts that {@code needle} occurs exactly {@code targetCount} times in
+  * {@code haystack}. The search resumes one character after each hit, so
+  * overlapping occurrences are each counted.
+  *
+  * @param needle substring to look for; must not be empty
+  * @param haystack text to search
+  * @param targetCount expected number of occurrences
+  * @throws IllegalArgumentException if {@code needle} is empty
+  */
+ public static void assertContainsCount(String needle, String haystack, int targetCount) {
+     // An empty needle matches at every index and indexOf("", fromIndex)
+     // never returns -1 (it saturates at haystack.length()), so the
+     // counting loop below would never terminate.
+     if (needle.isEmpty()) {
+         throw new IllegalArgumentException("needle must not be empty");
+     }
+     int count = 0;
+     for (int i = haystack.indexOf(needle); i > -1; i = haystack.indexOf(needle, i + 1)) {
+         count++;
+     }
+     assertEquals("found " + count + " but should have found: " + targetCount,
+             targetCount, count);
+ }
+
+
/**
 * Asserts that {@code haystack} contains {@code needle} as a substring.
 * On failure the assertion message includes the needle followed by the
 * full haystack.
 *
 * @param needle substring expected to be present
 * @param haystack text to search
 */
public static void assertContains(String needle, String haystack) {
assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle));
}
http://git-wip-us.apache.org/repos/asf/tika/blob/a47a6993/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index a63eae1..0ac2b6b 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -67,7 +67,6 @@ import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.parser.image.ImageParser;
import org.apache.tika.parser.image.TiffParser;
import org.apache.tika.parser.jpeg.JpegParser;
-import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.Attributes;
@@ -220,15 +219,22 @@ public class TesseractOCRParser extends AbstractParser {
try {
TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
- File tmpImgFile = tmp.createTemporaryFile();
- parse(tikaStream, tmpImgFile, parseContext, xhtml, config);
+ //trigger the spooling to a tmp file if the stream wasn't
+ //already a TikaInputStream that contained a backing file
+ tikaStream.getPath();
+ //this is the text output file name specified on the tesseract
+ //commandline. The actual output file name will have a suffix added.
+ File tmpOCROutputFile = tmp.createTemporaryFile();
+
// Temporary workaround for TIKA-1445 - until we can specify
// composite parsers with strategies (eg Composite, Try In Turn),
// always send the image onwards to the regular parser to have
// the metadata for them extracted as well
- _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new EmbeddedContentHandler(xhtml), metadata, parseContext);
+ _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new DefaultHandler(), metadata, parseContext);
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ parse(tikaStream, tmpOCROutputFile, parseContext, xhtml, config);
xhtml.endDocument();
} finally {
tmp.dispose();
@@ -264,7 +270,6 @@ public class TesseractOCRParser extends AbstractParser {
* @throws SAXException
* @throws TikaException
*
- * @deprecated use {@link #parseInline(InputStream, XHTMLContentHandler, ParseContext, TesseractOCRConfig)}
*/
public void parseInline(InputStream stream, XHTMLContentHandler xhtml, ParseContext parseContext,
TesseractOCRConfig config)
@@ -335,7 +340,7 @@ public class TesseractOCRParser extends AbstractParser {
tmp.close();
}
- private void parse(TikaInputStream tikaInputStream, File tmpImgFile, ParseContext parseContext,
+ private void parse(TikaInputStream tikaInputStream, File tmpOCROutputFile, ParseContext parseContext,
XHTMLContentHandler xhtml, TesseractOCRConfig config)
throws IOException, SAXException, TikaException {
File tmpTxtOutput = null;
@@ -345,21 +350,27 @@ public class TesseractOCRParser extends AbstractParser {
if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) {
- // copy the contents of the original input file into a temporary file
- // which will be processed for OCR
- TemporaryResources tmp = new TemporaryResources();
- File tmpFile = tmp.createTemporaryFile();
- FileUtils.copyFile(input, tmpFile);
-
// Process image if ImageMagick Tool is present
if(config.isEnableImageProcessing() == 1 && hasImageMagick(config)) {
- processImage(tmpFile,config);
+ // copy the contents of the original input file into a temporary file
+ // which will be preprocessed for OCR
+ TemporaryResources tmp = new TemporaryResources();
+ try {
+ File tmpFile = tmp.createTemporaryFile();
+ FileUtils.copyFile(input, tmpFile);
+ processImage(tmpFile, config);
+ doOCR(tmpFile, tmpOCROutputFile, config);
+ } finally {
+ if (tmp != null) {
+ tmp.dispose();
+ }
+ }
+ } else {
+ doOCR(input, tmpOCROutputFile, config);
}
- doOCR(tmpFile, tmpImgFile, config);
-
// Tesseract appends the output type (.txt or .hocr) to output file name
- tmpTxtOutput = new File(tmpImgFile.getAbsolutePath() + "." +
+ tmpTxtOutput = new File(tmpOCROutputFile.getAbsolutePath() + "." +
config.getOutputType().toString().toLowerCase(Locale.US));
if (tmpTxtOutput.exists()) {
@@ -371,10 +382,7 @@ public class TesseractOCRParser extends AbstractParser {
}
}
}
-
- tmp.close();
}
-
} finally {
if (tmpTxtOutput != null) {
tmpTxtOutput.delete();
http://git-wip-us.apache.org/repos/asf/tika/blob/a47a6993/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index c0befa1..82414ef 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -197,6 +197,16 @@ public class TesseractOCRParserTest extends TikaTest {
assumeTrue(canRun());
String xml = getXML("testOCR.jpg").xml;
assertContains("OCR Testing", xml);
+ //test metadata extraction
+ assertContains("<meta name=\"Image Width\" content=\"136 pixels\" />", xml);
+
+ //TIKA-2169
+ assertContainsCount("<html", xml, 1);
+ assertContainsCount("<title", xml, 1);
+ assertContainsCount("</title", xml, 1);
+ assertContainsCount("<body", xml, 1);
+ assertContainsCount("</body", xml, 1);
+ assertContainsCount("</html", xml, 1);
}
@Test