You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/11/28 15:42:00 UTC

tika git commit: TIKA-2169 fix xhtml in ocr

Repository: tika
Updated Branches:
  refs/heads/2.x 2f452304b -> a47a69933


TIKA-2169 fix xhtml in ocr


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/a47a6993
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/a47a6993
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/a47a6993

Branch: refs/heads/2.x
Commit: a47a6993375f4105b16c84872a48b327e213084b
Parents: 2f45230
Author: tballison <ta...@mitre.org>
Authored: Mon Nov 28 10:41:53 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Mon Nov 28 10:41:53 2016 -0500

----------------------------------------------------------------------
 .../src/test/java/org/apache/tika/TikaTest.java | 13 +++++
 .../tika/parser/ocr/TesseractOCRParser.java     | 50 ++++++++++++--------
 .../tika/parser/ocr/TesseractOCRParserTest.java | 10 ++++
 3 files changed, 52 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/a47a6993/tika-core/src/test/java/org/apache/tika/TikaTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 0f6303e..34e9a94 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -16,6 +16,7 @@
  */
 package org.apache.tika;
 
+import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
@@ -106,6 +107,18 @@ public abstract class TikaTest {
         return stream;
     }
 
+    public static void assertContainsCount(String needle, String haystack, int targetCount) {
+        int i = haystack.indexOf(needle);
+        int count = 0;
+        while (i > -1) {
+            count++;
+            i = haystack.indexOf(needle, i+1);
+        }
+        assertEquals("found "+count +" but should have found: "+targetCount,
+                targetCount, count);
+    }
+
+
     public static void assertContains(String needle, String haystack) {
         assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle));
     }

http://git-wip-us.apache.org/repos/asf/tika/blob/a47a6993/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index a63eae1..0ac2b6b 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -67,7 +67,6 @@ import org.apache.tika.parser.external.ExternalParser;
 import org.apache.tika.parser.image.ImageParser;
 import org.apache.tika.parser.image.TiffParser;
 import org.apache.tika.parser.jpeg.JpegParser;
-import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.OfflineContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.Attributes;
@@ -220,15 +219,22 @@ public class TesseractOCRParser extends AbstractParser {
         try {
             TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
 
-            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-            xhtml.startDocument();
-            File tmpImgFile = tmp.createTemporaryFile();
-            parse(tikaStream, tmpImgFile, parseContext, xhtml, config);
+            //trigger the spooling to a tmp file if the stream wasn't
+            //already a TikaInputStream that contained a backing file
+            tikaStream.getPath();
+            //this is the text output file name specified on the tesseract
+            //commandline.  The actual output file name will have a suffix added.
+            File tmpOCROutputFile = tmp.createTemporaryFile();
+
             // Temporary workaround for TIKA-1445 - until we can specify
             //  composite parsers with strategies (eg Composite, Try In Turn),
             //  always send the image onwards to the regular parser to have
             //  the metadata for them extracted as well
-            _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new EmbeddedContentHandler(xhtml), metadata, parseContext);
+            _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new DefaultHandler(), metadata, parseContext);
+
+            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+            xhtml.startDocument();
+            parse(tikaStream, tmpOCROutputFile, parseContext, xhtml, config);
             xhtml.endDocument();
         } finally {
             tmp.dispose();
@@ -264,7 +270,6 @@ public class TesseractOCRParser extends AbstractParser {
      * @throws SAXException
      * @throws TikaException
      *
-     * @deprecated use {@link #parseInline(InputStream, XHTMLContentHandler, ParseContext, TesseractOCRConfig)}
      */
     public void parseInline(InputStream stream, XHTMLContentHandler xhtml, ParseContext parseContext,
                             TesseractOCRConfig config)
@@ -335,7 +340,7 @@ public class TesseractOCRParser extends AbstractParser {
         tmp.close();
     }
 
-    private void parse(TikaInputStream tikaInputStream, File tmpImgFile, ParseContext parseContext,
+    private void parse(TikaInputStream tikaInputStream, File tmpOCROutputFile, ParseContext parseContext,
                        XHTMLContentHandler xhtml, TesseractOCRConfig config)
             throws IOException, SAXException, TikaException {
         File tmpTxtOutput = null;
@@ -345,21 +350,27 @@ public class TesseractOCRParser extends AbstractParser {
 
             if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) {
 
-                // copy the contents of the original input file into a temporary file
-                // which will be processed for OCR
-                TemporaryResources tmp = new TemporaryResources();
-                File tmpFile = tmp.createTemporaryFile();
-                FileUtils.copyFile(input, tmpFile);
-
                 // Process image if ImageMagick Tool is present
                 if(config.isEnableImageProcessing() == 1 && hasImageMagick(config)) {
-                    processImage(tmpFile,config);
+                    // copy the contents of the original input file into a temporary file
+                    // which will be preprocessed for OCR
+                    TemporaryResources tmp = new TemporaryResources();
+                    try {
+                        File tmpFile = tmp.createTemporaryFile();
+                        FileUtils.copyFile(input, tmpFile);
+                        processImage(tmpFile, config);
+                        doOCR(tmpFile, tmpOCROutputFile, config);
+                    } finally {
+                        if (tmp != null) {
+                            tmp.dispose();
+                        }
+                    }
+                } else {
+                    doOCR(input, tmpOCROutputFile, config);
                 }
 
-                doOCR(tmpFile, tmpImgFile, config);
-
                 // Tesseract appends the output type (.txt or .hocr) to output file name
-                tmpTxtOutput = new File(tmpImgFile.getAbsolutePath() + "." +
+                tmpTxtOutput = new File(tmpOCROutputFile.getAbsolutePath() + "." +
                         config.getOutputType().toString().toLowerCase(Locale.US));
 
                 if (tmpTxtOutput.exists()) {
@@ -371,10 +382,7 @@ public class TesseractOCRParser extends AbstractParser {
                         }
                     }
                 }
-
-                tmp.close();
             }
-
         } finally {
             if (tmpTxtOutput != null) {
                 tmpTxtOutput.delete();

http://git-wip-us.apache.org/repos/asf/tika/blob/a47a6993/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index c0befa1..82414ef 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -197,6 +197,16 @@ public class TesseractOCRParserTest extends TikaTest {
         assumeTrue(canRun());
         String xml = getXML("testOCR.jpg").xml;
         assertContains("OCR Testing", xml);
+        //test metadata extraction
+        assertContains("<meta name=\"Image Width\" content=\"136 pixels\" />", xml);
+
+        //TIKA-2169
+        assertContainsCount("<html", xml, 1);
+        assertContainsCount("<title", xml, 1);
+        assertContainsCount("</title", xml, 1);
+        assertContainsCount("<body", xml, 1);
+        assertContainsCount("</body", xml, 1);
+        assertContainsCount("</html", xml, 1);
     }
 
     @Test