You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2016/10/05 11:15:35 UTC

tika git commit: Tesseract may sometimes see the 't' in "haystack" as a '!'

Repository: tika
Updated Branches:
  refs/heads/2.x 1ab6c81ce -> 1ec8c0947


Tesseract may sometimes misread the 't' in "haystack" as a '!', so the OCR assertion in PDFParserTest now accepts either "pdf_haystack" or "pdf_hays!ack".


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/1ec8c094
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/1ec8c094
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/1ec8c094

Branch: refs/heads/2.x
Commit: 1ec8c0947575729975601d543f9a5b08ca3c7269
Parents: 1ab6c81
Author: Nick Burch <ni...@gagravarr.org>
Authored: Wed Jun 22 09:33:41 2016 +0100
Committer: Nick Burch <ni...@gagravarr.org>
Committed: Wed Oct 5 12:08:25 2016 +0100

----------------------------------------------------------------------
 .../test/java/org/apache/tika/parser/pdf/PDFParserTest.java  | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/1ec8c094/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index ff74e50..e99e87b 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1188,7 +1188,13 @@ public class PDFParserTest extends TikaTest {
             assertContains("Haystack", xmlResult.xml);
             assertContains("Needle", xmlResult.xml);
             if (! strategy.equals(PDFParserConfig.OCR_STRATEGY.NO_OCR)) {
-                assertContains("<div class=\"ocr\">pdf_haystack", xmlResult.xml);
+                // Tesseract may see the t in haystack as a ! some times...
+                String div = "<div class=\"ocr\">pdf_hays";
+                if (xmlResult.xml.contains(div+"!ack")) {
+                   assertContains(div+"!ack", xmlResult.xml);
+                } else {
+                   assertContains(div+"tack", xmlResult.xml);
+                }
             } else {
                 assertNotContained("<div class=\"ocr\">pdf_haystack", xmlResult.xml);
             }