You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2016/10/05 11:15:35 UTC
tika git commit: Tesseract may sometimes see the 't' in "haystack" as a '!'...
Repository: tika
Updated Branches:
refs/heads/2.x 1ab6c81ce -> 1ec8c0947
Tesseract may sometimes see the 't' in "haystack" as a '!'...
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/1ec8c094
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/1ec8c094
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/1ec8c094
Branch: refs/heads/2.x
Commit: 1ec8c0947575729975601d543f9a5b08ca3c7269
Parents: 1ab6c81
Author: Nick Burch <ni...@gagravarr.org>
Authored: Wed Jun 22 09:33:41 2016 +0100
Committer: Nick Burch <ni...@gagravarr.org>
Committed: Wed Oct 5 12:08:25 2016 +0100
----------------------------------------------------------------------
.../test/java/org/apache/tika/parser/pdf/PDFParserTest.java | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/1ec8c094/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index ff74e50..e99e87b 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1188,7 +1188,13 @@ public class PDFParserTest extends TikaTest {
assertContains("Haystack", xmlResult.xml);
assertContains("Needle", xmlResult.xml);
if (! strategy.equals(PDFParserConfig.OCR_STRATEGY.NO_OCR)) {
- assertContains("<div class=\"ocr\">pdf_haystack", xmlResult.xml);
+ // Tesseract may see the t in haystack as a ! some times...
+ String div = "<div class=\"ocr\">pdf_hays";
+ if (xmlResult.xml.contains(div+"!ack")) {
+ assertContains(div+"!ack", xmlResult.xml);
+ } else {
+ assertContains(div+"tack", xmlResult.xml);
+ }
} else {
assertNotContained("<div class=\"ocr\">pdf_haystack", xmlResult.xml);
}