You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/06/12 15:13:37 UTC
[tika] branch main updated: TIKA-4043 -- fix build issues related to timezone differences and variations of output from Tesseract (#1187)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 32b8cbcf9 TIKA-4043 -- fix build issues related to timezone differences and variations of output from Tesseract (#1187)
32b8cbcf9 is described below
commit 32b8cbcf9e900690e67c832b635f36fc91d2330f
Author: Tim Allison <ta...@apache.org>
AuthorDate: Mon Jun 12 11:13:29 2023 -0400
TIKA-4043 -- fix build issues related to timezone differences and variations of output from Tesseract (#1187)
---
.../test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java | 4 +++-
.../test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java | 4 ++++
2 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
index 1fd53fe34..b00ec300d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
@@ -392,7 +392,9 @@ public class RTFParserTest extends TikaTest {
assertEquals("1", xml.metadata.get(Office.PAGE_COUNT));
assertEquals("70", xml.metadata.get(Office.WORD_COUNT));
assertEquals("401", xml.metadata.get(Office.CHARACTER_COUNT));
- assertTrue(xml.metadata.get(TikaCoreProperties.CREATED).startsWith("2010-10-13T"));
+ //RTFParser's legacy behavior is to apply local timezone to dates/times.
+ //This needs to be flexible enough to pass in various time-zones TIKA-4043
+ assertTrue(xml.metadata.get(TikaCoreProperties.CREATED).startsWith("2010-10-"));
}
// TIKA-1192
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 776e3dddb..b2c1cb262 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -105,6 +105,8 @@ public class TesseractOCRParserTest extends TikaTest {
//tesseract should handle multipage tiffs by itself
//let's confirm that
String xml = getXML("testTIFF_multipage.tif", getMetadata(MediaType.image("tiff"))).xml;
+ //TIKA-4043 -- on some OS/versions of tesseract Page?2 is extracted
+ xml = xml.replaceAll("[^A-Za-z0-9]", " ");
assertContains("Page 2", xml);
}
@@ -117,6 +119,8 @@ public class TesseractOCRParserTest extends TikaTest {
context.set(TesseractOCRConfig.class, config);
String xml =
getXML("testTIFF_multipage.tif", getMetadata(MediaType.image("tiff")), context).xml;
+ //TIKA-4043 -- on some OS/versions of tesseract Page?2 is extracted
+ xml = xml.replaceAll("[^A-Za-z0-9]", " ");
assertNotContained("Page 2", xml);
}