You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/02/19 22:18:24 UTC
[tika] 03/04: Avoid reporting of temporary ocr-based mime type in
xhtml output
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-3304
in repository https://gitbox.apache.org/repos/asf/tika.git
commit e5a6039f0285dc92a879f7a4c72b5bc4bdfc236b
Author: tballison <ta...@apache.org>
AuthorDate: Thu Feb 18 17:41:37 2021 -0500
Avoid reporting of temporary ocr-based mime type in xhtml output
---
.../java/org/apache/tika/parser/ocr/TesseractOCRParser.java | 12 ++++++++++++
.../org/apache/tika/parser/ocr/TesseractOCRParserTest.java | 3 ++-
2 files changed, 14 insertions(+), 1 deletion(-)
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 3705f99..9d59d2b 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -26,6 +26,7 @@ import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.mime.MediaType;
@@ -222,6 +223,17 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext parseContext)
throws IOException, SAXException, TikaException {
+ //Remove the temporary "ocr-" marker from the content type's subtype before
+ //parsing so that it doesn't show up in the xhtml output. AbstractImageParser
+ //correctly resets the content type in the metadata, but without this the
+ //temporary ocr-* value would still be written into the xhtml content.
+ String mediaType = metadata.get(Metadata.CONTENT_TYPE);
+ if (mediaType != null) {
+ MediaType mt = MediaType.parse(mediaType);
+ MediaType nonOcr = new MediaType(mt.getType(),
+ mt.getSubtype().replace("ocr-", ""));
+ metadata.set(Metadata.CONTENT_TYPE, nonOcr.toString());
+ }
TesseractOCRConfig userConfig = parseContext.get(TesseractOCRConfig.class);
TesseractOCRConfig config = defaultConfig;
if (userConfig != null) {
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index e719e7a..a398a9e 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -163,7 +163,6 @@ public class TesseractOCRParserTest extends TikaTest {
@Test
public void testSingleImage() throws Exception {
Assume.assumeTrue("can run OCR", canRun());
-
String xml = getXML("testOCR.jpg").xml;
assertContains("OCR Testing", xml);
//test metadata extraction
@@ -176,6 +175,8 @@ public class TesseractOCRParserTest extends TikaTest {
assertContainsCount("<body", xml, 1);
assertContainsCount("</body", xml, 1);
assertContainsCount("</html", xml, 1);
+
+ assertNotContained("<meta name=\"Content-Type\" content=\"image/ocr-jpeg\" />", xml);
}